mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-08 00:14:35 +08:00
Fix for issue #670: stream terminated by a double endstream
When a stream contains two endstream declarations with no data between them, the first declared endstream should be obeyed.
This commit is contained in:
@@ -406,6 +406,52 @@ endobj";
|
||||
Assert.Equal(7, token.Number.ObjectNumber);
|
||||
}
|
||||
|
||||
[Fact]
public void ReadsStreamWithDoubleEndstreamSimple()
{
    // A single object whose stream data ("012") runs straight into 'endstream',
    // followed by a second 'endstream' on the next line with no data in between.
    const string input =
        """
        250 0 obj
        << /Filter /FlateDecode >>
        stream
        012endstream
        endstream
        endobj
        """;

    // No assertions are made on the result: the test passes as long as
    // scanning the input to the end does not throw.
    _ = ReadToEnd(GetScanner(input));
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsStreamWithDoubleEndstream()
|
||||
{
|
||||
const string s =
|
||||
"""
|
||||
1974 0 obj
|
||||
<<
|
||||
/Filter /FlateDecode
|
||||
/Length 1975 0 R
|
||||
>>
|
||||
stream
|
||||
xœ]ÔÏnÚ@€ñ'ð;øØ"Œg !Ué…Cÿ¨´ ö:B*Æ2äÀÛw¿™MZõ'þ°½ë›]<ï>ïÆÓ^|Ÿ/Ý>Ýêá4ösº^^ç.ÕÇôr«e[÷§îVÎüØSµÈ7ïï×[:ïÆáRm6ÕâGþðz›ïõ‡Oýå˜>V‹osŸæÓøRøõ¼Ïçû×iúÎi¼ÕMµÝÖ}òƒ¾¦¯‡sª~ÛîϟŸn÷‡|Ïß+~Þ§T·~¾ŒÉt—>]§C—æÃø’ªM»ÜÖ›U³ÒØÿ÷ÙJã–ãðïµ~†&msh Y„ –K‚4BK0‚yÈ¿rXVzš°Žà}$<zÐðDxò`þÐáAGÂ1‚:BÏða{B{$$Bа&
|
||||
„!ÂSÒä¿ýCC€B£e…PHx´x-Ã
|
||||
R<˜º@!á!>,âW@!á!¼œ@!áÑ2uBÂC=@!á¡þP(¤xðU
|
||||
R< (¤xø°PHx(SW(4<”—S(4<´#@¡á¡ÌT¡Ð²><@¡á¡Œ¢PhxSW(4<”õ¡Phxè‘ …†Ç’£PhY|Q
|
||||
…†GëÃB¡e}à¡Phx˜¿
|
||||
†‡B¡áÑú°Phx´ÆÔ
|
||||
+,ƒÂÂ#/× °²>3(¬xð.……‡¡nPXx˜_……‡ùC¡°²>x}ƒÂÂCx9ƒÂНoPXxˆ…š&ùPø!ÙÚ¯€ÂŠÿ•……‡ ¶jbky
|
||||
y‡yÛJØlØßw±îužó曦ï\ìY§1½ï«Óeâ.ÿùz°gAendstream
|
||||
endstream
|
||||
endobj
|
||||
""";
|
||||
|
||||
var scanner = GetScanner(s);
|
||||
|
||||
var tokens = ReadToEnd(scanner);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsStringsWithMissingEndBracket()
|
||||
{
|
||||
@@ -661,7 +707,6 @@ endobj";
|
||||
var tokens = ReadToEnd(strictScanner);
|
||||
Assert.Empty(tokens);
|
||||
|
||||
|
||||
var lenientScanner = GetScanner(input, useLenientParsing: true);
|
||||
tokens = ReadToEnd(lenientScanner);
|
||||
|
||||
|
@@ -377,17 +377,11 @@
|
||||
|
||||
long streamDataStart = inputBytes.CurrentOffset;
|
||||
|
||||
PossibleStreamEndLocation? possibleEndLocation = null;
|
||||
|
||||
|
||||
// Negative indicates endobj.
|
||||
bool isEndData = false;
|
||||
Stack<EndLoc> endLocations = new Stack<EndLoc>();
|
||||
while (inputBytes.MoveNext())
|
||||
{
|
||||
if (length.HasValue && read == length)
|
||||
{
|
||||
// TODO: read ahead and check we're at the end...
|
||||
// break;
|
||||
}
|
||||
|
||||
// We are reading 'end' (possibly).
|
||||
if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
|
||||
{
|
||||
@@ -401,12 +395,14 @@
|
||||
endObjPosition = 0;
|
||||
endStreamPosition++;
|
||||
|
||||
// We've finished reading 'endstream', add it to the end tokens we've seen.
|
||||
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
|
||||
if (endStreamPosition == streamPart.Length)
|
||||
{
|
||||
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
|
||||
|
||||
possibleEndLocation = token;
|
||||
// Token is at end of stream or is followed by whitespace
|
||||
if (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))
|
||||
{
|
||||
var location = inputBytes.CurrentOffset - EndstreamBytes.Length;
|
||||
endLocations.Push(new EndLoc(true, location, !isEndData));
|
||||
isEndData = true;
|
||||
|
||||
if (length.HasValue && read > length)
|
||||
{
|
||||
@@ -414,31 +410,33 @@
|
||||
}
|
||||
|
||||
endStreamPosition = 0;
|
||||
commonPartPosition = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (inputBytes.CurrentByte == objPart[endObjPosition])
|
||||
{
|
||||
// We are reading 'obj' after 'end'
|
||||
|
||||
endStreamPosition = 0;
|
||||
endObjPosition++;
|
||||
|
||||
// We have finished reading 'endobj'.
|
||||
if (endObjPosition == objPart.Length)
|
||||
{
|
||||
var hasPreviousEndToken = endLocations.Count > 0;
|
||||
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
|
||||
if (possibleEndLocation != null)
|
||||
if (hasPreviousEndToken)
|
||||
{
|
||||
var lastEndToken = possibleEndLocation.Value;
|
||||
var lastEndTokenLocation = endLocations.Peek();
|
||||
|
||||
inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
|
||||
var correction = lastEndTokenLocation.IsEndStream ? EndstreamBytes.Length : "endobj".Length;
|
||||
inputBytes.Seek(lastEndTokenLocation.Offset + correction + 1);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
|
||||
|
||||
possibleEndLocation = token;
|
||||
endLocations.Push(new EndLoc(false, inputBytes.CurrentOffset, !isEndData));
|
||||
isEndData = true;
|
||||
|
||||
if (read > length)
|
||||
{
|
||||
@@ -454,6 +452,7 @@
|
||||
endStreamPosition = 0;
|
||||
endObjPosition = 0;
|
||||
commonPartPosition = 0;
|
||||
isEndData = false;
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -463,6 +462,11 @@
|
||||
endStreamPosition = 0;
|
||||
endObjPosition = 0;
|
||||
commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
|
||||
|
||||
if (commonPartPosition == 0 && !ReadHelper.IsWhitespace(inputBytes.CurrentByte))
|
||||
{
|
||||
isEndData = false;
|
||||
}
|
||||
}
|
||||
|
||||
read++;
|
||||
@@ -470,15 +474,22 @@
|
||||
|
||||
long streamDataEnd = inputBytes.CurrentOffset + 1;
|
||||
|
||||
if (possibleEndLocation == null)
|
||||
if (endLocations.Count == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
var lastEnd = possibleEndLocation;
|
||||
// Read until the first endstream or obj token indicator preceded by data.
|
||||
EndLoc endLoc;
|
||||
do
|
||||
{
|
||||
endLoc = endLocations.Pop();
|
||||
} while (!endLoc.HasDataPreceding && endLocations.Count > 0);
|
||||
|
||||
var dataLength = lastEnd.Value.Offset - startDataOffset;
|
||||
var dataLength = endLoc.Offset - startDataOffset;
|
||||
|
||||
// 3 characters, 'e', '\n' and possibly '\r'
|
||||
inputBytes.Seek(lastEnd.Value.Offset - 3);
|
||||
inputBytes.Seek(endLoc.Offset - 3);
|
||||
inputBytes.MoveNext();
|
||||
|
||||
if (inputBytes.CurrentByte == '\r')
|
||||
@@ -902,5 +913,21 @@
|
||||
inputBytes?.Dispose();
|
||||
isDisposed = true;
|
||||
}
|
||||
|
||||
/// <summary>
/// One 'endstream' or 'endobj' marker recorded by the scanner while it searches for the end of stream data.
/// </summary>
private record EndLoc
{
    /// <summary>
    /// <see langword="true"/> when the marker is 'endstream', <see langword="false"/> when it is 'endobj'.
    /// </summary>
    public bool IsEndStream { get; }

    /// <summary>
    /// The offset the scanner stored for this marker.
    /// </summary>
    public long Offset { get; }

    /// <summary>
    /// The scanner passes the negation of its 'isEndData' flag here; when unwinding the recorded
    /// markers it pops entries until it reaches one where this is <see langword="true"/>.
    /// </summary>
    public bool HasDataPreceding { get; }

    /// <summary>
    /// Create a new <see cref="EndLoc"/>.
    /// </summary>
    /// <param name="isEndStream">Whether the marker is 'endstream' rather than 'endobj'.</param>
    /// <param name="offset">The offset to store for the marker.</param>
    /// <param name="hasDataPreceding">Value for <see cref="HasDataPreceding"/>.</param>
    public EndLoc(bool isEndStream, long offset, bool hasDataPreceding)
    {
        this.IsEndStream = isEndStream;
        this.Offset = offset;
        this.HasDataPreceding = hasDataPreceding;
    }
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,35 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
{
    using System;
    using Tokens;

    /// <summary>
    /// Internal helper for the <see cref="PdfTokenScanner"/>: while a stream is being read, each
    /// 'endobj' or 'endstream' that is observed can be captured as one of these.
    /// </summary>
    internal readonly struct PossibleStreamEndLocation
    {
        /// <summary>
        /// File offset at which the observed token begins.
        /// </summary>
        public long Offset { get; }

        /// <summary>
        /// Which token was observed: either <see cref="OperatorToken.EndObject"/> or <see cref="OperatorToken.EndStream"/>.
        /// </summary>
        public OperatorToken Type { get; }

        /// <summary>
        /// Create a new <see cref="PossibleStreamEndLocation"/>.
        /// </summary>
        /// <param name="offset">The offset at which the token started.</param>
        /// <param name="type">The token type; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="type"/> is null.</exception>
        public PossibleStreamEndLocation(long offset, OperatorToken type)
        {
            if (type is null)
            {
                throw new ArgumentNullException(nameof(type));
            }

            Offset = offset;
            Type = type;
        }

        /// <summary>
        /// Formats the location as the offset, a colon and the token type.
        /// </summary>
        public override string ToString() => $"{Offset}: {Type}";
    }
}
|
Reference in New Issue
Block a user