Fix for issue #670 with double endstream

When a stream contains two endstream declarations with
no data between them, the first declared endstream
should be obeyed.
This commit is contained in:
EliotJones
2025-07-06 17:09:59 -05:00
parent daaac9350d
commit 7134032188
3 changed files with 103 additions and 66 deletions

View File

@@ -406,6 +406,52 @@ endobj";
Assert.Equal(7, token.Number.ObjectNumber);
}
[Fact]
public void ReadsStreamWithDoubleEndstreamSimple()
{
    // Minimal reproduction of a stream whose data is followed by two
    // 'endstream' keywords separated only by a newline (issue #670).
    const string s =
        """
        250 0 obj
        << /Filter /FlateDecode >>
        stream
        012endstream
        endstream
        endobj
        """;

    var scanner = GetScanner(s);

    var tokens = ReadToEnd(scanner);

    // Without assertions this test would pass even if the scanner silently
    // returned nothing; pin that exactly one object, number 250, was read.
    var token = Assert.Single(tokens);
    Assert.Equal(250, token.Number.ObjectNumber);
}
// Scans an object whose stream data is immediately followed by two
// 'endstream' keywords (the double endstream case from issue #670).
// The dictionary gives /Length as an indirect reference (1975 0 R), not a literal number.
// NOTE(review): this test contains no assertions, so it only fails if
// GetScanner or ReadToEnd throws; the returned tokens are never inspected.
[Fact]
public void ReadsStreamWithDoubleEndstream()
{
// The stream payload below must be kept byte-for-byte; do not reformat the literal.
const string s =
"""
1974 0 obj
<<
/Filter /FlateDecode
/Length 1975 0 R
>>
stream
]ÔÏ@ñ'ð;øØ"Œg !Ué…Cÿ¨´ ö:B*Æ2äÀÛw¿™MZõ'þ°½ë]<ï>ïÆÓ­^|Ÿ/Ý>Ýêá4ösº^^ç.ÕÇôr«e[÷§îVÎü؝SµÈ7ïï×[:ïÆáRm6ÕâGþðzïõ‡Oýå˜>VosŸæÓøRøõ¼ÏçûםÎi¼ÕMµÝÖ}òƒ¾¦¯‡sª~ÛîϟŸn÷‡|Ïß+~Þ§T·~¾ŒÉt—>]§C—æÃøªM»ÜÖ­ÒØÿ÷ÙJããðïµ~†&msh ­YK4BK0yÈ¿rXVzš°Žà}$<zЁðDxò`þÐáAGÂ1:BÏða{B{$$Bа&
!ÂSÒä¿ýCC£ePHx´x-Ã
R<˜º@!á!>,âW@!á!¼œ@!áÑ2uBÂC=@!á¡þP(¤xðU
R< (¤°PHx(SW(4<S(4<´#@¡á¡ÌT¡Ð²><@¡á¡Œ¢PhxSW(4<õ¡Phxè Ç£PhY|Q
GëÃB¡e}à¡Phx˜¿
B¡áÑú°Phx´ÆÔ
+,ƒÂÂ#/× °²>3(¬.¡nPXx˜_ùC¡°²>x}ƒÂÂCx9ƒÂНoPXxˆš&ù!ÙگŠÿ jbky
yyÛJØlØßw±îužóæ¦ï\ìY§1½ï«Ó.ÿùz°gAendstream
endstream
endobj
""";
var scanner = GetScanner(s);
// Drain the scanner; the result is currently unused.
var tokens = ReadToEnd(scanner);
}
[Fact]
public void ReadsStringsWithMissingEndBracket()
{
@@ -661,7 +707,6 @@ endobj";
var tokens = ReadToEnd(strictScanner);
Assert.Empty(tokens);
var lenientScanner = GetScanner(input, useLenientParsing: true);
tokens = ReadToEnd(lenientScanner);

View File

@@ -377,17 +377,11 @@
long streamDataStart = inputBytes.CurrentOffset;
PossibleStreamEndLocation? possibleEndLocation = null;
// Negative indicates endobj.
bool isEndData = false;
Stack<EndLoc> endLocations = new Stack<EndLoc>();
while (inputBytes.MoveNext())
{
if (length.HasValue && read == length)
{
// TODO: read ahead and check we're at the end...
// break;
}
// We are reading 'end' (possibly).
if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
{
@@ -401,44 +395,48 @@
endObjPosition = 0;
endStreamPosition++;
// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
if (endStreamPosition == streamPart.Length)
{
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
possibleEndLocation = token;
if (length.HasValue && read > length)
// Token is at end of stream or is followed by whitespace
if (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))
{
break;
}
var location = inputBytes.CurrentOffset - EndstreamBytes.Length;
endLocations.Push(new EndLoc(true, location, !isEndData));
isEndData = true;
endStreamPosition = 0;
if (length.HasValue && read > length)
{
break;
}
endStreamPosition = 0;
commonPartPosition = 0;
}
}
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'
endStreamPosition = 0;
endObjPosition++;
// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
{
var hasPreviousEndToken = endLocations.Count > 0;
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (possibleEndLocation != null)
if (hasPreviousEndToken)
{
var lastEndToken = possibleEndLocation.Value;
var lastEndTokenLocation = endLocations.Peek();
inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
var correction = lastEndTokenLocation.IsEndStream ? EndstreamBytes.Length : "endobj".Length;
inputBytes.Seek(lastEndTokenLocation.Offset + correction + 1);
break;
}
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
possibleEndLocation = token;
endLocations.Push(new EndLoc(false, inputBytes.CurrentOffset, !isEndData));
isEndData = true;
if (read > length)
{
@@ -454,6 +452,7 @@
endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = 0;
isEndData = false;
}
}
else
@@ -463,6 +462,11 @@
endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
if (commonPartPosition == 0 && !ReadHelper.IsWhitespace(inputBytes.CurrentByte))
{
isEndData = false;
}
}
read++;
@@ -470,15 +474,22 @@
long streamDataEnd = inputBytes.CurrentOffset + 1;
if (possibleEndLocation == null)
if (endLocations.Count == 0)
{
return false;
}
var lastEnd = possibleEndLocation;
// Read until the first endstream or obj token indicator preceded by data.
EndLoc endLoc;
do
{
endLoc = endLocations.Pop();
} while (!endLoc.HasDataPreceding && endLocations.Count > 0);
var dataLength = lastEnd.Value.Offset - startDataOffset;
var dataLength = endLoc.Offset - startDataOffset;
// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Value.Offset - 3);
inputBytes.Seek(endLoc.Offset - 3);
inputBytes.MoveNext();
if (inputBytes.CurrentByte == '\r')
@@ -902,5 +913,21 @@
inputBytes?.Dispose();
isDisposed = true;
}
/// <summary>
/// An 'endstream' or 'endobj' keyword position recorded while scanning stream data.
/// </summary>
private record EndLoc
{
    /// <summary>
    /// Create a new <see cref="EndLoc"/>.
    /// </summary>
    /// <param name="isEndStream">Whether the keyword was 'endstream' rather than 'endobj'.</param>
    /// <param name="offset">The offset recorded for the keyword.</param>
    /// <param name="hasDataPreceding">Whether stream data came directly before the keyword.</param>
    public EndLoc(bool isEndStream, long offset, bool hasDataPreceding)
    {
        IsEndStream = isEndStream;
        Offset = offset;
        HasDataPreceding = hasDataPreceding;
    }

    /// <summary>
    /// <see langword="true"/> for an 'endstream' keyword, <see langword="false"/> for 'endobj'.
    /// </summary>
    public bool IsEndStream { get; }

    /// <summary>
    /// The offset in the input recorded for this keyword.
    /// </summary>
    public long Offset { get; }

    /// <summary>
    /// <see langword="false"/> when only whitespace separated this keyword from a previously
    /// recorded end keyword, otherwise <see langword="true"/>.
    /// </summary>
    public bool HasDataPreceding { get; }
}
}
}

View File

@@ -1,35 +0,0 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
    using System;
    using Tokens;

    /// <summary>
    /// Records one occurrence of 'endobj' or 'endstream' seen by the
    /// <see cref="PdfTokenScanner"/> while it is reading a stream.
    /// </summary>
    internal readonly struct PossibleStreamEndLocation
    {
        /// <summary>
        /// Create a new <see cref="PossibleStreamEndLocation"/>.
        /// </summary>
        /// <param name="offset">The offset at which the token started in the file.</param>
        /// <param name="type">Either <see cref="OperatorToken.EndObject"/> or <see cref="OperatorToken.EndStream"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="type"/> is null.</exception>
        public PossibleStreamEndLocation(long offset, OperatorToken type)
        {
            if (type is null)
            {
                throw new ArgumentNullException(nameof(type));
            }

            Offset = offset;
            Type = type;
        }

        /// <summary>
        /// Where in the file the token began.
        /// </summary>
        public long Offset { get; }

        /// <summary>
        /// Which end token was seen: <see cref="OperatorToken.EndObject"/> or <see cref="OperatorToken.EndStream"/>.
        /// </summary>
        public OperatorToken Type { get; }

        /// <summary>
        /// Formats the location as the offset followed by the token type.
        /// </summary>
        public override string ToString() => $"{Offset}: {Type}";
    }
}