Support relaxed parsing of missing or garbage-prepended endobj/endstream tokens

This commit is contained in:
Arnaud TAMAILLON
2024-09-02 10:13:24 +02:00
committed by BobLd
parent f4d1456489
commit fc3cd81c96
3 changed files with 246 additions and 6 deletions

View File

@@ -528,12 +528,200 @@ endobj";
Assert.NotNull(dictionaryToken);
}
private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null)
[Fact]
public void ReadsDictionaryWithoutEndObjBeforeNextObject()
{
    // Object 1 is never closed by endobj; "2 0 obj" follows its dictionary directly.
    // The test expects both objects to be returned anyway.
    const string input = @"1 0 obj
<</Type /XRef>>
2 0 obj
<</Length 15>>
endobj";

    var scanner = GetScanner(input);

    var tokens = ReadToEnd(scanner);

    Assert.Equal(2, tokens.Count);

    var firstDictionary = Assert.IsType<DictionaryToken>(tokens[0].Data);
    Assert.IsType<NameToken>(firstDictionary.Data["Type"]);

    // Use Assert.IsType (as the other tests in this class do) instead of an 'as' cast + NotNull,
    // so a wrong token type fails with a message naming the actual type.
    var secondDictionary = Assert.IsType<DictionaryToken>(tokens[1].Data);
    Assert.IsType<NumericToken>(secondDictionary.Data["Length"]);
}
[Fact]
public void ReadsStreamWithoutEndObjBeforeNextObject()
{
    // Object 1 is a stream terminated by endstream but not by endobj; object 2 follows directly.
    const string input = @"1 0 obj
<</Length 4>>
stream
aaaa
endstream
2 0 obj
<</Length 15>>
endobj";

    var parsedObjects = ReadToEnd(GetScanner(input));

    Assert.Equal(2, parsedObjects.Count);
    Assert.IsType<StreamToken>(parsedObjects[0].Data);

    var trailingDictionary = Assert.IsType<DictionaryToken>(parsedObjects[1].Data);
    Assert.IsType<NumericToken>(trailingDictionary.Data["Length"]);
}
[Theory]
[InlineData("startxref")]
[InlineData("xref")]
public void ReadsStreamWithoutEndObjBeforeToken(string token)
{
    // The stream object is followed immediately by the given keyword instead of endobj.
    string input = @$"1 0 obj
<</Length 4>>
stream
aaaa
endstream
{token}";

    var parsedObjects = ReadToEnd(GetScanner(input));

    // Exactly one object is expected, and it must be the stream.
    var onlyObject = Assert.Single(parsedObjects);
    Assert.IsType<StreamToken>(onlyObject.Data);
}
[Theory]
[InlineData("startxref")]
[InlineData("xref")]
public void ReadsDictionaryWithoutEndObjBeforeToken(string token)
{
    // The dictionary object is followed immediately by the given keyword instead of endobj.
    string input = @$"1 0 obj
<</Type /XRef>>
{token}";

    var parsedObjects = ReadToEnd(GetScanner(input));

    // Exactly one object is expected: the dictionary holding the /Type name.
    var onlyObject = Assert.Single(parsedObjects);
    var dictionary = Assert.IsType<DictionaryToken>(onlyObject.Data);
    Assert.IsType<NameToken>(dictionary.Data["Type"]);
}
[Fact]
public void ReadsStreamWithoutEndStreamBeforeEndObj()
{
    // Object 1's stream data is followed by endobj with no endstream in between.
    const string input = @"1 0 obj
<</Length 4>>
stream
aaaa
endobj
2 0 obj
<</Length 15>>
endobj";

    var parsedObjects = ReadToEnd(GetScanner(input));

    Assert.Equal(2, parsedObjects.Count);
    Assert.IsType<StreamToken>(parsedObjects[0].Data);

    var trailingDictionary = Assert.IsType<DictionaryToken>(parsedObjects[1].Data);
    Assert.IsType<NumericToken>(trailingDictionary.Data["Length"]);
}
[Theory]
[InlineData(">>")]
[InlineData("randomstring")]
public void ReadsIndirectObjectsDictionaryWithContentBeforeEndObj(string addedContent)
{
    // Extra characters are glued directly in front of the first endobj keyword.
    string input = @$"1 0 obj
<</Type /XRef>>
{addedContent}endobj
2 0 obj
<</Length 15>>
endobj";

    // With lenient parsing off (the helper's default) the test expects no objects at all.
    var strictResult = ReadToEnd(GetScanner(input));
    Assert.Empty(strictResult);

    // With lenient parsing on, both objects are expected.
    var lenientResult = ReadToEnd(GetScanner(input, useLenientParsing: true));
    Assert.Equal(2, lenientResult.Count);

    var firstDictionary = Assert.IsType<DictionaryToken>(lenientResult[0].Data);
    Assert.IsType<NameToken>(firstDictionary.Data["Type"]);

    var secondDictionary = Assert.IsType<DictionaryToken>(lenientResult[1].Data);
    Assert.IsType<NumericToken>(secondDictionary.Data["Length"]);
}
[Theory]
[InlineData(">>")]
[InlineData("randomstring")]
public void ReadsIndirectObjectsStreamWithAddedContentBeforeStream(string addedContent)
{
    // Extra characters are glued directly in front of the stream keyword.
    // NOTE(review): the dictionary key below is lowercase "length", unlike the "Length" key used by
    // the neighbouring tests - confirm whether that is intentional.
    string input = @$"1 0 obj
<</length 4>>
{addedContent}stream
aaaa
endstream
endobj
2 0 obj
<</Length 15>>
endobj";

    // Strict parsing: two objects come back, but the first one is the endstream operator.
    var strictResult = ReadToEnd(GetScanner(input));
    Assert.Equal(2, strictResult.Count);

    // The first object being "endstream" follows from the parser keeping the last token read
    // inside the obj; keeping the first token instead would arguably be a valid alternative.
    var strictFirst = Assert.IsType<OperatorToken>(strictResult[0].Data);
    Assert.Equal("endstream", strictFirst.Data);

    var strictSecond = Assert.IsType<DictionaryToken>(strictResult[1].Data);
    Assert.IsType<NumericToken>(strictSecond.Data["Length"]);

    // Lenient parsing: the first object is expected to be a stream.
    var lenientResult = ReadToEnd(GetScanner(input, useLenientParsing: true));
    Assert.Equal(2, lenientResult.Count);
    Assert.IsType<StreamToken>(lenientResult[0].Data);

    var lenientSecond = Assert.IsType<DictionaryToken>(lenientResult[1].Data);
    Assert.IsType<NumericToken>(lenientSecond.Data["Length"]);
}
// Test helper: converts the string to input bytes and wraps them in a PdfTokenScanner that uses the
// supplied location provider (or a new TestObjectLocationProvider when none is given), a
// TestFilterProvider and the no-op encryption handler.
// When useLenientParsing is true a default-constructed ParsingOptions is passed (presumably lenient
// by default - confirm); otherwise ParsingOptions.LenientParsingOff is used.
// NOTE(review): the two consecutive "new TestFilterProvider(), ..." lines below appear to be the
// removed and the added variant of the same argument line as rendered by the diff view - confirm
// that only the second one exists in the actual file.
private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null, bool useLenientParsing = false)
{
var input = StringBytesTestConverter.Convert(s, false);
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
new TestFilterProvider(), NoOpEncryptionHandler.Instance, ParsingOptions.LenientParsingOff);
new TestFilterProvider(), NoOpEncryptionHandler.Instance, useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff);
}
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)

View File

@@ -117,6 +117,11 @@
/// </summary>
public static readonly OperatorToken Xref = new OperatorToken("xref");
/// <summary>
/// Cross reference section offset.
/// </summary>
public static readonly OperatorToken StartXref = new OperatorToken("startxref");
/// <inheritdoc />
public string Data { get; }
@@ -163,6 +168,7 @@
"Tf" => Tf,
"W*" => WStar,
"xref" => Xref,
"startxref" => StartXref,
_ => new OperatorToken(data.ToString())
};
}

View File

@@ -135,7 +135,7 @@
var readStream = false;
// Read all tokens between obj and endobj.
while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
while (coreTokenScanner.MoveNext() && !IsToken(coreTokenScanner, OperatorToken.EndObject, out _))
{
if (coreTokenScanner.CurrentToken is CommentToken)
{
@@ -171,7 +171,35 @@
return false;
}
if (ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.StartStream))
if (IsToken(coreTokenScanner, OperatorToken.Xref, out _) || IsToken(coreTokenScanner, OperatorToken.StartXref, out _))
{
if (readStream && readTokens[0] is StreamToken streamRead)
{
readTokens.Clear();
readTokens.Add(streamRead);
coreTokenScanner.Seek(previousTokenPositions[2]);
break;
}
if (readTokens.Count == 1)
{
// An xref or startxref keyword was encountered after reading the object's single token, before any endobj.
var actualReference = new IndirectReference(objectNumber.Int, generation.Int);
var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]);
CurrentToken = new ObjectToken(startPosition, actualReference, actualToken);
readTokens.Clear();
coreTokenScanner.Seek(previousTokenPositions[2]);
return true;
}
// This should never happen.
Debug.Assert(false, "Encountered a '{' operator before the end of the previous object.");
return false;
}
if (IsToken(coreTokenScanner, OperatorToken.StartStream, out var actualStartStreamPosition))
{
var streamIdentifier = new IndirectReference(objectNumber.Long, generation.Int);
@@ -185,7 +213,7 @@
callingObject = streamIdentifier;
// Read stream: special case.
if (TryReadStream(coreTokenScanner.CurrentTokenStart, getLengthFromFile, out var stream))
if (TryReadStream(actualStartStreamPosition.Value, getLengthFromFile, out var stream))
{
readTokens.Clear();
readTokens.Add(stream);
@@ -212,7 +240,7 @@
previousTokenPositions[2] = coreTokenScanner.CurrentTokenStart;
}
if (!readStream && !ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
if (!readStream && !IsToken(coreTokenScanner, OperatorToken.EndObject, out _))
{
readTokens.Clear();
return false;
@@ -263,6 +291,24 @@
return true;
}
/// <summary>
/// Checks whether the scanner's current token is the given operator.
/// With lenient parsing enabled, an operator token whose text merely ends with the expected
/// operator (for example unexpected characters glued in front of it) is also accepted.
/// </summary>
/// <param name="scanner">The scanner whose current token is inspected.</param>
/// <param name="token">The operator to look for.</param>
/// <param name="actualTokenStart">
/// On success, the position at which the expected operator text starts; this is the current token
/// start shifted past any leading extra characters. <see langword="null"/> when there is no match.
/// </param>
/// <returns><see langword="true"/> if the current token is, or in lenient mode ends with, the operator.</returns>
private bool IsToken(CoreTokenScanner scanner, OperatorToken token, [NotNullWhen(true)] out long? actualTokenStart)
{
    // Exact match: the scanner produced the expected operator instance itself.
    if (ReferenceEquals(scanner.CurrentToken, token))
    {
        actualTokenStart = scanner.CurrentTokenStart;
        return true;
    }

    // Lenient match: compare ordinally. The culture-sensitive default of EndsWith can treat
    // ignorable characters as absent and report a suffix match at the wrong character offset,
    // which would corrupt the start position computed below.
    if (parsingOptions.UseLenientParsing
        && scanner.CurrentToken is OperatorToken opToken
        && opToken.Data.EndsWith(token.Data, System.StringComparison.Ordinal))
    {
        // NOTE(review): assumes one character of operator text corresponds to one position unit
        // in the input - confirm for non-ASCII operator content.
        actualTokenStart = scanner.CurrentTokenStart + opToken.Data.Length - token.Data.Length;
        return true;
    }

    actualTokenStart = null;
    return false;
}
private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNullWhen(true)] out StreamToken? stream)
{
stream = null;