mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
Support relaxed parsing of missing or garbage-prepended endobj/endstream tokens
This commit is contained in:
@@ -528,12 +528,200 @@ endobj";
|
||||
Assert.NotNull(dictionaryToken);
|
||||
}
|
||||
|
||||
private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null)
|
||||
// A dictionary object with no endobj is still returned when the next "N G obj" header follows it directly.
[Fact]
public void ReadsDictionaryWithoutEndObjBeforeNextObject()
{
    // Object 1 is missing its endobj; object 2 is well formed.
    const string input = @"1 0 obj
<</Type /XRef>>
2 0 obj
<</Length 15>>
endobj";

    var objects = ReadToEnd(GetScanner(input));

    Assert.Equal(2, objects.Count);

    // First object: the dictionary carrying the /Type name.
    var first = Assert.IsType<DictionaryToken>(objects[0].Data);
    Assert.IsType<NameToken>(first.Data["Type"]);

    // Second object: the dictionary carrying the numeric /Length.
    var second = objects[1].Data as DictionaryToken;
    Assert.NotNull(second);
    Assert.IsType<NumericToken>(second.Data["Length"]);
}
|
||||
|
||||
// A stream object with no endobj after endstream is still returned when the next object header follows.
[Fact]
public void ReadsStreamWithoutEndObjBeforeNextObject()
{
    // Object 1 ends at endstream (no endobj); object 2 is well formed.
    const string input = @"1 0 obj
<</Length 4>>
stream
aaaa
endstream
2 0 obj
<</Length 15>>
endobj";

    var objects = ReadToEnd(GetScanner(input));

    Assert.Equal(2, objects.Count);
    Assert.IsType<StreamToken>(objects[0].Data);

    var trailingDictionary = Assert.IsType<DictionaryToken>(objects[1].Data);
    Assert.IsType<NumericToken>(trailingDictionary.Data["Length"]);
}
|
||||
|
||||
// A stream object with no endobj is still returned when it is followed directly by xref or startxref.
[Theory]
[InlineData("startxref")]
[InlineData("xref")]
public void ReadsStreamWithoutEndObjBeforeToken(string token)
{
    // The keyword under test takes the place where endobj would normally be.
    string input = @$"1 0 obj
<</Length 4>>
stream
aaaa
endstream
{token}";

    var objects = ReadToEnd(GetScanner(input));

    Assert.Single(objects);
    Assert.IsType<StreamToken>(objects[0].Data);
}
|
||||
|
||||
// A dictionary object with no endobj is still returned when it is followed directly by xref or startxref.
[Theory]
[InlineData("startxref")]
[InlineData("xref")]
public void ReadsDictionaryWithoutEndObjBeforeToken(string token)
{
    // The keyword under test takes the place where endobj would normally be.
    string input = @$"1 0 obj
<</Type /XRef>>
{token}";

    var objects = ReadToEnd(GetScanner(input));

    Assert.Single(objects);

    var dictionary = Assert.IsType<DictionaryToken>(objects[0].Data);
    Assert.IsType<NameToken>(dictionary.Data["Type"]);
}
|
||||
|
||||
// A stream whose endstream keyword is missing (the data runs straight into endobj) is still read as a stream.
[Fact]
public void ReadsStreamWithoutEndStreamBeforeEndObj()
{
    // Object 1 has stream data but no endstream; object 2 is well formed.
    const string input = @"1 0 obj
<</Length 4>>
stream
aaaa
endobj
2 0 obj
<</Length 15>>
endobj";

    var objects = ReadToEnd(GetScanner(input));

    Assert.Equal(2, objects.Count);
    Assert.IsType<StreamToken>(objects[0].Data);

    var trailingDictionary = Assert.IsType<DictionaryToken>(objects[1].Data);
    Assert.IsType<NumericToken>(trailingDictionary.Data["Length"]);
}
|
||||
|
||||
// Garbage glued to the front of endobj: the strict scanner yields nothing, the lenient scanner recovers both objects.
[Theory]
[InlineData(">>")]
[InlineData("randomstring")]
public void ReadsIndirectObjectsDictionaryWithContentBeforeEndObj(string addedContent)
{
    // The added content is prepended to endobj with no separator.
    string input = @$"1 0 obj
<</Type /XRef>>
{addedContent}endobj
2 0 obj
<</Length 15>>
endobj";

    // Strict parsing: no objects are produced.
    var strictResult = ReadToEnd(GetScanner(input));
    Assert.Empty(strictResult);

    // Lenient parsing: both objects are produced.
    var lenientResult = ReadToEnd(GetScanner(input, useLenientParsing: true));
    Assert.Equal(2, lenientResult.Count);

    var first = Assert.IsType<DictionaryToken>(lenientResult[0].Data);
    Assert.IsType<NameToken>(first.Data["Type"]);

    var second = Assert.IsType<DictionaryToken>(lenientResult[1].Data);
    Assert.IsType<NumericToken>(second.Data["Length"]);
}
|
||||
|
||||
// Garbage glued to the front of the stream keyword: the strict scanner does not see a stream, the lenient one does.
[Theory]
[InlineData(">>")]
[InlineData("randomstring")]
public void ReadsIndirectObjectsStreamWithAddedContentBeforeStream(string addedContent)
{
    // The added content is prepended to the stream keyword with no separator.
    string input = @$"1 0 obj
<</length 4>>
{addedContent}stream
aaaa
endstream
endobj
2 0 obj
<</Length 15>>
endobj";

    // Strict parsing still returns two objects, but the first is not a stream.
    var strictResult = ReadToEnd(GetScanner(input));
    Assert.Equal(2, strictResult.Count);

    // The first object ends up as the endstream operator because the parser keeps the last token
    // read inside the object. Keeping the first token instead would be a defensible alternative.
    var lastOperator = Assert.IsType<OperatorToken>(strictResult[0].Data);
    Assert.Equal("endstream", lastOperator.Data);

    var strictSecond = Assert.IsType<DictionaryToken>(strictResult[1].Data);
    Assert.IsType<NumericToken>(strictSecond.Data["Length"]);

    // Lenient parsing recognises the stream despite the prepended content.
    var lenientResult = ReadToEnd(GetScanner(input, useLenientParsing: true));
    Assert.Equal(2, lenientResult.Count);
    Assert.IsType<StreamToken>(lenientResult[0].Data);

    var lenientSecond = Assert.IsType<DictionaryToken>(lenientResult[1].Data);
    Assert.IsType<NumericToken>(lenientSecond.Data["Length"]);
}
|
||||
|
||||
// Builds a PdfTokenScanner over the bytes of the given string for use in these tests.
//   s                 - raw PDF text to scan.
//   locationProvider  - object location provider to use; a fresh TestObjectLocationProvider when null.
//   useLenientParsing - when false the scanner gets ParsingOptions.LenientParsingOff; when true it gets
//                       a default-constructed ParsingOptions (presumably lenient by default - confirm
//                       against ParsingOptions).
private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null, bool useLenientParsing = false)
{
    var input = StringBytesTestConverter.Convert(s, false);

    // Only the flag-driven options are passed on; the earlier hard-coded LenientParsingOff argument
    // is dropped so that useLenientParsing actually takes effect.
    var parsingOptions = useLenientParsing ? new ParsingOptions() : ParsingOptions.LenientParsingOff;

    return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
        new TestFilterProvider(), NoOpEncryptionHandler.Instance, parsingOptions);
}
|
||||
|
||||
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
|
||||
|
||||
@@ -117,6 +117,11 @@
|
||||
/// </summary>
|
||||
public static readonly OperatorToken Xref = new OperatorToken("xref");
|
||||
|
||||
/// <summary>
|
||||
/// Cross reference section offset.
|
||||
/// </summary>
|
||||
public static readonly OperatorToken StartXref = new OperatorToken("startxref");
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Data { get; }
|
||||
|
||||
@@ -163,6 +168,7 @@
|
||||
"Tf" => Tf,
|
||||
"W*" => WStar,
|
||||
"xref" => Xref,
|
||||
"startxref" => StartXref,
|
||||
_ => new OperatorToken(data.ToString())
|
||||
};
|
||||
}
|
||||
|
||||
@@ -135,7 +135,7 @@
|
||||
|
||||
var readStream = false;
|
||||
// Read all tokens between obj and endobj.
|
||||
while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
|
||||
while (coreTokenScanner.MoveNext() && !IsToken(coreTokenScanner, OperatorToken.EndObject, out _))
|
||||
{
|
||||
if (coreTokenScanner.CurrentToken is CommentToken)
|
||||
{
|
||||
@@ -171,7 +171,35 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.StartStream))
|
||||
if (IsToken(coreTokenScanner, OperatorToken.Xref, out _) || IsToken(coreTokenScanner, OperatorToken.StartXref, out _))
|
||||
{
|
||||
if (readStream && readTokens[0] is StreamToken streamRead)
|
||||
{
|
||||
readTokens.Clear();
|
||||
readTokens.Add(streamRead);
|
||||
coreTokenScanner.Seek(previousTokenPositions[2]);
|
||||
break;
|
||||
}
|
||||
|
||||
if (readTokens.Count == 1)
|
||||
{
|
||||
// An xref or startxref operator was encountered directly after the object's single token, i.e. the endobj operator is missing.
|
||||
var actualReference = new IndirectReference(objectNumber.Int, generation.Int);
|
||||
var actualToken = encryptionHandler.Decrypt(actualReference, readTokens[0]);
|
||||
|
||||
CurrentToken = new ObjectToken(startPosition, actualReference, actualToken);
|
||||
readTokens.Clear();
|
||||
coreTokenScanner.Seek(previousTokenPositions[2]);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// This should never happen.
|
||||
Debug.Assert(false, "Encountered a '{' operator before the end of the previous object.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (IsToken(coreTokenScanner, OperatorToken.StartStream, out var actualStartStreamPosition))
|
||||
{
|
||||
var streamIdentifier = new IndirectReference(objectNumber.Long, generation.Int);
|
||||
|
||||
@@ -185,7 +213,7 @@
|
||||
callingObject = streamIdentifier;
|
||||
|
||||
// Read stream: special case.
|
||||
if (TryReadStream(coreTokenScanner.CurrentTokenStart, getLengthFromFile, out var stream))
|
||||
if (TryReadStream(actualStartStreamPosition.Value, getLengthFromFile, out var stream))
|
||||
{
|
||||
readTokens.Clear();
|
||||
readTokens.Add(stream);
|
||||
@@ -212,7 +240,7 @@
|
||||
previousTokenPositions[2] = coreTokenScanner.CurrentTokenStart;
|
||||
}
|
||||
|
||||
if (!readStream && !ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
|
||||
if (!readStream && !IsToken(coreTokenScanner, OperatorToken.EndObject, out _))
|
||||
{
|
||||
readTokens.Clear();
|
||||
return false;
|
||||
@@ -263,6 +291,24 @@
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks whether the scanner's current token is the expected operator and reports where that operator starts.
/// </summary>
/// <param name="scanner">The scanner whose current token is inspected.</param>
/// <param name="token">The operator to look for.</param>
/// <param name="actualTokenStart">
/// When the method returns <see langword="true"/>, the start offset of the operator; otherwise <see langword="null"/>.
/// </param>
/// <returns>
/// <see langword="true"/> if the current token is the same instance as <paramref name="token"/>, or, when lenient
/// parsing is enabled, if the current token is an operator whose text ends with the expected operator's text.
/// </returns>
private bool IsToken(CoreTokenScanner scanner, OperatorToken token, [NotNullWhen(true)] out long? actualTokenStart)
{
    // Exact match: the scanner produced the expected operator instance.
    if (ReferenceEquals(scanner.CurrentToken, token))
    {
        actualTokenStart = scanner.CurrentTokenStart;
        return true;
    }

    // Lenient match: the expected operator has other content glued to its front.
    // PDF keywords are raw bytes rather than linguistic text, so the suffix check is ordinal; a
    // culture-sensitive comparison could skip ignorable characters and break the offset arithmetic below.
    if (parsingOptions.UseLenientParsing
        && scanner.CurrentToken is OperatorToken opToken
        && opToken.Data.EndsWith(token.Data, System.StringComparison.Ordinal))
    {
        // Skip the prepended content so the offset points at the real operator.
        // NOTE(review): assumes each character of the operator text is one byte in the input - confirm.
        actualTokenStart = scanner.CurrentTokenStart + opToken.Data.Length - token.Data.Length;
        return true;
    }

    actualTokenStart = null;
    return false;
}
|
||||
|
||||
private bool TryReadStream(long startStreamTokenOffset, bool getLength, [NotNullWhen(true)] out StreamToken? stream)
|
||||
{
|
||||
stream = null;
|
||||
|
||||
Reference in New Issue
Block a user