start migrating cross reference parsing process to token scanner

This commit is contained in:
Eliot Jones
2018-01-04 21:09:47 +00:00
parent 1c41618950
commit 6b4bd8689f
22 changed files with 758 additions and 377 deletions

View File

@@ -2,6 +2,7 @@
{
using System;
using Exceptions;
using Pdf.Parser.FileStructure;
using Pdf.Parser.Parts;
using Xunit;

View File

@@ -1,77 +1,94 @@
namespace UglyToad.Pdf.Tests.Parser.Parts.CrossReference
namespace UglyToad.Pdf.Tests.Parser.Parts.FileStructure
{
using System;
using System.Linq;
using IO;
using Exceptions;
using Pdf.Cos;
using Pdf.Parser.Parts.CrossReference;
using Pdf.Util;
using Pdf.Parser.FileStructure;
using Pdf.Tokenization.Scanner;
using Xunit;
public class CrossReferenceTableParserTests
{
private readonly CosObjectPool objectPool = new CosObjectPool();
private readonly CrossReferenceTableParser parser = new CrossReferenceTableParser(new TestingLog(),
new TestDictionaryParser(),
new TestBaseParser());
private readonly CrossReferenceTableParser parser = new CrossReferenceTableParser();
[Fact]
public void OffsetNotXrefFalse()
public void ParseNewDefaultTable()
{
var input = GetReader("12 0 obj <<>> endobj xref");
var input = StringBytesTestConverter.Scanner(@"one xref
0 6
0000000003 65535 f
0000000090 00000 n
0000000081 00000 n
0000000000 00007 f
0000000331 00000 n
0000000409 00000 n
var result = parser.TryParse(input, 4, false, objectPool, out var _);
trailer
<< >>");
Assert.False(result);
var result = parser.Parse(input, 4, false);
Assert.Equal(4, result.ObjectOffsets.Count);
}
[Fact]
public void OffsetXButNotXrefFalse()
public void OffsetNotXrefThrows()
{
var input = GetReader("12 0 obj <<>> endobj xref");
Action action = () => parser.Parse(input, 4, false);
Assert.Throws<PdfDocumentFormatException>(action);
}
[Fact]
public void OffsetXButNotXrefThrows()
{
var input = GetReader(@"xtable
trailer");
var result = parser.TryParse(input, 0, false, objectPool, out var _);
Action action = () => parser.Parse(input, 0, false);
Assert.False(result);
Assert.Throws<PdfDocumentFormatException>(action);
}
[Fact]
public void EmptyTableFalse()
public void EmptyTableReturnsEmpty()
{
var input = GetReader(@"xref
trailer");
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var _);
var result = parser.Parse(input, 0, false);
Assert.False(result);
Assert.Empty(result.ObjectOffsets);
}
[Fact]
public void InvalidSubsectionDefinitionLenientTrue()
public void InvalidSubsectionDefinitionLenientSkips()
{
var input = GetReader(@"xref
ab 12
trailer
<<>>");
var result = parser.TryParse(input, 0, true, objectPool, out var _);
var result = parser.Parse(input, 0, true);
Assert.True(result);
Assert.Empty(result.ObjectOffsets);
}
[Fact]
public void InvalidSubsectionDefinitionNotLenientFalse()
public void InvalidSubsectionDefinitionNotLenientThrows()
{
var input = GetReader(@"xref
ab 12
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var _);
Action action = () => parser.Parse(input, 0, false);
Assert.False(result);
Assert.Throws<PdfDocumentFormatException>(action);
}
[Fact]
@@ -83,15 +100,11 @@ trailer
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var table);
Assert.True(result);
var built = table.AsCrossReferenceTablePart();
Assert.Empty(built.ObjectOffsets);
Assert.Equal(0, built.Offset);
Assert.Equal(CrossReferenceType.Table, built.Type);
var result = parser.Parse(input, 0, false);
Assert.Empty(result.ObjectOffsets);
Assert.Equal(0, result.Offset);
Assert.Equal(CrossReferenceType.Table, result.Type);
}
[Fact]
@@ -105,15 +118,11 @@ trailer
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var table);
var result = parser.Parse(input, 0, false);
Assert.Equal(2, result.ObjectOffsets.Count);
Assert.True(result);
var built = table.AsCrossReferenceTablePart();
Assert.Equal(2, built.ObjectOffsets.Count);
var results = built.ObjectOffsets.Select(x => new {x.Key.Number, x.Key.Generation, x.Value}).ToList();
var results = result.ObjectOffsets.Select(x => new {x.Key.Number, x.Key.Generation, x.Value}).ToList();
Assert.Equal(100, results[0].Value);
Assert.Equal(1, results[0].Number);
@@ -134,15 +143,11 @@ trailer
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var table);
var result = parser.Parse(input, 0, false);
Assert.Equal(2, result.ObjectOffsets.Count);
Assert.True(result);
var built = table.AsCrossReferenceTablePart();
Assert.Equal(2, built.ObjectOffsets.Count);
var results = built.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
var results = result.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
Assert.Equal(190, results[0].Value);
Assert.Equal(15, results[0].Number);
@@ -164,15 +169,11 @@ trailer
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var table);
var result = parser.Parse(input, 0, false);
Assert.Equal(2, result.ObjectOffsets.Count);
Assert.True(result);
var built = table.AsCrossReferenceTablePart();
Assert.Equal(2, built.ObjectOffsets.Count);
var results = built.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
var results = result.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
Assert.Equal(190, results[0].Value);
Assert.Equal(15, results[0].Number);
@@ -198,15 +199,11 @@ trailer
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var table);
var result = parser.Parse(input, 0, false);
Assert.Equal(5, result.ObjectOffsets.Count);
Assert.True(result);
var built = table.AsCrossReferenceTablePart();
Assert.Equal(5, built.ObjectOffsets.Count);
var results = built.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
var results = result.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
Assert.Equal(100, results[0].Value);
Assert.Equal(1, results[0].Number);
@@ -239,9 +236,9 @@ trailer
trailer
<<>>");
Action action = () => parser.TryParse(input, 0, false, objectPool, out var _);
Action action = () => parser.Parse(input, 0, false);
Assert.Throws<InvalidOperationException>(action);
Assert.Throws<PdfDocumentFormatException>(action);
}
[Fact]
@@ -254,24 +251,24 @@ trailer
trailer
<<>>");
Action action = () => parser.TryParse(input, 0, false, objectPool, out var _);
Action action = () => parser.Parse(input, 0, false);
Assert.Throws<InvalidOperationException>(action);
Assert.Throws<PdfDocumentFormatException>(action);
}
[Fact]
public void ShortLineInTableReturnsFalse()
public void ShortLineInTableReturnsThrows()
{
var input = GetReader(@"xref
15 2
000000019000000 n
019 n
0000000250 00032 n
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var _);
Action action = () => parser.Parse(input, 0, false);
Assert.False(result);
Assert.Throws<PdfDocumentFormatException>(action);
}
[Fact]
@@ -285,16 +282,14 @@ trailer
trailer
<<>>");
var result = parser.TryParse(input, 0, false, objectPool, out var table);
Assert.True(result);
Assert.Equal(2, table.AsCrossReferenceTablePart().ObjectOffsets.Count);
var result = parser.Parse(input, 0, false);
Assert.Equal(2, result.ObjectOffsets.Count);
}
private static IRandomAccessRead GetReader(string input)
private static CoreTokenScanner GetReader(string input)
{
return new RandomAccessBuffer(OtherEncodings.StringAsLatin1Bytes(input));
return StringBytesTestConverter.Scanner(input);
}
}
}

View File

@@ -2,7 +2,7 @@
{
using System;
using Exceptions;
using Pdf.Parser.Parts;
using Pdf.Parser.FileStructure;
using Pdf.Tokenization.Scanner;
using Xunit;

View File

@@ -1,4 +1,4 @@
namespace UglyToad.Pdf.Parser.Parts
namespace UglyToad.Pdf.Content
{
internal class HeaderVersion
{

View File

@@ -5,6 +5,7 @@
using System.Collections;
using System.Text;
using Cos;
using Tokenization.Tokens;
using Util.JetBrains.Annotations;
internal class PdfDictionary : CosBase, IReadOnlyDictionary<CosName, CosBase>
@@ -105,5 +106,15 @@
throw new NotImplementedException();
}
#endregion
/// <summary>
/// Creates a <see cref="PdfDictionary"/> for the provided dictionary token.
/// </summary>
/// <param name="token">The dictionary token to convert, must not be null.</param>
/// <returns>A newly constructed <see cref="PdfDictionary"/>.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="token"/> is null.</exception>
internal static PdfDictionary FromDictionaryToken(DictionaryToken token)
{
if (token == null)
{
throw new ArgumentNullException(nameof(token));
}
// NOTE(review): nothing is read from the token here, the returned dictionary is always empty.
// Presumably copying the token's entries is still to be implemented - confirm before relying on
// any key (for example the trailer's previous cross-reference offset) being present in the result.
return new PdfDictionary();
}
}
}

View File

@@ -14,7 +14,7 @@
public PdfDictionary Dictionary { get; set; }
public CrossReferenceType XRefType { get; set; }
public void Add(long objectId, int generationNumber, long offset)
{
CosObjectKey objKey = new CosObjectKey(objectId, generationNumber);
@@ -25,7 +25,7 @@
}
}
public CrossReferenceTablePart AsCrossReferenceTablePart()
public CrossReferenceTablePart Build()
{
return new CrossReferenceTablePart(objects, Offset, Previous, Dictionary, XRefType);
}

View File

@@ -0,0 +1,25 @@
namespace UglyToad.Pdf.Parser.FileStructure
{
using Parts.CrossReference;
/// <summary>
/// Thin wrapper around <see cref="XrefOffsetValidator"/> which resolves the cross-reference
/// offset that should be used for parsing.
/// </summary>
internal class CrossReferenceOffsetValidator
{
    private readonly XrefOffsetValidator xrefValidator;

    /// <summary>
    /// Creates a validator which delegates the offset check to <paramref name="offsetValidator"/>.
    /// </summary>
    public CrossReferenceOffsetValidator(XrefOffsetValidator offsetValidator)
    {
        xrefValidator = offsetValidator;
    }

    /// <summary>
    /// Runs the wrapped check for the supplied offset.
    /// </summary>
    /// <param name="crossReferenceOffset">The offset read from the file.</param>
    /// <param name="isLenientParsing">Passed through unchanged to the wrapped validator.</param>
    /// <returns>
    /// The value produced by the wrapped validator when it is greater than -1,
    /// otherwise <paramref name="crossReferenceOffset"/> unchanged.
    /// </returns>
    public long Validate(long crossReferenceOffset, bool isLenientParsing)
    {
        long corrected = xrefValidator.CheckXRefOffset(crossReferenceOffset, isLenientParsing);

        // Values of -1 or lower mean the wrapped validator supplied no replacement offset.
        return corrected > -1 ? corrected : crossReferenceOffset;
    }
}
}

View File

@@ -1,14 +1,20 @@
namespace UglyToad.Pdf.Parser.Parts.CrossReference
namespace UglyToad.Pdf.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using ContentStream;
using ContentStream.TypedAccessors;
using Cos;
using Exceptions;
using IO;
using Logging;
using Parts;
using Parts.CrossReference;
using Tokenization.Scanner;
using Tokenization.Tokens;
using UglyToad.Pdf.Parser.Parts.CrossReference;
internal class FileCrossReferenceTableParser
internal class CrossReferenceParser
{
private const int X = 'x';
@@ -18,11 +24,13 @@
private readonly CosStreamParser streamParser;
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
private readonly CrossReferenceTableParser crossReferenceTableParser;
private readonly OldCrossReferenceTableParser oldCrossReferenceTableParser;
public FileCrossReferenceTableParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
public CrossReferenceParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
CosStreamParser streamParser,
CrossReferenceStreamParser crossReferenceStreamParser,
CrossReferenceTableParser crossReferenceTableParser)
CrossReferenceTableParser crossReferenceTableParser,
OldCrossReferenceTableParser oldCrossReferenceTableParser)
{
this.log = log;
this.dictionaryParser = dictionaryParser;
@@ -30,6 +38,41 @@
this.streamParser = streamParser;
this.crossReferenceStreamParser = crossReferenceStreamParser;
this.crossReferenceTableParser = crossReferenceTableParser;
this.oldCrossReferenceTableParser = oldCrossReferenceTableParser;
}
/// <summary>
/// Token scanner based cross-reference parsing (migration in progress).
/// Starting from <paramref name="crossReferenceLocation"/> each cross-reference table is parsed and the
/// chain is followed through the previous-table offset stored in each trailer dictionary.
/// </summary>
/// <param name="crossReferenceLocation">The offset of the first cross-reference section to read.</param>
/// <param name="scanner">The scanner used to read tokens, it is repositioned by this method.</param>
/// <param name="isLenientParsing">Passed through to the table parser.</param>
/// <returns>Currently always <see langword="null"/>, no combined table is built yet.</returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown if the token at a cross-reference location is neither the "xref" operator nor a number.
/// </exception>
public CrossReferenceTable ParseNew(long crossReferenceLocation, ISeekableTokenScanner scanner,
    bool isLenientParsing)
{
    var previousLocation = crossReferenceLocation;

    var visitedCrossReferences = new HashSet<long>();

    while (previousLocation >= 0)
    {
        // Add returns false for a location already processed: a circular chain of previous
        // offsets would otherwise never terminate.
        if (!visitedCrossReferences.Add(previousLocation))
        {
            break;
        }

        // Move to the section currently being followed, not back to the first one.
        scanner.Seek(previousLocation);

        scanner.MoveNext();

        if (scanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
        {
            var table = crossReferenceTableParser.Parse(scanner, previousLocation, isLenientParsing);

            // -1 ends the loop when the trailer does not reference an earlier section.
            previousLocation = table.Dictionary.GetLongOrDefault(CosName.PREV, -1);
        }
        else if (scanner.CurrentToken is NumericToken)
        {
            // A number here is the start of an object (a cross-reference stream), not handled yet.
            break;
        }
        else
        {
            throw new PdfDocumentFormatException($"The xref object was not a stream or a table, was instead: {scanner.CurrentToken}.");
        }
    }

    return null;
}
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
@@ -62,11 +105,11 @@
{
// xref table and trailer
// use existing parser to parse xref table
if (!crossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
if (!oldCrossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
{
throw new InvalidOperationException($"Expected trailer object at position: {reader.GetPosition()}");
}
PdfDictionary trailer = tableBuilder.Dictionary;
CrossReferenceTablePart streamPart = null;
// check for a XRef stream, it may contain some object ids of compressed objects
@@ -128,7 +171,7 @@
tableBuilder.Previous = tableBuilder.Dictionary.GetLongOrDefault(CosName.PREV);
table.Add(tableBuilder.AsCrossReferenceTablePart());
table.Add(tableBuilder.Build());
if (streamPart != null)
{

View File

@@ -0,0 +1,220 @@
namespace UglyToad.Pdf.Parser.FileStructure
{
using System.Collections.Generic;
using System.Linq;
using ContentStream;
using Cos;
using Exceptions;
using Parts.CrossReference;
using Tokenization;
using Tokenization.Scanner;
using Tokenization.Tokens;
/// <summary>
/// Reads a cross-reference table (the "xref" keyword, its subsections and the following trailer
/// dictionary) from a token scanner.
/// </summary>
internal class CrossReferenceTableParser
{
    private const string InUseEntry = "n";
    private const string FreeEntry = "f";

    /// <summary>
    /// Parses the cross-reference table located at <paramref name="offset"/>.
    /// </summary>
    /// <param name="scanner">The scanner to read from, it is moved to <paramref name="offset"/> if not already there.</param>
    /// <param name="offset">The offset of the table, also recorded on the returned part.</param>
    /// <param name="isLenientParsing">When true some malformed lines are skipped rather than causing an exception.</param>
    /// <returns>The table part containing the in-use object offsets and the trailer dictionary.</returns>
    /// <exception cref="PdfDocumentFormatException">
    /// Thrown for an unexpected operator at the table position, a malformed subsection or entry line
    /// or a missing trailer dictionary.
    /// </exception>
    public CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
    {
        var builder = new CrossReferenceTablePartBuilder
        {
            Offset = offset,
            XRefType = CrossReferenceType.Table
        };

        if (scanner.CurrentPosition != offset)
        {
            scanner.Seek(offset);
        }

        scanner.MoveNext();

        if (scanner.CurrentToken is OperatorToken operatorToken)
        {
            if (operatorToken.Data == "xref")
            {
                scanner.MoveNext();
            }
            else
            {
                throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
            }
        }

        if (scanner.CurrentToken is NumericToken firstObjectNumber)
        {
            if (!scanner.TryReadToken(out NumericToken objectCount))
            {
                throw new PdfDocumentFormatException($"Unexpected token following xref and {firstObjectNumber}. We found: {scanner.CurrentToken}.");
            }

            var definition = new TableSubsectionDefinition(firstObjectNumber.Long, objectCount.Int);

            // Entries are line based so line breaks must be surfaced as tokens while reading the table.
            var tokenizer = new CrossReferenceEndOfLineTokenizer();

            scanner.RegisterCustomTokenizer((byte)'\r', tokenizer);
            scanner.RegisterCustomTokenizer((byte)'\n', tokenizer);

            try
            {
                var readingLine = false;
                var tokens = new List<IToken>();
                var count = 0;
                while (scanner.MoveNext())
                {
                    if (scanner.CurrentToken is EndOfLineToken)
                    {
                        // Consecutive line breaks (for example "\r\n") only end a line once.
                        if (!readingLine)
                        {
                            continue;
                        }

                        readingLine = false;

                        count = ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);

                        tokens.Clear();

                        continue;
                    }

                    if (scanner.CurrentToken is CommentToken)
                    {
                        continue;
                    }

                    var isLineOperator = scanner.CurrentToken is OperatorToken op && (op.Data == FreeEntry || op.Data == InUseEntry);

                    // Anything which cannot be part of an entry line ends the table, e.g. "trailer".
                    if (!(scanner.CurrentToken is NumericToken) && !isLineOperator)
                    {
                        break;
                    }

                    readingLine = true;
                    tokens.Add(scanner.CurrentToken);
                }

                // The last line may not have been terminated by a line break before the table ended.
                if (tokens.Count > 0)
                {
                    ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);
                }
            }
            finally
            {
                // Always restore the scanner, otherwise a failed parse leaves it emitting end of line tokens.
                scanner.DeregisterCustomTokenizer(tokenizer);
            }
        }

        builder.Dictionary = ParseTrailer(scanner, isLenientParsing);

        return builder.Build();
    }

    /// <summary>
    /// Interprets the tokens of a single line of the table.
    /// </summary>
    /// <param name="tokens">The numeric and "n"/"f" tokens which made up the line.</param>
    /// <param name="scanner">Used for its current position when validating object offsets.</param>
    /// <param name="builder">Receives any in-use entry found on the line.</param>
    /// <param name="isLenientParsing">When true malformed entry lines are skipped instead of throwing.</param>
    /// <param name="objectCount">The number of entries already read for the current subsection.</param>
    /// <param name="definition">The current subsection, replaced when the line starts a new subsection.</param>
    /// <returns>The number of entries read for the (possibly new) current subsection.</returns>
    private static int ProcessTokens(List<IToken> tokens, ISeekableTokenScanner scanner, CrossReferenceTablePartBuilder builder, bool isLenientParsing,
        int objectCount, ref TableSubsectionDefinition definition)
    {
        string GetErrorMessage()
        {
            var representation = "Invalid line format in xref table: [" + string.Join(", ", tokens.Select(x => x.ToString())) + "]";

            return representation;
        }

        // Once every entry of the subsection has been read the only valid line is a new subsection header.
        if (objectCount == definition.Count)
        {
            if (tokens.Count == 2)
            {
                if (tokens[0] is NumericToken newFirstObjectToken && tokens[1] is NumericToken newObjectCountToken)
                {
                    definition = new TableSubsectionDefinition(newFirstObjectToken.Long, newObjectCountToken.Int);

                    return 0;
                }

                throw new PdfDocumentFormatException($"Found a line with 2 unexpected entries in the cross reference table: {tokens[0]}, {tokens[1]}.");
            }

            // The line may hold any number of tokens other than 2, so it must not be indexed blindly.
            throw new PdfDocumentFormatException(GetErrorMessage());
        }

        if (tokens.Count <= 2)
        {
            if (!isLenientParsing)
            {
                throw new PdfDocumentFormatException(GetErrorMessage());
            }

            return objectCount;
        }

        var lastToken = tokens[tokens.Count - 1];

        if (lastToken is OperatorToken operatorToken)
        {
            // Free entries are counted towards the subsection but not stored.
            if (operatorToken.Data == FreeEntry)
            {
                return objectCount + 1;
            }

            if (operatorToken.Data != InUseEntry)
            {
                if (!isLenientParsing)
                {
                    throw new PdfDocumentFormatException(GetErrorMessage());
                }

                return objectCount;
            }

            if (tokens[0] is NumericToken offset && tokens[1] is NumericToken generationNumber)
            {
                // An object cannot be located between the start of this table and the position read so far.
                if (offset.Long >= builder.Offset && offset.Long <= scanner.CurrentPosition)
                {
                    throw new PdfDocumentFormatException($"Object offset {offset} is within its own cross-reference table for object {definition.FirstNumber + objectCount}");
                }

                builder.Add(definition.FirstNumber + objectCount, generationNumber.Int, offset.Long);

                return objectCount + 1;
            }
        }
        else
        {
            if (!isLenientParsing)
            {
                throw new PdfDocumentFormatException(GetErrorMessage());
            }
        }

        return objectCount;
    }

    /// <summary>
    /// Reads the trailer dictionary which follows the table entries.
    /// </summary>
    /// <param name="scanner">The scanner, expected to have the "trailer" operator as its current token.</param>
    /// <param name="isLenientParsing">When true and the current token is not "trailer" the scanner is advanced until one is found.</param>
    /// <returns>The dictionary created from the token following "trailer".</returns>
    /// <exception cref="PdfDocumentFormatException">Thrown if no trailer dictionary could be read.</exception>
    private static PdfDictionary ParseTrailer(ISeekableTokenScanner scanner, bool isLenientParsing)
    {
        if (scanner.CurrentToken is OperatorToken trailerToken && trailerToken.Data == "trailer")
        {
            if (!scanner.TryReadToken(out DictionaryToken trailerDictionary))
            {
                throw new PdfDocumentFormatException($"Expected to find a dictionary in the trailer but instead found: {scanner.CurrentToken}.");
            }

            return PdfDictionary.FromDictionaryToken(trailerDictionary);
        }

        if (isLenientParsing)
        {
            var foundTrailer = false;
            while (scanner.MoveNext())
            {
                if (scanner.CurrentToken is OperatorToken op && op.Data == "trailer")
                {
                    foundTrailer = true;

                    break;
                }
            }

            if (foundTrailer && scanner.TryReadToken(out DictionaryToken trailerDictionary))
            {
                return PdfDictionary.FromDictionaryToken(trailerDictionary);
            }
        }

        throw new PdfDocumentFormatException("No trailer dictionary was present.");
    }
}
}

View File

@@ -1,7 +1,8 @@
namespace UglyToad.Pdf.Parser.Parts
namespace UglyToad.Pdf.Parser.FileStructure
{
using System;
using System.Text.RegularExpressions;
using Content;
using Exceptions;
using Logging;
using Tokenization.Scanner;

View File

@@ -1,4 +1,4 @@
namespace UglyToad.Pdf.Parser.Parts
namespace UglyToad.Pdf.Parser.FileStructure
{
using System;
using System.Collections.Generic;

View File

@@ -0,0 +1,219 @@
namespace UglyToad.Pdf.Parser.FileStructure
{
namespace UglyToad.Pdf.Parser.Parts.CrossReference
{
using System;
using ContentStream;
using ContentStream.TypedAccessors;
using Cos;
using global::UglyToad.Pdf.Parser.Parts;
using global::UglyToad.Pdf.Parser.Parts.CrossReference;
using IO;
using Logging;
using Util;
/// <summary>
/// The previous cross-reference table parser which reads characters directly from an
/// <see cref="IRandomAccessRead"/> instead of using a token scanner.
/// </summary>
internal class OldCrossReferenceTableParser
{
// The final field of a table line for an object which is in use.
private const string InUseEntry = "n";
// The third field of a table line for a free object.
private const string FreeEntry = "f";
private readonly ILog log;
private readonly IDictionaryParser dictionaryParser;
private readonly IBaseParser baseParser;
/// <summary>
/// Creates the parser with the log used for warnings and the parsers used to read the trailer dictionary.
/// </summary>
public OldCrossReferenceTableParser(ILog log, IDictionaryParser dictionaryParser, IBaseParser baseParser)
{
this.log = log;
this.dictionaryParser = dictionaryParser;
this.baseParser = baseParser;
}
/// <summary>
/// Attempts to read a cross-reference table and its trailer from the current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">The reader, expected to be positioned at the "xref" keyword.</param>
/// <param name="offset">The offset recorded on the builder and used in messages, the reader is not moved to it here.</param>
/// <param name="isLenientParsing">When true an unreadable subsection definition ends the table rather than failing the parse.</param>
/// <param name="pool">Passed to the dictionary parser when reading the trailer.</param>
/// <param name="builder">
/// The builder containing the in-use entries and trailer dictionary. It is null if false was returned
/// before any subsection was read; it has already been assigned when false is returned for an
/// unreadable subsection definition in non-lenient mode.
/// </param>
/// <returns>
/// False if the data does not start with "xref", the table is empty ("trailer" directly follows "xref")
/// or a subsection definition could not be read in non-lenient mode; otherwise true.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown for an in-use entry whose offset lies within the table itself, an entry with unparseable numbers,
/// an entry whose indicator is neither "n" nor "f" or if the trailer could not be read.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
builder = null;
// Remembered so entries pointing back into the table itself can be detected below.
var tableStartOffset = source.GetPosition();
if (source.Peek() != 'x')
{
return false;
}
var xref = ReadHelper.ReadString(source);
if (!xref.Trim().Equals("xref"))
{
return false;
}
// check for trailer after xref
var str = ReadHelper.ReadString(source);
byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
// The string was only read as a look-ahead so move back by its length in bytes.
source.Rewind(b.Length);
if (str.StartsWith("trailer"))
{
log.Warn("skipping empty xref table");
return false;
}
builder = new CrossReferenceTablePartBuilder
{
Offset = offset,
XRefType = CrossReferenceType.Table
};
// Tables can have multiple sections. Each starts with a starting object id and a count.
while (true)
{
if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
{
log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");
if (isLenientParsing)
{
break;
}
return false;
}
var currentObjectId = subsectionDefinition.FirstNumber;
ReadHelper.SkipSpaces(source);
for (var i = 0; i < subsectionDefinition.Count; i++)
{
if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
{
break;
}
// A 't' is treated as the start of "trailer", the subsection had fewer lines than it declared.
if (source.Peek() == 't')
{
break;
}
//Ignore table contents
var currentLine = ReadHelper.ReadLine(source);
var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
if (splitString.Length < 3)
{
log.Warn("invalid xref line: " + currentLine);
break;
}
// This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
if (splitString[splitString.Length - 1].Equals(InUseEntry))
{
try
{
var objectOffset = long.Parse(splitString[0]);
if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
{
// PDFBOX-3923: offset points inside this table - that can't be good
throw new InvalidOperationException(
$"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
}
var generation = int.Parse(splitString[1]);
builder.Add(currentObjectId, generation, objectOffset);
}
catch (FormatException e)
{
throw new InvalidOperationException("Bad", e);
}
}
// NOTE(review): the in-use check above inspects the last field while this check inspects the third
// field, so a line with more than 3 fields ending in "f" throws - confirm whether this is intended.
else if (!splitString[2].Equals(FreeEntry))
{
throw new InvalidOperationException(
$"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
}
currentObjectId++;
ReadHelper.SkipSpaces(source);
}
ReadHelper.SkipSpaces(source);
// Another subsection is only read when the next content is a digit, otherwise the table is finished.
if (!ReadHelper.IsDigit(source))
{
break;
}
}
if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
{
throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
}
builder.Dictionary = trailer;
builder.Previous = trailer.GetLongOrDefault(CosName.PREV);
return true;
}
/// <summary>
/// Attempts to read the "trailer" keyword and the dictionary following it from the current position.
/// </summary>
/// <param name="source">The reader positioned after the table entries.</param>
/// <param name="isLenientParsing">When true leading lines which start with a digit are skipped first.</param>
/// <param name="pool">Passed to the dictionary parser.</param>
/// <param name="trailer">The parsed dictionary, null if false is returned.</param>
/// <returns>False if the next content does not start with "trailer", otherwise true.</returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
trailer = null;
// parse the last trailer.
var trailerOffset = source.GetPosition();
// PDFBOX-1739 skip extra xref entries in RegisSTAR documents
if (isLenientParsing)
{
int nextCharacter = source.Peek();
while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
{
// NOTE(review): this block is empty apart from the commented out warning, it has no effect.
if (source.GetPosition() == trailerOffset)
{
// warn only the first time
//LOG.warn("Expected trailer object at position " + trailerOffset
// + ", keep trying");
}
ReadHelper.ReadLine(source);
nextCharacter = source.Peek();
}
}
if (source.Peek() != 't')
{
return false;
}
//read "trailer"
long currentOffset = source.GetPosition();
string nextLine = ReadHelper.ReadLine(source);
if (!nextLine.Trim().Equals("trailer"))
{
// in some cases the EOL is missing and the trailer immediately
// continues with "<<" or with a blank character
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
if (nextLine.StartsWith("trailer"))
{
// we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes
int len = "trailer".Length;
// jump back right after "trailer"
source.Seek(currentOffset + len);
}
else
{
return false;
}
}
// in some cases the EOL is missing and the trailer continues with " <<"
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
ReadHelper.SkipSpaces(source);
PdfDictionary parsedTrailer = dictionaryParser.Parse(source, baseParser, pool);
trailer = parsedTrailer;
ReadHelper.SkipSpaces(source);
return true;
}
}
}
}

View File

@@ -1,10 +1,11 @@
namespace UglyToad.Pdf.Parser.Parts.CrossReference
namespace UglyToad.Pdf.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using System.Linq;
using Cos;
using IO;
using Parts;
using Util;
internal class XrefCosOffsetChecker

View File

@@ -1,4 +1,4 @@
namespace UglyToad.Pdf.Parser.Parts.CrossReference
namespace UglyToad.Pdf.Parser.FileStructure
{
using System;
using System.Collections.Generic;
@@ -6,6 +6,7 @@
using Cos;
using IO;
using Logging;
using Parts;
internal class XrefOffsetValidator
{

View File

@@ -139,7 +139,7 @@
}
}
return builder.AsCrossReferenceTablePart();
return builder.Build();
}
private static List<long> GetObjectNumbers(PdfRawStream stream)

View File

@@ -1,213 +0,0 @@
namespace UglyToad.Pdf.Parser.Parts.CrossReference
{
using System;
using ContentStream;
using ContentStream.TypedAccessors;
using Cos;
using IO;
using Logging;
using Util;
internal class CrossReferenceTableParser
{
private const string InUseEntry = "n";
private const string FreeEntry = "f";
private readonly ILog log;
private readonly IDictionaryParser dictionaryParser;
private readonly IBaseParser baseParser;
public CrossReferenceTableParser(ILog log, IDictionaryParser dictionaryParser, IBaseParser baseParser)
{
this.log = log;
this.dictionaryParser = dictionaryParser;
this.baseParser = baseParser;
}
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
builder = null;
var tableStartOffset = source.GetPosition();
if (source.Peek() != 'x')
{
return false;
}
var xref = ReadHelper.ReadString(source);
if (!xref.Trim().Equals("xref"))
{
return false;
}
// check for trailer after xref
var str = ReadHelper.ReadString(source);
byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
source.Rewind(b.Length);
if (str.StartsWith("trailer"))
{
log.Warn("skipping empty xref table");
return false;
}
builder = new CrossReferenceTablePartBuilder
{
Offset = offset,
XRefType = CrossReferenceType.Table
};
// Tables can have multiple sections. Each starts with a starting object id and a count.
while (true)
{
if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
{
log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");
if (isLenientParsing)
{
break;
}
return false;
}
var currentObjectId = subsectionDefinition.FirstNumber;
ReadHelper.SkipSpaces(source);
for (var i = 0; i < subsectionDefinition.Count; i++)
{
if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
{
break;
}
if (source.Peek() == 't')
{
break;
}
//Ignore table contents
var currentLine = ReadHelper.ReadLine(source);
var splitString = currentLine.Split(new[] {' '}, StringSplitOptions.RemoveEmptyEntries);
if (splitString.Length < 3)
{
log.Warn("invalid xref line: " + currentLine);
break;
}
// This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
if (splitString[splitString.Length - 1].Equals(InUseEntry))
{
try
{
var objectOffset = long.Parse(splitString[0]);
if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
{
// PDFBOX-3923: offset points inside this table - that can't be good
throw new InvalidOperationException(
$"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
}
var generation = int.Parse(splitString[1]);
builder.Add(currentObjectId, generation, objectOffset);
}
catch (FormatException e)
{
throw new InvalidOperationException("Bad", e);
}
}
else if (!splitString[2].Equals(FreeEntry))
{
throw new InvalidOperationException(
$"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
}
currentObjectId++;
ReadHelper.SkipSpaces(source);
}
ReadHelper.SkipSpaces(source);
if (!ReadHelper.IsDigit(source))
{
break;
}
}
if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
{
throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
}
builder.Dictionary = trailer;
builder.Previous = trailer.GetLongOrDefault(CosName.PREV);
return true;
}
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
trailer = null;
// parse the last trailer.
var trailerOffset = source.GetPosition();
// PDFBOX-1739 skip extra xref entries in RegisSTAR documents
if (isLenientParsing)
{
int nextCharacter = source.Peek();
while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
{
if (source.GetPosition() == trailerOffset)
{
// warn only the first time
//LOG.warn("Expected trailer object at position " + trailerOffset
// + ", keep trying");
}
ReadHelper.ReadLine(source);
nextCharacter = source.Peek();
}
}
if (source.Peek() != 't')
{
return false;
}
//read "trailer"
long currentOffset = source.GetPosition();
string nextLine = ReadHelper.ReadLine(source);
if (!nextLine.Trim().Equals("trailer"))
{
// in some cases the EOL is missing and the trailer immediately
// continues with "<<" or with a blank character
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
if (nextLine.StartsWith("trailer"))
{
// we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes
int len = "trailer".Length;
// jump back right after "trailer"
source.Seek(currentOffset + len);
}
else
{
return false;
}
}
// in some cases the EOL is missing and the trailer continues with " <<"
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
ReadHelper.SkipSpaces(source);
PdfDictionary parsedTrailer = dictionaryParser.Parse(source, baseParser, pool);
trailer = parsedTrailer;
ReadHelper.SkipSpaces(source);
return true;
}
}
}

View File

@@ -5,6 +5,7 @@
using Content;
using ContentStream;
using Cos;
using FileStructure;
using Filters;
using Fonts;
using Fonts.Parser;
@@ -57,10 +58,18 @@
var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
var pool = new CosObjectPool();
// TODO: make this use the scanner.
var validator = new CrossReferenceOffsetValidator(new XrefOffsetValidator(log, reader, container.Get<CosDictionaryParser>(),
container.Get<CosBaseParser>(), pool));
crossReferenceOffset = validator.Validate(crossReferenceOffset, isLenientParsing);
var crossReferenceTable = container.Get<FileCrossReferenceTableParser>()
var crossReferenceTable = container.Get<CrossReferenceParser>()
.Parse(reader, isLenientParsing, crossReferenceOffset, pool);
container.Get<CrossReferenceParser>().ParseNew(crossReferenceOffset, scanner, isLenientParsing);
var filterProvider = container.Get<IFilterProvider>();
var bruteForceSearcher = new BruteForceSearcher(reader);
var pdfObjectParser = new PdfObjectParser(container.Get<ILog>(), container.Get<CosBaseParser>(),

View File

@@ -6,6 +6,7 @@
using IO;
using Logging;
using Parser;
using Parser.FileStructure;
using Parser.Parts;
using Util.JetBrains.Annotations;

View File

@@ -0,0 +1,32 @@
namespace UglyToad.Pdf.Tokenization
{
using IO;
using Tokens;
/// <summary>
/// Tokenizer which turns a carriage return or line feed byte into the shared <see cref="EndOfLineToken"/>.
/// </summary>
internal class CrossReferenceEndOfLineTokenizer : ITokenizer
{
    /// <summary>
    /// Always false for this tokenizer.
    /// </summary>
    public bool ReadsNextByte => false;

    /// <summary>
    /// Produces the end of line token when <paramref name="currentByte"/> is '\r' or '\n'.
    /// </summary>
    /// <param name="currentByte">The byte to inspect.</param>
    /// <param name="inputBytes">Not used, only the current byte is inspected.</param>
    /// <param name="token">The end of line token on success, otherwise null.</param>
    /// <returns>True if the byte was a line break.</returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        var isLineBreak = currentByte == '\r' || currentByte == '\n';

        token = isLineBreak ? EndOfLineToken.Token : null;

        return isLineBreak;
    }
}
/// <summary>
/// Marker token for a line break. It carries no data so a single shared instance is exposed.
/// </summary>
internal class EndOfLineToken : IToken
{
    private static readonly EndOfLineToken Instance = new EndOfLineToken();

    // Private so the shared instance is the only one which can exist.
    private EndOfLineToken()
    {
    }

    /// <summary>
    /// The single instance of this token.
    /// </summary>
    public static EndOfLineToken Token => Instance;
}
}

View File

@@ -21,6 +21,7 @@
private readonly ScannerScope scope;
private readonly IInputBytes inputBytes;
private readonly List<byte> currentBuffer = new List<byte>();
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
public IToken CurrentToken { get; private set; }
public bool TryReadToken<T>(out T token) where T : class, IToken
@@ -47,7 +48,7 @@
}
public long CurrentPosition => inputBytes.CurrentOffset;
private bool hasBytePreRead;
internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
@@ -68,73 +69,86 @@
hasBytePreRead = false;
var currentByte = inputBytes.CurrentByte;
var c = (char) currentByte;
if (IsEmpty(currentByte) || ReadHelper.IsWhitespace(currentByte))
{
isSkippingSymbol = false;
continue;
}
// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol && c != '>')
{
continue;
}
ITokenizer tokenizer = null;
switch (c)
foreach (var customTokenizer in customTokenizers)
{
case '(':
tokenizer = StringTokenizer;
if (currentByte == customTokenizer.firstByte)
{
tokenizer = customTokenizer.tokenizer;
break;
case '<':
var following = inputBytes.Peek();
if (following == '<')
{
isSkippingSymbol = true;
tokenizer = DictionaryTokenizer;
}
else
{
tokenizer = HexTokenizer;
}
break;
case '>' when scope == ScannerScope.Dictionary:
endAngleBracesRead++;
if (endAngleBracesRead == 2)
{
}
}
if (tokenizer == null)
{
if (IsEmpty(currentByte) || ReadHelper.IsWhitespace(currentByte))
{
isSkippingSymbol = false;
continue;
}
// If we failed to read the symbol for whatever reason we pass over it.
if (isSkippingSymbol && c != '>')
{
continue;
}
switch (c)
{
case '(':
tokenizer = StringTokenizer;
break;
case '<':
var following = inputBytes.Peek();
if (following == '<')
{
isSkippingSymbol = true;
tokenizer = DictionaryTokenizer;
}
else
{
tokenizer = HexTokenizer;
}
break;
case '>' when scope == ScannerScope.Dictionary:
endAngleBracesRead++;
if (endAngleBracesRead == 2)
{
return false;
}
break;
case '[':
tokenizer = ArrayTokenizer;
break;
case ']' when scope == ScannerScope.Array:
return false;
}
break;
case '[':
tokenizer = ArrayTokenizer;
break;
case ']' when scope == ScannerScope.Array:
return false;
case '/':
tokenizer = NameTokenizer;
break;
case '%':
tokenizer = CommentTokenizer;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
case '+':
case '.':
tokenizer = NumericTokenizer;
break;
default:
tokenizer = PlainTokenizer;
break;
case '/':
tokenizer = NameTokenizer;
break;
case '%':
tokenizer = CommentTokenizer;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
case '+':
case '.':
tokenizer = NumericTokenizer;
break;
default:
tokenizer = PlainTokenizer;
break;
}
}
if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
@@ -158,6 +172,21 @@
return false;
}
/// <summary>
/// Adds a tokenizer which is selected ahead of the built-in handling whenever the current byte
/// equals <paramref name="firstByte"/>.
/// </summary>
/// <param name="firstByte">The byte which selects the tokenizer.</param>
/// <param name="tokenizer">The tokenizer to use, must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="tokenizer"/> is null.</exception>
public void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer)
{
    // The null check runs before anything is added to the list.
    customTokenizers.Add((firstByte, tokenizer ?? throw new ArgumentNullException(nameof(tokenizer))));
}
/// <summary>
/// Removes every registration of the given tokenizer instance, whichever byte it was registered for.
/// </summary>
/// <param name="tokenizer">The instance to remove, compared by reference.</param>
public void DeregisterCustomTokenizer(ITokenizer tokenizer)
{
    // Walk backwards so removing an entry does not shift the entries still to be checked.
    for (var index = customTokenizers.Count - 1; index >= 0; index--)
    {
        if (ReferenceEquals(customTokenizers[index].tokenizer, tokenizer))
        {
            customTokenizers.RemoveAt(index);
        }
    }
}
private static bool IsEmpty(byte b)
{
return b == ' ' || b == '\r' || b == '\n' || b == 0;

View File

@@ -16,5 +16,9 @@
void Seek(long position);
long CurrentPosition { get; }
void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer);
void DeregisterCustomTokenizer(ITokenizer tokenizer);
}
}

View File

@@ -4,6 +4,8 @@
using Fonts.Parser;
using Logging;
using Parser;
using Parser.FileStructure;
using Parser.FileStructure.UglyToad.Pdf.Parser.Parts.CrossReference;
using Parser.Parts;
using Parser.Parts.CrossReference;
@@ -39,8 +41,8 @@
var objectStreamParser = new ObjectStreamParser(logger, filterProvider, baseParser);
var dynamicParser = new DynamicParser(logger, baseParser, streamParser, objectStreamParser);
var crossReferenceTableParser = new FileCrossReferenceTableParser(logger, dictionaryParser, baseParser, streamParser, crossReferenceParser,
new CrossReferenceTableParser(logger, dictionaryParser, baseParser));
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, streamParser, crossReferenceParser, new CrossReferenceTableParser(),
new OldCrossReferenceTableParser(logger, dictionaryParser, baseParser));
var cmapParser = new CMapParser();
var afmParser = new AdobeFontMetricsParser();