mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-01-18 19:51:24 +08:00
start migrating cross reference parsing process to token scanner
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
{
|
||||
using System;
|
||||
using Exceptions;
|
||||
using Pdf.Parser.FileStructure;
|
||||
using Pdf.Parser.Parts;
|
||||
using Xunit;
|
||||
|
||||
|
||||
@@ -1,77 +1,94 @@
|
||||
namespace UglyToad.Pdf.Tests.Parser.Parts.CrossReference
|
||||
namespace UglyToad.Pdf.Tests.Parser.Parts.FileStructure
|
||||
{
|
||||
using System;
|
||||
using System.Linq;
|
||||
using IO;
|
||||
using Exceptions;
|
||||
using Pdf.Cos;
|
||||
using Pdf.Parser.Parts.CrossReference;
|
||||
using Pdf.Util;
|
||||
using Pdf.Parser.FileStructure;
|
||||
using Pdf.Tokenization.Scanner;
|
||||
using Xunit;
|
||||
|
||||
public class CrossReferenceTableParserTests
|
||||
{
|
||||
private readonly CosObjectPool objectPool = new CosObjectPool();
|
||||
|
||||
private readonly CrossReferenceTableParser parser = new CrossReferenceTableParser(new TestingLog(),
|
||||
new TestDictionaryParser(),
|
||||
new TestBaseParser());
|
||||
private readonly CrossReferenceTableParser parser = new CrossReferenceTableParser();
|
||||
|
||||
[Fact]
|
||||
public void OffsetNotXrefFalse()
|
||||
public void ParseNewDefaultTable()
|
||||
{
|
||||
var input = GetReader("12 0 obj <<>> endobj xref");
|
||||
var input = StringBytesTestConverter.Scanner(@"one xref
|
||||
0 6
|
||||
0000000003 65535 f
|
||||
0000000090 00000 n
|
||||
0000000081 00000 n
|
||||
0000000000 00007 f
|
||||
0000000331 00000 n
|
||||
0000000409 00000 n
|
||||
|
||||
var result = parser.TryParse(input, 4, false, objectPool, out var _);
|
||||
trailer
|
||||
<< >>");
|
||||
|
||||
Assert.False(result);
|
||||
var result = parser.Parse(input, 4, false);
|
||||
|
||||
Assert.Equal(4, result.ObjectOffsets.Count);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void OffsetXButNotXrefFalse()
|
||||
public void OffsetNotXrefThrows()
|
||||
{
|
||||
var input = GetReader("12 0 obj <<>> endobj xref");
|
||||
|
||||
Action action = () => parser.Parse(input, 4, false);
|
||||
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void OffsetXButNotXrefThrows()
|
||||
{
|
||||
var input = GetReader(@"xtable
|
||||
trailer");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var _);
|
||||
Action action = () => parser.Parse(input, 0, false);
|
||||
|
||||
Assert.False(result);
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void EmptyTableFalse()
|
||||
public void EmptyTableReturnsEmpty()
|
||||
{
|
||||
var input = GetReader(@"xref
|
||||
trailer");
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var _);
|
||||
var result = parser.Parse(input, 0, false);
|
||||
|
||||
Assert.False(result);
|
||||
Assert.Empty(result.ObjectOffsets);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void InvalidSubsectionDefinitionLenientTrue()
|
||||
public void InvalidSubsectionDefinitionLenientSkips()
|
||||
{
|
||||
var input = GetReader(@"xref
|
||||
ab 12
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, true, objectPool, out var _);
|
||||
var result = parser.Parse(input, 0, true);
|
||||
|
||||
Assert.True(result);
|
||||
Assert.Empty(result.ObjectOffsets);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void InvalidSubsectionDefinitionNotLenientFalse()
|
||||
public void InvalidSubsectionDefinitionNotLenientThrows()
|
||||
{
|
||||
var input = GetReader(@"xref
|
||||
ab 12
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var _);
|
||||
Action action = () => parser.Parse(input, 0, false);
|
||||
|
||||
Assert.False(result);
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -83,15 +100,11 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var table);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
var built = table.AsCrossReferenceTablePart();
|
||||
|
||||
Assert.Empty(built.ObjectOffsets);
|
||||
Assert.Equal(0, built.Offset);
|
||||
Assert.Equal(CrossReferenceType.Table, built.Type);
|
||||
var result = parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Empty(result.ObjectOffsets);
|
||||
Assert.Equal(0, result.Offset);
|
||||
Assert.Equal(CrossReferenceType.Table, result.Type);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -105,15 +118,11 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var table);
|
||||
var result = parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Equal(2, result.ObjectOffsets.Count);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
var built = table.AsCrossReferenceTablePart();
|
||||
|
||||
Assert.Equal(2, built.ObjectOffsets.Count);
|
||||
|
||||
var results = built.ObjectOffsets.Select(x => new {x.Key.Number, x.Key.Generation, x.Value}).ToList();
|
||||
var results = result.ObjectOffsets.Select(x => new {x.Key.Number, x.Key.Generation, x.Value}).ToList();
|
||||
|
||||
Assert.Equal(100, results[0].Value);
|
||||
Assert.Equal(1, results[0].Number);
|
||||
@@ -134,15 +143,11 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var table);
|
||||
var result = parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Equal(2, result.ObjectOffsets.Count);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
var built = table.AsCrossReferenceTablePart();
|
||||
|
||||
Assert.Equal(2, built.ObjectOffsets.Count);
|
||||
|
||||
var results = built.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
|
||||
var results = result.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
|
||||
|
||||
Assert.Equal(190, results[0].Value);
|
||||
Assert.Equal(15, results[0].Number);
|
||||
@@ -164,15 +169,11 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var table);
|
||||
var result = parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Equal(2, result.ObjectOffsets.Count);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
var built = table.AsCrossReferenceTablePart();
|
||||
|
||||
Assert.Equal(2, built.ObjectOffsets.Count);
|
||||
|
||||
var results = built.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
|
||||
var results = result.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
|
||||
|
||||
Assert.Equal(190, results[0].Value);
|
||||
Assert.Equal(15, results[0].Number);
|
||||
@@ -198,15 +199,11 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var table);
|
||||
var result = parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Equal(5, result.ObjectOffsets.Count);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
var built = table.AsCrossReferenceTablePart();
|
||||
|
||||
Assert.Equal(5, built.ObjectOffsets.Count);
|
||||
|
||||
var results = built.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
|
||||
var results = result.ObjectOffsets.Select(x => new { x.Key.Number, x.Key.Generation, x.Value }).ToList();
|
||||
|
||||
Assert.Equal(100, results[0].Value);
|
||||
Assert.Equal(1, results[0].Number);
|
||||
@@ -239,9 +236,9 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
Action action = () => parser.TryParse(input, 0, false, objectPool, out var _);
|
||||
Action action = () => parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Throws<InvalidOperationException>(action);
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -254,24 +251,24 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
Action action = () => parser.TryParse(input, 0, false, objectPool, out var _);
|
||||
Action action = () => parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Throws<InvalidOperationException>(action);
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ShortLineInTableReturnsFalse()
|
||||
public void ShortLineInTableReturnsThrows()
|
||||
{
|
||||
var input = GetReader(@"xref
|
||||
15 2
|
||||
000000019000000 n
|
||||
019 n
|
||||
0000000250 00032 n
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var _);
|
||||
Action action = () => parser.Parse(input, 0, false);
|
||||
|
||||
Assert.False(result);
|
||||
Assert.Throws<PdfDocumentFormatException>(action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -285,16 +282,14 @@ trailer
|
||||
trailer
|
||||
<<>>");
|
||||
|
||||
var result = parser.TryParse(input, 0, false, objectPool, out var table);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
Assert.Equal(2, table.AsCrossReferenceTablePart().ObjectOffsets.Count);
|
||||
var result = parser.Parse(input, 0, false);
|
||||
|
||||
Assert.Equal(2, result.ObjectOffsets.Count);
|
||||
}
|
||||
|
||||
private static IRandomAccessRead GetReader(string input)
|
||||
private static CoreTokenScanner GetReader(string input)
|
||||
{
|
||||
return new RandomAccessBuffer(OtherEncodings.StringAsLatin1Bytes(input));
|
||||
return StringBytesTestConverter.Scanner(input);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2,7 +2,7 @@
|
||||
{
|
||||
using System;
|
||||
using Exceptions;
|
||||
using Pdf.Parser.Parts;
|
||||
using Pdf.Parser.FileStructure;
|
||||
using Pdf.Tokenization.Scanner;
|
||||
using Xunit;
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts
|
||||
namespace UglyToad.Pdf.Content
|
||||
{
|
||||
internal class HeaderVersion
|
||||
{
|
||||
@@ -5,6 +5,7 @@
|
||||
using System.Collections;
|
||||
using System.Text;
|
||||
using Cos;
|
||||
using Tokenization.Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
internal class PdfDictionary : CosBase, IReadOnlyDictionary<CosName, CosBase>
|
||||
@@ -105,5 +106,15 @@
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
#endregion
|
||||
|
||||
/// <summary>
/// Creates a <see cref="PdfDictionary"/> for the supplied <see cref="DictionaryToken"/>.
/// </summary>
/// <param name="token">The dictionary token to convert, must not be null.</param>
/// <returns>A newly constructed <see cref="PdfDictionary"/>.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="token"/> is null.</exception>
internal static PdfDictionary FromDictionaryToken(DictionaryToken token)
{
    // NOTE(review): nothing is read from the token yet, the result is always a
    // default-constructed dictionary regardless of what the token holds.
    return token != null
        ? new PdfDictionary()
        : throw new ArgumentNullException(nameof(token));
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
public PdfDictionary Dictionary { get; set; }
|
||||
|
||||
public CrossReferenceType XRefType { get; set; }
|
||||
|
||||
|
||||
public void Add(long objectId, int generationNumber, long offset)
|
||||
{
|
||||
CosObjectKey objKey = new CosObjectKey(objectId, generationNumber);
|
||||
@@ -25,7 +25,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
public CrossReferenceTablePart AsCrossReferenceTablePart()
|
||||
public CrossReferenceTablePart Build()
|
||||
{
|
||||
return new CrossReferenceTablePart(objects, Offset, Previous, Dictionary, XRefType);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
using Parts.CrossReference;
|
||||
|
||||
/// <summary>
/// Thin wrapper around <see cref="XrefOffsetValidator"/> which returns a usable
/// cross reference offset: the corrected one when the wrapped validator supplies it,
/// otherwise the offset that was passed in.
/// </summary>
internal class CrossReferenceOffsetValidator
{
    private readonly XrefOffsetValidator offsetValidator;

    /// <summary>
    /// Creates a new <see cref="CrossReferenceOffsetValidator"/>.
    /// </summary>
    /// <param name="offsetValidator">The validator every check is delegated to.</param>
    public CrossReferenceOffsetValidator(XrefOffsetValidator offsetValidator)
    {
        this.offsetValidator = offsetValidator;
    }

    /// <summary>
    /// Checks the offset using the wrapped validator.
    /// </summary>
    /// <param name="crossReferenceOffset">The offset to check.</param>
    /// <param name="isLenientParsing">Passed through unchanged to the wrapped validator.</param>
    /// <returns>
    /// The value returned by the wrapped validator when it is greater than -1,
    /// otherwise <paramref name="crossReferenceOffset"/> unchanged.
    /// </returns>
    public long Validate(long crossReferenceOffset, bool isLenientParsing)
    {
        long corrected = offsetValidator.CheckXRefOffset(crossReferenceOffset, isLenientParsing);

        // Values of -1 or below are treated as "no replacement offset available".
        return corrected > -1 ? corrected : crossReferenceOffset;
    }
}
|
||||
}
|
||||
@@ -1,14 +1,20 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts.CrossReference
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
using Parts.CrossReference;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
using UglyToad.Pdf.Parser.Parts.CrossReference;
|
||||
|
||||
internal class FileCrossReferenceTableParser
|
||||
internal class CrossReferenceParser
|
||||
{
|
||||
private const int X = 'x';
|
||||
|
||||
@@ -18,11 +24,13 @@
|
||||
private readonly CosStreamParser streamParser;
|
||||
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
|
||||
private readonly CrossReferenceTableParser crossReferenceTableParser;
|
||||
private readonly OldCrossReferenceTableParser oldCrossReferenceTableParser;
|
||||
|
||||
public FileCrossReferenceTableParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
|
||||
public CrossReferenceParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
|
||||
CosStreamParser streamParser,
|
||||
CrossReferenceStreamParser crossReferenceStreamParser,
|
||||
CrossReferenceTableParser crossReferenceTableParser)
|
||||
CrossReferenceTableParser crossReferenceTableParser,
|
||||
OldCrossReferenceTableParser oldCrossReferenceTableParser)
|
||||
{
|
||||
this.log = log;
|
||||
this.dictionaryParser = dictionaryParser;
|
||||
@@ -30,6 +38,41 @@
|
||||
this.streamParser = streamParser;
|
||||
this.crossReferenceStreamParser = crossReferenceStreamParser;
|
||||
this.crossReferenceTableParser = crossReferenceTableParser;
|
||||
this.oldCrossReferenceTableParser = oldCrossReferenceTableParser;
|
||||
}
|
||||
|
||||
/// <summary>
/// Follows the chain of cross reference tables starting at <paramref name="crossReferenceLocation"/>,
/// moving to the location given by each trailer's Prev entry until none is present.
/// </summary>
/// <param name="crossReferenceLocation">The offset of the first cross reference section to read.</param>
/// <param name="scanner">The scanner used to read tokens, it is repositioned by this method.</param>
/// <param name="isLenientParsing">Passed through to the table parser.</param>
/// <returns>
/// Currently always null: the parsed parts are not combined into a table yet and
/// cross reference streams (a leading numeric token) stop the walk without being parsed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="scanner"/> is null.</exception>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when a location holds neither an xref keyword nor a number, or when the same
/// location is reached twice while following Prev entries.
/// </exception>
public CrossReferenceTable ParseNew(long crossReferenceLocation, ISeekableTokenScanner scanner,
    bool isLenientParsing)
{
    if (scanner == null)
    {
        throw new ArgumentNullException(nameof(scanner));
    }

    var currentLocation = crossReferenceLocation;

    var visitedCrossReferences = new HashSet<long>();

    while (currentLocation >= 0)
    {
        // Add returns false for a location already seen, that means the Prev entries
        // loop back on themselves and following them would never terminate.
        if (!visitedCrossReferences.Add(currentLocation))
        {
            throw new PdfDocumentFormatException($"The cross reference at offset {currentLocation} was reached more than once, the Prev entries form a loop.");
        }

        // Seek to the section currently being followed rather than the first one,
        // otherwise the same table would be parsed again on every iteration.
        scanner.Seek(currentLocation);

        scanner.MoveNext();

        if (scanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
        {
            var table = crossReferenceTableParser.Parse(scanner, currentLocation, isLenientParsing);

            // -1 ends the loop when the trailer has no Prev entry.
            currentLocation = table.Dictionary.GetLongOrDefault(CosName.PREV, -1);
        }
        else if (scanner.CurrentToken is NumericToken)
        {
            // A number here is the object number of a cross reference stream, not handled yet.
            break;
        }
        else
        {
            throw new PdfDocumentFormatException($"The xref object was not a stream or a table, was instead: {scanner.CurrentToken}.");
        }
    }

    return null;
}
|
||||
|
||||
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
|
||||
@@ -62,11 +105,11 @@
|
||||
{
|
||||
// xref table and trailer
|
||||
// use existing parser to parse xref table
|
||||
if (!crossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
|
||||
if (!oldCrossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
|
||||
{
|
||||
throw new InvalidOperationException($"Expected trailer object at position: {reader.GetPosition()}");
|
||||
}
|
||||
|
||||
|
||||
PdfDictionary trailer = tableBuilder.Dictionary;
|
||||
CrossReferenceTablePart streamPart = null;
|
||||
// check for a XRef stream, it may contain some object ids of compressed objects
|
||||
@@ -128,7 +171,7 @@
|
||||
|
||||
tableBuilder.Previous = tableBuilder.Dictionary.GetLongOrDefault(CosName.PREV);
|
||||
|
||||
table.Add(tableBuilder.AsCrossReferenceTablePart());
|
||||
table.Add(tableBuilder.Build());
|
||||
|
||||
if (streamPart != null)
|
||||
{
|
||||
@@ -0,0 +1,220 @@
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using Parts.CrossReference;
|
||||
using Tokenization;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class CrossReferenceTableParser
|
||||
{
|
||||
private const string InUseEntry = "n";
|
||||
private const string FreeEntry = "f";
|
||||
|
||||
/// <summary>
/// Reads a cross reference table and the trailer dictionary following it from the scanner.
/// </summary>
/// <param name="scanner">The scanner to read from, it is moved to <paramref name="offset"/> if not already there.</param>
/// <param name="offset">The position at which the xref keyword is expected.</param>
/// <param name="isLenientParsing">When true malformed lines are skipped rather than causing an exception.</param>
/// <returns>The part built from the in-use entries and the trailer dictionary.</returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the operator at <paramref name="offset"/> is not xref, when the first subsection
/// header is incomplete, when a line is invalid or when no trailer dictionary can be found.
/// </exception>
public CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
{
    var builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    if (scanner.CurrentPosition != offset)
    {
        scanner.Seek(offset);
    }

    scanner.MoveNext();

    if (scanner.CurrentToken is OperatorToken operatorToken)
    {
        if (operatorToken.Data == "xref")
        {
            scanner.MoveNext();
        }
        else
        {
            throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
        }
    }

    if (scanner.CurrentToken is NumericToken firstObjectNumber)
    {
        if (!scanner.TryReadToken(out NumericToken objectCount))
        {
            throw new PdfDocumentFormatException($"Unexpected token following xref and {firstObjectNumber}. We found: {scanner.CurrentToken}.");
        }

        var definition = new TableSubsectionDefinition(firstObjectNumber.Long, objectCount.Int);

        // The table is processed line by line below, so CR and LF are handed to a dedicated
        // tokenizer for the duration of the table (line ends are detected as EndOfLineToken).
        var tokenizer = new CrossReferenceEndOfLineTokenizer();

        scanner.RegisterCustomTokenizer((byte)'\r', tokenizer);
        scanner.RegisterCustomTokenizer((byte)'\n', tokenizer);

        try
        {
            var readingLine = false;
            var tokens = new List<IToken>();
            var count = 0;

            while (scanner.MoveNext())
            {
                if (scanner.CurrentToken is EndOfLineToken)
                {
                    // Ignore line ends which do not terminate any collected tokens.
                    if (!readingLine)
                    {
                        continue;
                    }

                    readingLine = false;

                    count = ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);

                    tokens.Clear();

                    continue;
                }

                if (scanner.CurrentToken is CommentToken)
                {
                    continue;
                }

                var isLineOperator = scanner.CurrentToken is OperatorToken op && (op.Data == FreeEntry || op.Data == InUseEntry);

                // Anything other than a number or an f/n marker ends the table (e.g. the trailer keyword).
                if (!(scanner.CurrentToken is NumericToken) && !isLineOperator)
                {
                    break;
                }

                readingLine = true;
                tokens.Add(scanner.CurrentToken);
            }

            // Handle a final line which was not terminated by a line end.
            if (tokens.Count > 0)
            {
                ProcessTokens(tokens, scanner, builder, isLenientParsing, count, ref definition);
            }
        }
        finally
        {
            // The scanner belongs to the caller, always remove the tokenizer again,
            // even when an invalid line caused an exception above.
            scanner.DeregisterCustomTokenizer(tokenizer);
        }
    }

    builder.Dictionary = ParseTrailer(scanner, isLenientParsing);

    return builder.Build();
}
|
||||
|
||||
/// <summary>
/// Interprets the tokens of a single line of the cross reference table.
/// </summary>
/// <param name="tokens">The numeric and f/n operator tokens collected for the line.</param>
/// <param name="scanner">The scanner, its current position is used to detect offsets pointing into the table.</param>
/// <param name="builder">Receives every in-use entry.</param>
/// <param name="isLenientParsing">When true invalid lines are ignored rather than causing an exception.</param>
/// <param name="objectCount">The number of entries already read for the current subsection.</param>
/// <param name="definition">The current subsection, replaced when the line is a new subsection header.</param>
/// <returns>
/// The number of entries read for the current subsection after this line:
/// 0 after a new subsection header, <paramref name="objectCount"/> + 1 after a free or in-use entry
/// and <paramref name="objectCount"/> unchanged when the line was skipped.
/// </returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the subsection is complete and the line is not a valid subsection header,
/// when an in-use offset points inside the table itself, or, if not lenient, when the line is malformed.
/// </exception>
private static int ProcessTokens(List<IToken> tokens, ISeekableTokenScanner scanner, CrossReferenceTablePartBuilder builder, bool isLenientParsing,
    int objectCount, ref TableSubsectionDefinition definition)
{
    string GetErrorMessage()
    {
        var representation = "Invalid line format in xref table: [" + string.Join(", ", tokens.Select(x => x.ToString())) + "]";

        return representation;
    }

    // Every entry of the subsection has been read, the only valid line is the next subsection header.
    if (objectCount == definition.Count)
    {
        if (tokens.Count == 2
            && tokens[0] is NumericToken newFirstObjectToken
            && tokens[1] is NumericToken newObjectCountToken)
        {
            definition = new TableSubsectionDefinition(newFirstObjectToken.Long, newObjectCountToken.Int);

            return 0;
        }

        // The line may hold any number of tokens here, so the message is built from the
        // whole list rather than indexing the first two entries.
        var found = string.Join(", ", tokens);

        throw new PdfDocumentFormatException($"Found a line with {tokens.Count} unexpected entries in the cross reference table: {found}.");
    }

    // An entry needs at least an offset, a generation and a marker.
    if (tokens.Count <= 2)
    {
        if (!isLenientParsing)
        {
            throw new PdfDocumentFormatException(GetErrorMessage());
        }

        return objectCount;
    }

    var lastToken = tokens[tokens.Count - 1];

    if (lastToken is OperatorToken operatorToken)
    {
        // Free entries count towards the subsection but are not recorded.
        if (operatorToken.Data == FreeEntry)
        {
            return objectCount + 1;
        }

        if (operatorToken.Data != InUseEntry)
        {
            if (!isLenientParsing)
            {
                throw new PdfDocumentFormatException(GetErrorMessage());
            }

            return objectCount;
        }

        if (tokens[0] is NumericToken offset && tokens[1] is NumericToken generationNumber)
        {
            // An object cannot be located between the start of this table and the current read position.
            if (offset.Long >= builder.Offset && offset.Long <= scanner.CurrentPosition)
            {
                throw new PdfDocumentFormatException($"Object offset {offset} is within its own cross-reference table for object {definition.FirstNumber + objectCount}");
            }

            builder.Add(definition.FirstNumber + objectCount, generationNumber.Int, offset.Long);

            return objectCount + 1;
        }
    }
    else
    {
        if (!isLenientParsing)
        {
            throw new PdfDocumentFormatException(GetErrorMessage());
        }
    }

    return objectCount;
}
|
||||
|
||||
/// <summary>
/// Reads the trailer dictionary, expecting the scanner's current token to be the trailer keyword.
/// </summary>
/// <param name="scanner">The scanner positioned at, or before, the trailer keyword.</param>
/// <param name="isLenientParsing">
/// When true and the current token is not the trailer keyword, tokens are skipped
/// until the keyword is found.
/// </param>
/// <returns>The dictionary created from the dictionary token following the trailer keyword.</returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the keyword is not followed by a dictionary or no trailer could be located.
/// </exception>
private static PdfDictionary ParseTrailer(ISeekableTokenScanner scanner, bool isLenientParsing)
{
    var isAtTrailer = scanner.CurrentToken is OperatorToken currentOperator && currentOperator.Data == "trailer";

    if (isAtTrailer)
    {
        if (scanner.TryReadToken(out DictionaryToken dictionaryToken))
        {
            return PdfDictionary.FromDictionaryToken(dictionaryToken);
        }

        throw new PdfDocumentFormatException($"Expected to find a dictionary in the trailer but instead found: {scanner.CurrentToken}.");
    }

    if (isLenientParsing)
    {
        // Skip ahead to the first trailer keyword, only that occurrence is tried.
        while (scanner.MoveNext())
        {
            if (!(scanner.CurrentToken is OperatorToken skippedOperator) || skippedOperator.Data != "trailer")
            {
                continue;
            }

            if (scanner.TryReadToken(out DictionaryToken recoveredDictionary))
            {
                return PdfDictionary.FromDictionaryToken(recoveredDictionary);
            }

            break;
        }
    }

    throw new PdfDocumentFormatException("No trailer dictionary was present.");
}
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,8 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
using System;
|
||||
using System.Text.RegularExpressions;
|
||||
using Content;
|
||||
using Exceptions;
|
||||
using Logging;
|
||||
using Tokenization.Scanner;
|
||||
@@ -1,4 +1,4 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
@@ -0,0 +1,219 @@
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
namespace UglyToad.Pdf.Parser.Parts.CrossReference
|
||||
{
|
||||
using System;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Cos;
|
||||
using global::UglyToad.Pdf.Parser.Parts;
|
||||
using global::UglyToad.Pdf.Parser.Parts.CrossReference;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Util;
|
||||
|
||||
internal class OldCrossReferenceTableParser
|
||||
{
|
||||
private const string InUseEntry = "n";
|
||||
private const string FreeEntry = "f";
|
||||
|
||||
private readonly ILog log;
|
||||
private readonly IDictionaryParser dictionaryParser;
|
||||
private readonly IBaseParser baseParser;
|
||||
|
||||
/// <summary>
/// Creates a new <see cref="OldCrossReferenceTableParser"/>.
/// </summary>
/// <param name="log">Receives warnings about unexpected table content.</param>
/// <param name="dictionaryParser">Used to parse the trailer dictionary.</param>
/// <param name="baseParser">Handed to <paramref name="dictionaryParser"/> when parsing the trailer.</param>
public OldCrossReferenceTableParser(ILog log, IDictionaryParser dictionaryParser, IBaseParser baseParser)
{
    this.baseParser = baseParser;
    this.dictionaryParser = dictionaryParser;
    this.log = log;
}
|
||||
|
||||
/// <summary>
/// Attempts to read a cross reference table and its trailer from the current position of the source.
/// </summary>
/// <param name="source">The reader, expected to be positioned at the xref keyword.</param>
/// <param name="offset">The offset recorded on the resulting builder and used in log messages.</param>
/// <param name="isLenientParsing">When true an invalid subsection header ends the table instead of failing.</param>
/// <param name="pool">Passed through to the dictionary parser for the trailer.</param>
/// <param name="builder">The builder holding the entries and trailer, null when the xref keyword was not found or the table was empty.</param>
/// <returns>
/// False when the source does not start with the xref keyword, when the keyword is directly followed
/// by the trailer, or when a subsection header is invalid and parsing is not lenient; otherwise true.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown for an in-use offset pointing inside the table, an unparseable number, an entry marker
/// which is neither n nor f, or when the trailer could not be read.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
    builder = null;

    var tableStartOffset = source.GetPosition();

    if (source.Peek() != 'x')
    {
        return false;
    }

    var keyword = ReadHelper.ReadString(source);
    if (!keyword.Trim().Equals("xref"))
    {
        return false;
    }

    // Look at whatever follows the keyword, then step back so it is read again below.
    var following = ReadHelper.ReadString(source);
    byte[] followingBytes = OtherEncodings.StringAsLatin1Bytes(following);

    source.Rewind(followingBytes.Length);

    if (following.StartsWith("trailer"))
    {
        log.Warn("skipping empty xref table");
        return false;
    }

    builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    // A table is made of one or more subsections, each introduced by a first object number and a count.
    // Another subsection is only attempted while the next character is a digit.
    do
    {
        if (!TableSubsectionDefinition.TryRead(log, source, out var subsection))
        {
            log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

            if (!isLenientParsing)
            {
                return false;
            }

            break;
        }

        var objectNumber = subsection.FirstNumber;

        ReadHelper.SkipSpaces(source);

        for (var entryIndex = 0; entryIndex < subsection.Count; entryIndex++)
        {
            if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
            {
                break;
            }

            // A 't' marks the start of the trailer keyword.
            if (source.Peek() == 't')
            {
                break;
            }

            var line = ReadHelper.ReadLine(source);
            var parts = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            if (parts.Length < 3)
            {
                log.Warn("invalid xref line: " + line);
                break;
            }

            // The marker is taken from the last part to support the corrupt
            // table reported in PDFBOX-474 (XXXX XXX XX n).
            var marker = parts[parts.Length - 1];

            if (marker.Equals(InUseEntry))
            {
                try
                {
                    var objectOffset = long.Parse(parts[0]);

                    // PDFBOX-3923: an offset pointing inside this table cannot be valid.
                    var isInsideTable = objectOffset >= tableStartOffset && objectOffset <= source.GetPosition();

                    if (isInsideTable)
                    {
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {objectNumber}");
                    }

                    var generation = int.Parse(parts[1]);

                    builder.Add(objectNumber, generation, objectOffset);
                }
                catch (FormatException e)
                {
                    throw new InvalidOperationException("Bad", e);
                }
            }
            else if (!parts[2].Equals(FreeEntry))
            {
                throw new InvalidOperationException(
                    $"Corrupt cross-reference table entry for object {objectNumber}. The indicator was not 'n' or 'f' but {parts[2]}.");
            }

            objectNumber++;

            ReadHelper.SkipSpaces(source);
        }

        ReadHelper.SkipSpaces(source);
    }
    while (ReadHelper.IsDigit(source));

    if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
    {
        throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
    }

    builder.Dictionary = trailer;
    builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

    return true;
}
|
||||
|
||||
/// <summary>
/// Attempts to read the trailer keyword and the dictionary following it from the source.
/// </summary>
/// <param name="source">The reader positioned after the last cross reference entry.</param>
/// <param name="isLenientParsing">When true lines starting with a digit are skipped before looking for the keyword.</param>
/// <param name="pool">Passed through to the dictionary parser.</param>
/// <param name="trailer">The parsed trailer dictionary, null when the keyword was not found.</param>
/// <returns>True when the trailer keyword was found and the dictionary parser was run, otherwise false.</returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
    trailer = null;

    var trailerOffset = source.GetPosition();

    // PDFBOX-1739: skip extra xref entries in RegisSTAR documents.
    if (isLenientParsing)
    {
        for (int next = source.Peek(); next != 't' && ReadHelper.IsDigit(next); next = source.Peek())
        {
            if (source.GetPosition() == trailerOffset)
            {
                // Only true for the first skipped line; a warning was intended here but none is logged.
            }

            ReadHelper.ReadLine(source);
        }
    }

    if (source.Peek() != 't')
    {
        return false;
    }

    // Read the line holding the "trailer" keyword.
    long keywordOffset = source.GetPosition();
    string keywordLine = ReadHelper.ReadLine(source);

    if (!keywordLine.Trim().Equals("trailer"))
    {
        // Anything not even starting with the keyword is not a trailer.
        if (!keywordLine.StartsWith("trailer"))
        {
            return false;
        }

        // Some files omit the line end so the keyword is directly followed by "<<" or a blank.
        // This does not comply with the PDF reference but is supported to read as many files as possible.
        // The end of line may be 1 or 2 bytes so instead of unreading, jump to just after the keyword.
        source.Seek(keywordOffset + "trailer".Length);
    }

    // The keyword may also be followed by " <<" on the same line, skip the blanks before the dictionary.
    ReadHelper.SkipSpaces(source);

    trailer = dictionaryParser.Parse(source, baseParser, pool);

    ReadHelper.SkipSpaces(source);

    return true;
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,10 +1,11 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts.CrossReference
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Parts;
|
||||
using Util;
|
||||
|
||||
internal class XrefCosOffsetChecker
|
||||
@@ -1,4 +1,4 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts.CrossReference
|
||||
namespace UglyToad.Pdf.Parser.FileStructure
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
@@ -6,6 +6,7 @@
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
|
||||
internal class XrefOffsetValidator
|
||||
{
|
||||
@@ -139,7 +139,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
return builder.AsCrossReferenceTablePart();
|
||||
return builder.Build();
|
||||
}
|
||||
|
||||
private static List<long> GetObjectNumbers(PdfRawStream stream)
|
||||
|
||||
@@ -1,213 +0,0 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts.CrossReference
|
||||
{
|
||||
using System;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Util;
|
||||
|
||||
/// <summary>
/// Reads a classic (non-stream) cross-reference table, i.e. the section introduced by the
/// <c>xref</c> keyword, followed by its <c>trailer</c> dictionary.
/// </summary>
internal class CrossReferenceTableParser
{
    // Entry type indicators which terminate each line of the table.
    private const string InUseEntry = "n";
    private const string FreeEntry = "f";

    private readonly ILog log;
    private readonly IDictionaryParser dictionaryParser;
    private readonly IBaseParser baseParser;

    /// <summary>
    /// Creates a parser which reports recoverable problems to <paramref name="log"/> and reads the trailer
    /// dictionary using <paramref name="dictionaryParser"/> and <paramref name="baseParser"/>.
    /// </summary>
    public CrossReferenceTableParser(ILog log, IDictionaryParser dictionaryParser, IBaseParser baseParser)
    {
        this.log = log;
        this.dictionaryParser = dictionaryParser;
        this.baseParser = baseParser;
    }

    /// <summary>
    /// Attempts to read a cross-reference table and its trailer starting at the current position of <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The input, expected to be positioned at the <c>xref</c> keyword.</param>
    /// <param name="offset">The offset recorded on the resulting builder and used in log/exception messages.</param>
    /// <param name="isLenientParsing">
    /// If <see langword="true"/> an unreadable subsection header ends the table instead of failing the parse and
    /// stray numeric lines in front of the trailer are skipped.
    /// </param>
    /// <param name="pool">Passed through to the dictionary parser when the trailer is read.</param>
    /// <param name="builder">
    /// The populated builder on success. It is <see langword="null"/> when the method returns <see langword="false"/>
    /// before any entries were read; it may hold a partial table if a subsection header fails in non-lenient mode.
    /// </param>
    /// <returns>
    /// <see langword="false"/> if the input does not start with <c>xref</c>, if the table is empty (the trailer follows
    /// immediately) or if a subsection header cannot be read in non-lenient mode; otherwise <see langword="true"/>.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// An in-use entry points inside the table itself, an entry has a malformed or out-of-range number,
    /// an entry has an indicator other than 'n' or 'f', or the trailer could not be read.
    /// </exception>
    public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
    {
        builder = null;

        var tableStartOffset = source.GetPosition();

        if (source.Peek() != 'x')
        {
            return false;
        }

        var xref = ReadHelper.ReadString(source);
        if (!xref.Trim().Equals("xref"))
        {
            return false;
        }

        // Look ahead at the next string to detect an empty table, then rewind by its byte length
        // so the subsection reader below sees it again.
        var str = ReadHelper.ReadString(source);
        byte[] b = OtherEncodings.StringAsLatin1Bytes(str);

        source.Rewind(b.Length);

        // Ordinal: this is a file-format keyword, not linguistic text.
        if (str.StartsWith("trailer", StringComparison.Ordinal))
        {
            log.Warn("skipping empty xref table");
            return false;
        }

        builder = new CrossReferenceTablePartBuilder
        {
            Offset = offset,
            XRefType = CrossReferenceType.Table
        };

        // Tables can have multiple sections. Each starts with a starting object id and a count.
        while (true)
        {
            if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
            {
                log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

                if (isLenientParsing)
                {
                    break;
                }

                return false;
            }

            var currentObjectId = subsectionDefinition.FirstNumber;

            ReadHelper.SkipSpaces(source);
            for (var i = 0; i < subsectionDefinition.Count; i++)
            {
                if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
                {
                    break;
                }

                // A 't' here is taken to be the start of the trailer: the subsection declared more entries than it has.
                if (source.Peek() == 't')
                {
                    break;
                }

                // Each entry is "<offset> <generation> <n|f>", split on spaces.
                var currentLine = ReadHelper.ReadLine(source);
                var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (splitString.Length < 3)
                {
                    log.Warn("invalid xref line: " + currentLine);
                    break;
                }

                // The indicator is read from the final column to support the corrupt
                // table layout reported in PDFBOX-474 (XXXX XXX XX n).
                var entryType = splitString[splitString.Length - 1];

                if (entryType.Equals(InUseEntry))
                {
                    try
                    {
                        var objectOffset = long.Parse(splitString[0]);

                        if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                        {
                            // PDFBOX-3923: offset points inside this table - that can't be good
                            throw new InvalidOperationException(
                                $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                        }

                        var generation = int.Parse(splitString[1]);
                        builder.Add(currentObjectId, generation, objectOffset);
                    }
                    catch (Exception e) when (e is FormatException || e is OverflowException)
                    {
                        // Numbers too large for long/int are just as corrupt as non-numeric text.
                        throw new InvalidOperationException("Bad", e);
                    }
                }
                else if (!entryType.Equals(FreeEntry) && !splitString[2].Equals(FreeEntry))
                {
                    // Free entries are accepted with the indicator in the final column (consistent with the
                    // in-use check above) as well as in the third column, which was the only position checked before.
                    throw new InvalidOperationException(
                        $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {entryType}.");
                }

                currentObjectId++;

                ReadHelper.SkipSpaces(source);
            }

            // Another subsection header starts with a digit; anything else ends the table.
            ReadHelper.SkipSpaces(source);
            if (!ReadHelper.IsDigit(source))
            {
                break;
            }
        }

        if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
        {
            throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
        }

        builder.Dictionary = trailer;
        builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

        return true;
    }

    /// <summary>
    /// Reads the <c>trailer</c> keyword and the dictionary which follows it from the current position of <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The input, expected to be positioned at the <c>trailer</c> keyword.</param>
    /// <param name="isLenientParsing">If <see langword="true"/> lines starting with a digit in front of the keyword are skipped.</param>
    /// <param name="pool">Passed through to the dictionary parser.</param>
    /// <param name="trailer">The parsed trailer dictionary on success, otherwise <see langword="null"/>.</param>
    /// <returns><see langword="true"/> if the keyword was found and the dictionary parser was invoked.</returns>
    private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
    {
        trailer = null;

        // parse the last trailer.
        var trailerOffset = source.GetPosition();

        // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
        if (isLenientParsing)
        {
            int nextCharacter = source.Peek();
            while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
            {
                if (source.GetPosition() == trailerOffset)
                {
                    // warn only the first time
                    log.Warn("Expected trailer object at position " + trailerOffset + ", keep trying");
                }

                ReadHelper.ReadLine(source);
                nextCharacter = source.Peek();
            }
        }

        if (source.Peek() != 't')
        {
            return false;
        }

        // read "trailer"
        long currentOffset = source.GetPosition();
        string nextLine = ReadHelper.ReadLine(source);
        if (!nextLine.Trim().Equals("trailer"))
        {
            // in some cases the EOL is missing and the trailer immediately
            // continues with "<<" or with a blank character
            // even if this does not comply with PDF reference we want to support as many PDFs as possible
            // Acrobat reader can also deal with this.
            if (!nextLine.StartsWith("trailer", StringComparison.Ordinal))
            {
                return false;
            }

            // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes,
            // so jump back to right after "trailer" using the absolute offset instead.
            source.Seek(currentOffset + "trailer".Length);
        }

        // in some cases the EOL is missing and the trailer continues with " <<"
        // even if this does not comply with PDF reference we want to support as many PDFs as possible
        // Acrobat reader can also deal with this.
        ReadHelper.SkipSpaces(source);

        trailer = dictionaryParser.Parse(source, baseParser, pool);

        ReadHelper.SkipSpaces(source);

        return true;
    }
}
|
||||
}
|
||||
@@ -5,6 +5,7 @@
|
||||
using Content;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using FileStructure;
|
||||
using Filters;
|
||||
using Fonts;
|
||||
using Fonts.Parser;
|
||||
@@ -57,10 +58,18 @@
|
||||
var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
|
||||
|
||||
var pool = new CosObjectPool();
|
||||
|
||||
// TODO: make this use the scanner.
|
||||
var validator = new CrossReferenceOffsetValidator(new XrefOffsetValidator(log, reader, container.Get<CosDictionaryParser>(),
|
||||
container.Get<CosBaseParser>(), pool));
|
||||
|
||||
crossReferenceOffset = validator.Validate(crossReferenceOffset, isLenientParsing);
|
||||
|
||||
var crossReferenceTable = container.Get<FileCrossReferenceTableParser>()
|
||||
var crossReferenceTable = container.Get<CrossReferenceParser>()
|
||||
.Parse(reader, isLenientParsing, crossReferenceOffset, pool);
|
||||
|
||||
container.Get<CrossReferenceParser>().ParseNew(crossReferenceOffset, scanner, isLenientParsing);
|
||||
|
||||
var filterProvider = container.Get<IFilterProvider>();
|
||||
var bruteForceSearcher = new BruteForceSearcher(reader);
|
||||
var pdfObjectParser = new PdfObjectParser(container.Get<ILog>(), container.Get<CosBaseParser>(),
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Parser.FileStructure;
|
||||
using Parser.Parts;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
namespace UglyToad.Pdf.Tokenization
|
||||
{
|
||||
using IO;
|
||||
using Tokens;
|
||||
|
||||
/// <summary>
/// A tokenizer which turns a carriage return or line feed byte into an <see cref="EndOfLineToken"/>.
/// </summary>
internal class CrossReferenceEndOfLineTokenizer : ITokenizer
{
    /// <summary>
    /// Always <see langword="false"/>; this tokenizer only inspects the byte it is handed.
    /// </summary>
    public bool ReadsNextByte => false;

    /// <summary>
    /// Produces the shared end-of-line token when <paramref name="currentByte"/> is '\r' or '\n'.
    /// </summary>
    /// <param name="currentByte">The byte under inspection.</param>
    /// <param name="inputBytes">Not used by this tokenizer.</param>
    /// <param name="token">The end-of-line token on success, otherwise <see langword="null"/>.</param>
    /// <returns><see langword="true"/> if the byte was a carriage return or line feed.</returns>
    public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token)
    {
        var isLineBreak = currentByte == '\r' || currentByte == '\n';

        token = isLineBreak ? EndOfLineToken.Token : null;

        return isLineBreak;
    }
}
|
||||
|
||||
/// <summary>
/// Marker token for a line break. It carries no data so a single shared instance is used.
/// </summary>
internal class EndOfLineToken : IToken
{
    private static readonly EndOfLineToken Instance = new EndOfLineToken();

    // Private so that the shared instance is the only one created outside this class.
    private EndOfLineToken()
    {
    }

    /// <summary>
    /// The shared end-of-line token instance.
    /// </summary>
    public static EndOfLineToken Token => Instance;
}
|
||||
}
|
||||
@@ -21,6 +21,7 @@
|
||||
private readonly ScannerScope scope;
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly List<byte> currentBuffer = new List<byte>();
|
||||
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
||||
|
||||
public IToken CurrentToken { get; private set; }
|
||||
public bool TryReadToken<T>(out T token) where T : class, IToken
|
||||
@@ -47,7 +48,7 @@
|
||||
}
|
||||
|
||||
public long CurrentPosition => inputBytes.CurrentOffset;
|
||||
|
||||
|
||||
private bool hasBytePreRead;
|
||||
|
||||
internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)
|
||||
@@ -68,73 +69,86 @@
|
||||
hasBytePreRead = false;
|
||||
var currentByte = inputBytes.CurrentByte;
|
||||
var c = (char) currentByte;
|
||||
|
||||
if (IsEmpty(currentByte) || ReadHelper.IsWhitespace(currentByte))
|
||||
{
|
||||
isSkippingSymbol = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we failed to read the symbol for whatever reason we pass over it.
|
||||
if (isSkippingSymbol && c != '>')
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
ITokenizer tokenizer = null;
|
||||
switch (c)
|
||||
foreach (var customTokenizer in customTokenizers)
|
||||
{
|
||||
case '(':
|
||||
tokenizer = StringTokenizer;
|
||||
if (currentByte == customTokenizer.firstByte)
|
||||
{
|
||||
tokenizer = customTokenizer.tokenizer;
|
||||
break;
|
||||
case '<':
|
||||
var following = inputBytes.Peek();
|
||||
if (following == '<')
|
||||
{
|
||||
isSkippingSymbol = true;
|
||||
tokenizer = DictionaryTokenizer;
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenizer = HexTokenizer;
|
||||
}
|
||||
break;
|
||||
case '>' when scope == ScannerScope.Dictionary:
|
||||
endAngleBracesRead++;
|
||||
if (endAngleBracesRead == 2)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
if (tokenizer == null)
|
||||
{
|
||||
if (IsEmpty(currentByte) || ReadHelper.IsWhitespace(currentByte))
|
||||
{
|
||||
isSkippingSymbol = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// If we failed to read the symbol for whatever reason we pass over it.
|
||||
if (isSkippingSymbol && c != '>')
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case '(':
|
||||
tokenizer = StringTokenizer;
|
||||
break;
|
||||
case '<':
|
||||
var following = inputBytes.Peek();
|
||||
if (following == '<')
|
||||
{
|
||||
isSkippingSymbol = true;
|
||||
tokenizer = DictionaryTokenizer;
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenizer = HexTokenizer;
|
||||
}
|
||||
break;
|
||||
case '>' when scope == ScannerScope.Dictionary:
|
||||
endAngleBracesRead++;
|
||||
if (endAngleBracesRead == 2)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case '[':
|
||||
tokenizer = ArrayTokenizer;
|
||||
break;
|
||||
case ']' when scope == ScannerScope.Array:
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case '[':
|
||||
tokenizer = ArrayTokenizer;
|
||||
break;
|
||||
case ']' when scope == ScannerScope.Array:
|
||||
return false;
|
||||
case '/':
|
||||
tokenizer = NameTokenizer;
|
||||
break;
|
||||
case '%':
|
||||
tokenizer = CommentTokenizer;
|
||||
break;
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9':
|
||||
case '-':
|
||||
case '+':
|
||||
case '.':
|
||||
tokenizer = NumericTokenizer;
|
||||
break;
|
||||
default:
|
||||
tokenizer = PlainTokenizer;
|
||||
break;
|
||||
case '/':
|
||||
tokenizer = NameTokenizer;
|
||||
break;
|
||||
case '%':
|
||||
tokenizer = CommentTokenizer;
|
||||
break;
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9':
|
||||
case '-':
|
||||
case '+':
|
||||
case '.':
|
||||
tokenizer = NumericTokenizer;
|
||||
break;
|
||||
default:
|
||||
tokenizer = PlainTokenizer;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
|
||||
@@ -158,6 +172,21 @@
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
/// Adds a tokenizer to the list of custom tokenizers, paired with the byte value it is registered for.
/// </summary>
/// <param name="firstByte">The byte value stored alongside the tokenizer.</param>
/// <param name="tokenizer">The tokenizer to add.</param>
/// <exception cref="ArgumentNullException"><paramref name="tokenizer"/> is <see langword="null"/>.</exception>
public void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer)
{
    // The null check runs before Add is invoked, so nothing is stored for a null tokenizer.
    customTokenizers.Add((firstByte, tokenizer ?? throw new ArgumentNullException(nameof(tokenizer))));
}
|
||||
|
||||
/// <summary>
/// Removes every registration of the given tokenizer instance, whichever byte it was registered for.
/// </summary>
/// <param name="tokenizer">The tokenizer to remove; matched by reference, not by equality.</param>
public void DeregisterCustomTokenizer(ITokenizer tokenizer)
{
    // Walk backwards so removing an entry does not shift the indices still to be visited.
    for (var index = customTokenizers.Count - 1; index >= 0; index--)
    {
        if (ReferenceEquals(customTokenizers[index].tokenizer, tokenizer))
        {
            customTokenizers.RemoveAt(index);
        }
    }
}
|
||||
|
||||
private static bool IsEmpty(byte b)
|
||||
{
|
||||
return b == ' ' || b == '\r' || b == '\n' || b == 0;
|
||||
|
||||
@@ -16,5 +16,9 @@
|
||||
void Seek(long position);
|
||||
|
||||
long CurrentPosition { get; }
|
||||
|
||||
/// <summary>
/// Registers an additional tokenizer together with the byte value it applies to.
/// </summary>
/// <param name="firstByte">The byte value the tokenizer is registered for.</param>
/// <param name="tokenizer">The tokenizer to register.</param>
// NOTE(review): the scanner implementation in this change tries custom tokenizers before its built-in
// handling (including whitespace skipping) when the current byte equals firstByte - confirm other implementations match.
void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer);
|
||||
|
||||
/// <summary>
/// Removes a tokenizer previously passed to <c>RegisterCustomTokenizer</c>.
/// </summary>
/// <param name="tokenizer">The tokenizer to remove.</param>
// NOTE(review): the scanner implementation in this change matches by reference and removes all
// registrations of the instance - confirm this is the intended contract for every implementation.
void DeregisterCustomTokenizer(ITokenizer tokenizer);
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,8 @@
|
||||
using Fonts.Parser;
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Parser.FileStructure;
|
||||
using Parser.FileStructure.UglyToad.Pdf.Parser.Parts.CrossReference;
|
||||
using Parser.Parts;
|
||||
using Parser.Parts.CrossReference;
|
||||
|
||||
@@ -39,8 +41,8 @@
|
||||
var objectStreamParser = new ObjectStreamParser(logger, filterProvider, baseParser);
|
||||
var dynamicParser = new DynamicParser(logger, baseParser, streamParser, objectStreamParser);
|
||||
|
||||
var crossReferenceTableParser = new FileCrossReferenceTableParser(logger, dictionaryParser, baseParser, streamParser, crossReferenceParser,
|
||||
new CrossReferenceTableParser(logger, dictionaryParser, baseParser));
|
||||
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, streamParser, crossReferenceParser, new CrossReferenceTableParser(),
|
||||
new OldCrossReferenceTableParser(logger, dictionaryParser, baseParser));
|
||||
|
||||
var cmapParser = new CMapParser();
|
||||
var afmParser = new AdobeFontMetricsParser();
|
||||
|
||||
Reference in New Issue
Block a user