create a pdf object scanner which sits on top of the core token scanner to provide complete object parsing

This commit is contained in:
Eliot Jones
2018-01-13 22:30:15 +00:00
parent 4b0af707d1
commit 8dcea9b37f
8 changed files with 367 additions and 1 deletions

View File

@@ -2,6 +2,7 @@
{
using System;
using System.Collections.Generic;
using Exceptions;
using IO;
using Parser.Parts;
using Tokens;
@@ -22,6 +23,7 @@
private readonly List<byte> currentBuffer = new List<byte>();
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
internal long CurrentTokenStart { get; private set; }
public IToken CurrentToken { get; private set; }
public bool TryReadToken<T>(out T token) where T : class, IToken
{
@@ -150,6 +152,8 @@
}
}
CurrentTokenStart = inputBytes.CurrentOffset - 1;
if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
{
isSkippingSymbol = true;

View File

@@ -0,0 +1,120 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using ContentStream;
using Cos;
using Exceptions;
using IO;
using Tokens;
/// <summary>
/// A token scanner layered on top of a <see cref="CoreTokenScanner"/> which reads whole
/// indirect objects ("objectNumber generation obj ... endobj") rather than individual tokens.
/// </summary>
internal class PdfTokenScanner : ISeekableTokenScanner
{
    // NOTE(review): inputBytes, crossReferenceTable and objectOffsets are stored but not read
    // anywhere in this class yet - presumably reserved for stream reading / object lookup.
    private readonly IInputBytes inputBytes;
    private readonly CrossReferenceTable crossReferenceTable;
    private readonly CoreTokenScanner coreTokenScanner;

    // Sliding window over the two most recent non-comment tokens and their start offsets.
    // Index 0 is the older entry, index 1 the most recent one.
    private readonly long[] previousTokenPositions = new long[2];
    private readonly IToken[] previousTokens = new IToken[2];

    private readonly Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();

    /// <summary>
    /// The object produced by the last successful call to <see cref="MoveNext"/>.
    /// </summary>
    public IToken CurrentToken { get; private set; }

    /// <summary>
    /// The current position reported by the underlying core scanner.
    /// </summary>
    public long CurrentPosition => coreTokenScanner.CurrentPosition;

    /// <summary>
    /// Create a new <see cref="PdfTokenScanner"/> reading from the provided bytes.
    /// </summary>
    /// <param name="inputBytes">The bytes to scan; also used to create the underlying core scanner.</param>
    /// <param name="crossReferenceTable">The cross reference table for the document (stored, not yet used here).</param>
    public PdfTokenScanner(IInputBytes inputBytes, CrossReferenceTable crossReferenceTable)
    {
        this.inputBytes = inputBytes;
        this.crossReferenceTable = crossReferenceTable;
        coreTokenScanner = new CoreTokenScanner(inputBytes);
    }

    /// <summary>
    /// Advance to the next complete indirect object and expose it through <see cref="CurrentToken"/>.
    /// </summary>
    /// <returns>
    /// <see langword="true"/> if an object delimited by the start object and end object operators was read;
    /// <see langword="false"/> if the input ended before a start object operator (preceded by at least two tokens)
    /// or before the matching end object operator was found.
    /// </returns>
    /// <exception cref="PdfDocumentFormatException">
    /// The start object operator was not preceded by two numeric tokens, or the object contained no tokens.
    /// </exception>
    public bool MoveNext()
    {
        var tokensRead = 0;
        var foundStartObject = false;

        // Skip forward to the next "obj" operator, remembering the two tokens in front of it.
        while (coreTokenScanner.MoveNext())
        {
            var token = coreTokenScanner.CurrentToken;

            if (token == OperatorToken.StartObject)
            {
                foundStartObject = true;
                break;
            }

            if (token is CommentToken)
            {
                continue;
            }

            tokensRead++;
            RememberToken(token);
        }

        // Running out of input without meeting "obj" is the normal end of scanning, not a format error,
        // so the trailing tokens must not be interpreted as an object number and generation.
        if (!foundStartObject || tokensRead < 2)
        {
            return false;
        }

        var startPosition = previousTokenPositions[0];
        var objectNumber = previousTokens[0] as NumericToken;
        var generation = previousTokens[1] as NumericToken;

        if (objectNumber == null || generation == null)
        {
            throw new PdfDocumentFormatException("The obj operator (start object) was not preceded by 2 numbers. " +
                                                 $"Instead got: {previousTokens[0]} {previousTokens[1]} obj");
        }

        var data = new List<IToken>();
        var foundEndObject = false;

        // Collect every non-comment token up to the matching "endobj" operator.
        while (coreTokenScanner.MoveNext())
        {
            var token = coreTokenScanner.CurrentToken;

            if (token == OperatorToken.EndObject)
            {
                foundEndObject = true;
                break;
            }

            if (token is CommentToken)
            {
                continue;
            }

            if (token == OperatorToken.StartStream)
            {
                // TODO: read the stream content. For now the stream operator is collected like any other token.
            }

            data.Add(token);
            RememberToken(token);
        }

        if (!foundEndObject)
        {
            return false;
        }

        if (data.Count == 0)
        {
            // Previously this surfaced as an ArgumentOutOfRangeException from the list indexer.
            throw new PdfDocumentFormatException($"The object {objectNumber} {generation} obj contained no tokens before the endobj operator.");
        }

        // The object's data is the last token read before "endobj".
        CurrentToken = new ObjectToken(startPosition, new IndirectReference(objectNumber.Long, generation.Int), data[data.Count - 1]);

        return true;
    }

    /// <summary>
    /// Attempt to read the next token of type <typeparamref name="T"/> directly from the underlying core scanner.
    /// </summary>
    public bool TryReadToken<T>(out T token) where T : class, IToken
    {
        return coreTokenScanner.TryReadToken(out token);
    }

    /// <summary>
    /// Move the underlying core scanner to the given position.
    /// </summary>
    public void Seek(long position)
    {
        coreTokenScanner.Seek(position);
    }

    /// <summary>
    /// Register a custom tokenizer with the underlying core scanner for the given first byte.
    /// </summary>
    public void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer)
    {
        coreTokenScanner.RegisterCustomTokenizer(firstByte, tokenizer);
    }

    /// <summary>
    /// Remove a previously registered custom tokenizer from the underlying core scanner.
    /// </summary>
    public void DeregisterCustomTokenizer(ITokenizer tokenizer)
    {
        coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
    }

    /// <summary>
    /// Shift the two-entry history window and record <paramref name="token"/> together with the
    /// start offset the core scanner reports for it.
    /// </summary>
    private void RememberToken(IToken token)
    {
        previousTokens[0] = previousTokens[1];
        previousTokenPositions[0] = previousTokenPositions[1];

        previousTokens[1] = token;
        previousTokenPositions[1] = coreTokenScanner.CurrentTokenStart;
    }
}
}