Create a PDF object scanner which sits on top of the core token scanner to provide complete object parsing

Eliot Jones
2018-01-13 22:30:15 +00:00
parent 4b0af707d1
commit 8dcea9b37f
8 changed files with 367 additions and 1 deletion
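
The new PdfTokenScanner wraps the existing CoreTokenScanner and yields one complete indirect object per MoveNext call. A minimal sketch of how it is driven, based on the tests in this commit (ByteArrayInputBytes, the ASCII conversion and the wrapper class are assumptions; the tests' own GetScanner helper is not shown in this view, and the types are internal to the library):

using System;
using System.Collections.Generic;
using System.Text;
using UglyToad.PdfPig.ContentStream;
using UglyToad.PdfPig.Cos;
using UglyToad.PdfPig.IO;
using UglyToad.PdfPig.Tokenization.Scanner;
using UglyToad.PdfPig.Tokenization.Tokens;

internal static class PdfTokenScannerSketch
{
    public static void Run()
    {
        const string input = @"294 0 obj
/WDKAAR+CMBX12
endobj";

        // An empty cross-reference table is enough for sequential scanning, mirroring the tests.
        var table = new CrossReferenceTable(CrossReferenceType.Table,
            new Dictionary<CosObjectKey, long>(), new PdfDictionary());

        // ByteArrayInputBytes is assumed to be the IInputBytes implementation the tests use.
        var scanner = new PdfTokenScanner(new ByteArrayInputBytes(Encoding.ASCII.GetBytes(input)), table);

        while (scanner.MoveNext())
        {
            // Each ObjectToken carries the indirect reference, the parsed inner token
            // and the byte offset at which "294 0 obj" begins.
            var obj = (ObjectToken)scanner.CurrentToken;
            Console.WriteLine($"{obj.Number.ObjectNumber} {obj.Number.Generation}: {obj.Data} at {obj.Position}");
        }
    }
}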


@@ -0,0 +1,170 @@
namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using PdfPig.ContentStream;
using PdfPig.Cos;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokenization.Tokens;
using Xunit;
public class PdfTokenScannerTests
{
private readonly CrossReferenceTable table = new CrossReferenceTable(CrossReferenceType.Table, new Dictionary<CosObjectKey, long>(),
new PdfDictionary());
[Fact]
public void ReadsSimpleObject()
{
const string s = @"294 0 obj
/WDKAAR+CMBX12
endobj";
var pdfScanner = GetScanner(s);
pdfScanner.MoveNext();
var objectToken = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
var name = Assert.IsType<NameToken>(objectToken.Data);
Assert.Equal(294, objectToken.Number.ObjectNumber);
Assert.Equal(0, objectToken.Number.Generation);
Assert.Equal("WDKAAR+CMBX12", name.Data.Name);
Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position));
}
[Fact]
public void ReadsNumericObjectWithComment()
{
const string s = @"%PDF-1.2
% I commented here too, tee hee
10383384 2 obj
%and here, I just love comments
45
endobj
%%EOF";
var pdfScanner = GetScanner(s);
pdfScanner.MoveNext();
var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
var num = Assert.IsType<NumericToken>(obj.Data);
Assert.Equal(45, num.Int);
Assert.Equal(10383384, obj.Number.ObjectNumber);
Assert.Equal(2, obj.Number.Generation);
Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position));
Assert.False(pdfScanner.MoveNext());
}
[Fact]
public void ReadsArrayObject()
{
const string s = @"
endobj
295 0 obj
[
676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313
344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313
]
endobj";
var pdfScanner = GetScanner(s);
pdfScanner.MoveNext();
var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
var array = Assert.IsType<ArrayToken>(obj.Data);
Assert.Equal(676, ((NumericToken)array.Data[0]).Int);
Assert.Equal(33, array.Data.Count);
Assert.Equal(295, obj.Number.ObjectNumber);
Assert.Equal(0, obj.Number.Generation);
Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position));
Assert.False(pdfScanner.MoveNext());
}
[Fact]
public void ReadsDictionaryObjectThenNameThenDictionary()
{
const string s = @"
274 0 obj
<<
/Type /Pages
/Count 2
/Parent 275 0 R
/Kids [ 121 0 R 125 0 R ]
>>
endobj
%Other parts...
310 0 obj
/WPXNWT+CMR9
endobj 311 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 0
/LastChar 127
/Widths 313 0 R
/BaseFont 310 0 R /FontDescriptor 312 0 R
>>
endobj";
var scanner = GetScanner(s);
var tokens = ReadToEnd(scanner);
var dictionary = Assert.IsType<DictionaryToken>(tokens[0].Data);
Assert.Equal(4, dictionary.Data.Count);
Assert.Equal(274, tokens[0].Number.ObjectNumber);
Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position));
var nameObject = Assert.IsType<NameToken>(tokens[1].Data);
Assert.Equal("WPXNWT+CMR9", nameObject.Data.Name);
Assert.Equal(310, tokens[1].Number.ObjectNumber);
Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
dictionary = Assert.IsType<DictionaryToken>(tokens[2].Data);
Assert.Equal(7, dictionary.Data.Count);
Assert.Equal(311, tokens[2].Number.ObjectNumber);
Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position));
}
[Fact]
public void ReadsStringObject()
{
const string s = @"
58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
Assert.Equal(58949797283757L, token.Number.ObjectNumber);
Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType<StringToken>(token.Data).Data);


@@ -0,0 +1,13 @@
namespace UglyToad.PdfPig.Tests.Tokenization
{
using Xunit;
public class StreamTokenizerTests
{
[Fact]
public void ReadsStream()
{
}
}
}


@@ -0,0 +1,10 @@
namespace UglyToad.PdfPig.Fonts.Type1.Parser
{
internal class Type1FontParser
{
public void Parse()
{
}
}
}


@@ -2,6 +2,7 @@
{
using System;
using System.Collections.Generic;
using Exceptions;
using IO;
using Parser.Parts;
using Tokens;
@@ -22,6 +23,7 @@
private readonly List<byte> currentBuffer = new List<byte>();
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
internal long CurrentTokenStart { get; private set; }
public IToken CurrentToken { get; private set; }
public bool TryReadToken<T>(out T token) where T : class, IToken
{
@@ -150,6 +152,8 @@
}
}
CurrentTokenStart = inputBytes.CurrentOffset - 1;
if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
{
isSkippingSymbol = true;


@@ -0,0 +1,120 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using ContentStream;
using Cos;
using Exceptions;
using IO;
using Tokens;
internal class PdfTokenScanner : ISeekableTokenScanner
{
private readonly IInputBytes inputBytes;
private readonly CrossReferenceTable crossReferenceTable;
private readonly CoreTokenScanner coreTokenScanner;
private readonly long[] previousTokenPositions = new long[2];
private readonly IToken[] previousTokens = new IToken[2];
private readonly Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
public IToken CurrentToken { get; private set; }
public long CurrentPosition => coreTokenScanner.CurrentPosition;
public PdfTokenScanner(IInputBytes inputBytes, CrossReferenceTable crossReferenceTable)
{
this.inputBytes = inputBytes;
this.crossReferenceTable = crossReferenceTable;
coreTokenScanner = new CoreTokenScanner(inputBytes);
}
public bool MoveNext()
{
int tokensRead = 0;
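// Scan forward to the next "obj" operator, keeping the previous two non-comment tokens so the object number and generation that precede it can be recovered.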
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.StartObject)
{
if (coreTokenScanner.CurrentToken is CommentToken)
{
continue;
}
tokensRead++;
previousTokens[0] = previousTokens[1];
previousTokenPositions[0] = previousTokenPositions[1];
previousTokens[1] = coreTokenScanner.CurrentToken;
previousTokenPositions[1] = coreTokenScanner.CurrentTokenStart;
}
if (tokensRead < 2)
{
return false;
}
var startPosition = previousTokenPositions[0];
var objectNumber = previousTokens[0] as NumericToken;
var generation = previousTokens[1] as NumericToken;
if (objectNumber == null || generation == null)
{
throw new PdfDocumentFormatException("The obj operator (start object) was not preceded by two numbers. " +
$"Instead got: {previousTokens[0]} {previousTokens[1]} obj");
}
var data = new List<IToken>();
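// Collect the tokens that make up the object's data until the matching "endobj" operator.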
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.EndObject)
{
if (coreTokenScanner.CurrentToken is CommentToken)
{
continue;
}
if (coreTokenScanner.CurrentToken == OperatorToken.StartStream)
{
// Read stream.
}
data.Add(coreTokenScanner.CurrentToken);
previousTokens[0] = previousTokens[1];
previousTokenPositions[0] = previousTokenPositions[1];
previousTokens[1] = coreTokenScanner.CurrentToken;
previousTokenPositions[1] = coreTokenScanner.CurrentPosition;
}
if (coreTokenScanner.CurrentToken != OperatorToken.EndObject)
{
return false;
}
CurrentToken = new ObjectToken(startPosition, new IndirectReference(objectNumber.Long, generation.Int), data[data.Count - 1]);
return true;
}
public bool TryReadToken<T>(out T token) where T : class, IToken
{
return coreTokenScanner.TryReadToken(out token);
}
public void Seek(long position)
{
coreTokenScanner.Seek(position);
}
public void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer)
{
coreTokenScanner.RegisterCustomTokenizer(firstByte, tokenizer);
}
public void DeregisterCustomTokenizer(ITokenizer tokenizer)
{
coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
}
}
}


@@ -0,0 +1,20 @@
namespace UglyToad.PdfPig.Tokenization
{
using Cos;
using Exceptions;
using IO;
using Tokens;
internal class StreamTokenizer
{
public object Tokenize(DictionaryToken streamDictionary, IInputBytes inputBytes)
{
if (!streamDictionary.TryGetByName(CosName.LENGTH, out var lengthToken))
{
throw new PdfDocumentFormatException("The stream dictionary did not define a length: " + streamDictionary);
}
return null;
}
}
}


@@ -0,0 +1,29 @@
namespace UglyToad.PdfPig.Tokenization.Tokens
{
using ContentStream;
internal class ObjectToken : IDataToken<IToken>
{
/// <summary>
/// The offset of the start of the object number in the file bytes.
/// </summary>
public long Position { get; set; }
/// <summary>
/// The object and generation number of the object.
/// </summary>
public IndirectReference Number { get; }
/// <summary>
/// The inner data of the object.
/// </summary>
public IToken Data { get; }
public ObjectToken(long position, IndirectReference number, IToken data)
{
Position = position;
Number = number;
Data = data;
}
}
}