mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-19 02:37:56 +08:00
Create a PDF object scanner that sits on top of the core token scanner to provide complete object parsing
This commit is contained in:
@@ -0,0 +1,170 @@
|
||||
namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using PdfPig.ContentStream;
|
||||
using PdfPig.Cos;
|
||||
using PdfPig.Tokenization.Scanner;
|
||||
using PdfPig.Tokenization.Tokens;
|
||||
using Xunit;
|
||||
|
||||
public class PdfTokenScannerTests
|
||||
{
|
||||
private readonly CrossReferenceTable table = new CrossReferenceTable(CrossReferenceType.Table, new Dictionary<CosObjectKey, long>(),
|
||||
new PdfDictionary());
|
||||
|
||||
/// <summary>
/// A single indirect object containing a name is read with the correct
/// object number, generation, data and start offset.
/// </summary>
[Fact]
public void ReadsSimpleObject()
{
    const string s = @"294 0 obj
/WDKAAR+CMBX12
endobj";

    var pdfScanner = GetScanner(s);

    // Check the scanner reports success instead of relying on CurrentToken being set.
    Assert.True(pdfScanner.MoveNext());

    var objectToken = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);

    var name = Assert.IsType<NameToken>(objectToken.Data);

    Assert.Equal(294, objectToken.Number.ObjectNumber);
    Assert.Equal(0, objectToken.Number.Generation);

    Assert.Equal("WDKAAR+CMBX12", name.Data.Name);

    // The recorded position must point at the start of the object number.
    Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position));

    // Nothing follows the object, matching the end-of-input check in the sibling tests.
    Assert.False(pdfScanner.MoveNext());
}
|
||||
|
||||
/// <summary>
/// Comments before, inside and after an indirect object are skipped and the
/// numeric value inside the object is returned.
/// </summary>
[Fact]
public void ReadsNumericObjectWithComment()
{
    const string s = @"%PDF-1.2

% I commented here too, tee hee
10383384 2 obj
%and here, I just love comments

45

endobj

%%EOF";

    var pdfScanner = GetScanner(s);

    // Check the scanner reports success instead of relying on CurrentToken being set.
    Assert.True(pdfScanner.MoveNext());

    var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);

    var num = Assert.IsType<NumericToken>(obj.Data);

    Assert.Equal(45, num.Int);

    Assert.Equal(10383384, obj.Number.ObjectNumber);
    Assert.Equal(2, obj.Number.Generation);

    // The recorded position must point at the start of the object number.
    Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position));

    // Only the trailing comment remains, so there is no further object.
    Assert.False(pdfScanner.MoveNext());
}
|
||||
|
||||
/// <summary>
/// An indirect object containing an array is read even when the input starts
/// with a stray endobj left over from a preceding object.
/// </summary>
[Fact]
public void ReadsArrayObject()
{
    const string s = @"
endobj

295 0 obj
[
676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313
344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313
]
endobj";

    var pdfScanner = GetScanner(s);

    // Check the scanner reports success instead of relying on CurrentToken being set.
    Assert.True(pdfScanner.MoveNext());

    var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);

    var array = Assert.IsType<ArrayToken>(obj.Data);

    Assert.Equal(676, ((NumericToken)array.Data[0]).Int);

    Assert.Equal(33, array.Data.Count);

    Assert.Equal(295, obj.Number.ObjectNumber);
    Assert.Equal(0, obj.Number.Generation);

    // The recorded position must point at the start of the object number.
    Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position));

    Assert.False(pdfScanner.MoveNext());
}
|
||||
|
||||
/// <summary>
/// Three consecutive indirect objects (dictionary, name, dictionary) are each read
/// with the expected data type, object number and start offset, including the case
/// where an object header follows endobj on the same line.
/// </summary>
[Fact]
public void ReadsDictionaryObjectThenNameThenDictionary()
{
    const string s = @"

274 0 obj
<<
/Type /Pages
/Count 2
/Parent 275 0 R
/Kids [ 121 0 R 125 0 R ]
>>
endobj

%Other parts...

310 0 obj
/WPXNWT+CMR9
endobj 311 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 0
/LastChar 127
/Widths 313 0 R
/BaseFont 310 0 R /FontDescriptor 312 0 R
>>
endobj";

    var tokens = ReadToEnd(GetScanner(s));

    // First object: the pages dictionary.
    var pagesDictionary = Assert.IsType<DictionaryToken>(tokens[0].Data);
    Assert.Equal(4, pagesDictionary.Data.Count);
    Assert.Equal(274, tokens[0].Number.ObjectNumber);
    Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position));

    // Second object: a bare name.
    var fontName = Assert.IsType<NameToken>(tokens[1].Data);
    Assert.Equal("WPXNWT+CMR9", fontName.Data.Name);
    Assert.Equal(310, tokens[1].Number.ObjectNumber);
    Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));

    // Third object: the font dictionary whose header shares a line with the previous endobj.
    var fontDictionary = Assert.IsType<DictionaryToken>(tokens[2].Data);
    Assert.Equal(7, fontDictionary.Data.Count);
    Assert.Equal(311, tokens[2].Number.ObjectNumber);
    Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position));
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsStringObject()
|
||||
{
|
||||
const string s = @"
|
||||
|
||||
58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
|
||||
";
|
||||
|
||||
var scanner = GetScanner(s);
|
||||
|
||||
var token = ReadToEnd(scanner)[0];
|
||||
|
||||
Assert.Equal(58949797283757L, token.Number.ObjectNumber);
|
||||
Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType<StringToken>(token.Data).Data);
|
@@ -0,0 +1,13 @@
|
||||
namespace UglyToad.PdfPig.Tests.Tokenization
|
||||
{
|
||||
using Xunit;
|
||||
|
||||
/// <summary>
/// Test fixture for the stream tokenizer. It does not exercise any behaviour yet.
/// </summary>
public class StreamTokenizerTests
{
    [Fact]
    public void ReadsStream()
    {
        // Placeholder: intentionally empty, so this test passes without asserting anything.
    }
}
|
||||
}
|
@@ -55,7 +55,7 @@
|
||||
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
|
||||
|
||||
var name = FontDictionaryAccessHelper.GetName(pdfObjectParser, dictionary, descriptor, reader, isLenientParsing);
|
||||
|
||||
|
||||
CMap toUnicodeCMap = null;
|
||||
if (dictionary.TryGetItemOfType(CosName.TO_UNICODE, out CosObject toUnicodeObj))
|
||||
{
|
||||
|
10
src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
Normal file
10
src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
Normal file
@@ -0,0 +1,10 @@
|
||||
namespace UglyToad.PdfPig.Fonts.Type1.Parser
|
||||
{
|
||||
/// <summary>
/// Placeholder for a Type 1 font parser. No parsing is implemented yet.
/// </summary>
internal class Type1FontParser
{
    /// <summary>
    /// Currently does nothing.
    /// </summary>
    public void Parse()
    {
    }
}
|
||||
}
|
@@ -2,6 +2,7 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
@@ -22,6 +23,7 @@
|
||||
private readonly List<byte> currentBuffer = new List<byte>();
|
||||
private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
|
||||
|
||||
internal long CurrentTokenStart { get; private set; }
|
||||
public IToken CurrentToken { get; private set; }
|
||||
public bool TryReadToken<T>(out T token) where T : class, IToken
|
||||
{
|
||||
@@ -150,6 +152,8 @@
|
||||
}
|
||||
}
|
||||
|
||||
CurrentTokenStart = inputBytes.CurrentOffset - 1;
|
||||
|
||||
if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
|
||||
{
|
||||
isSkippingSymbol = true;
|
||||
|
120
src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
Normal file
120
src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
Normal file
@@ -0,0 +1,120 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Tokens;
|
||||
|
||||
/// <summary>
/// Scans the input for complete indirect objects ("N G obj ... endobj") by consuming
/// the tokens produced by an underlying <see cref="CoreTokenScanner"/>.
/// </summary>
internal class PdfTokenScanner : ISeekableTokenScanner
{
    private readonly IInputBytes inputBytes;
    private readonly CrossReferenceTable crossReferenceTable;
    private readonly CoreTokenScanner coreTokenScanner;

    // The two most recently read non-comment tokens and their start offsets.
    // Index 1 is the newest, index 0 the one before it.
    private readonly long[] previousTokenPositions = new long[2];
    private readonly IToken[] previousTokens = new IToken[2];

    private readonly Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();

    /// <summary>
    /// The object read by the last successful call to <see cref="MoveNext"/>.
    /// </summary>
    public IToken CurrentToken { get; private set; }

    /// <summary>
    /// The current position of the underlying core scanner.
    /// </summary>
    public long CurrentPosition => coreTokenScanner.CurrentPosition;

    /// <summary>
    /// Creates a scanner reading from <paramref name="inputBytes"/>.
    /// </summary>
    public PdfTokenScanner(IInputBytes inputBytes, CrossReferenceTable crossReferenceTable)
    {
        this.inputBytes = inputBytes;
        this.crossReferenceTable = crossReferenceTable;
        coreTokenScanner = new CoreTokenScanner(inputBytes);
    }

    /// <summary>
    /// Reads forward to the next indirect object and stores it in <see cref="CurrentToken"/>
    /// as an <see cref="ObjectToken"/>.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a complete object was read; <c>false</c> if the input ended before an
    /// obj operator or its matching endobj was found, or if fewer than two tokens preceded
    /// the obj operator.
    /// </returns>
    /// <exception cref="PdfDocumentFormatException">
    /// The obj operator was not preceded by two numeric tokens, or the object contained no data.
    /// </exception>
    public bool MoveNext()
    {
        var tokensRead = 0;
        var foundStartObject = false;

        // Skip ahead to the obj operator, remembering the two tokens immediately before it.
        while (coreTokenScanner.MoveNext())
        {
            var token = coreTokenScanner.CurrentToken;

            if (token == OperatorToken.StartObject)
            {
                foundStartObject = true;
                break;
            }

            if (token is CommentToken)
            {
                continue;
            }

            tokensRead++;

            RememberToken(token, coreTokenScanner.CurrentTokenStart);
        }

        // If the input ran out without an obj operator the remembered tokens are not an
        // object header, so they must not be interpreted as one.
        if (!foundStartObject || tokensRead < 2)
        {
            return false;
        }

        var startPosition = previousTokenPositions[0];
        var objectNumber = previousTokens[0] as NumericToken;
        var generation = previousTokens[1] as NumericToken;

        if (objectNumber == null || generation == null)
        {
            throw new PdfDocumentFormatException("The obj operator (start object) was not preceded by 2 numbers. " +
                                                 $"Instead got: {previousTokens[0]} {previousTokens[1]} obj");
        }

        var data = new List<IToken>();
        var foundEndObject = false;

        while (coreTokenScanner.MoveNext())
        {
            var token = coreTokenScanner.CurrentToken;

            if (token == OperatorToken.EndObject)
            {
                foundEndObject = true;
                break;
            }

            if (token is CommentToken)
            {
                continue;
            }

            // TODO: stream contents are not read yet; a stream operator is stored like any other token.
            data.Add(token);

            // Use the token start offset here as well, consistent with the header loop above.
            RememberToken(token, coreTokenScanner.CurrentTokenStart);
        }

        if (!foundEndObject)
        {
            return false;
        }

        if (data.Count == 0)
        {
            // Previously this surfaced as an ArgumentOutOfRangeException from the list indexer.
            throw new PdfDocumentFormatException($"The object {objectNumber.Long} {generation.Int} obj contained no data before endobj.");
        }

        // Only the last token inside the object is used as the object's data.
        CurrentToken = new ObjectToken(startPosition, new IndirectReference(objectNumber.Long, generation.Int), data[data.Count - 1]);

        return true;
    }

    /// <summary>
    /// Delegates to the underlying core scanner.
    /// </summary>
    public bool TryReadToken<T>(out T token) where T : class, IToken
    {
        return coreTokenScanner.TryReadToken(out token);
    }

    /// <summary>
    /// Delegates to the underlying core scanner.
    /// </summary>
    public void Seek(long position)
    {
        coreTokenScanner.Seek(position);
    }

    /// <summary>
    /// Delegates to the underlying core scanner.
    /// </summary>
    public void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer)
    {
        coreTokenScanner.RegisterCustomTokenizer(firstByte, tokenizer);
    }

    /// <summary>
    /// Delegates to the underlying core scanner.
    /// </summary>
    public void DeregisterCustomTokenizer(ITokenizer tokenizer)
    {
        coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
    }

    /// <summary>
    /// Shifts the newest remembered token into the older slot and stores
    /// <paramref name="token"/> and its start offset as the newest.
    /// </summary>
    private void RememberToken(IToken token, long position)
    {
        previousTokens[0] = previousTokens[1];
        previousTokenPositions[0] = previousTokenPositions[1];

        previousTokens[1] = token;
        previousTokenPositions[1] = position;
    }
}
|
||||
}
|
20
src/UglyToad.PdfPig/Tokenization/StreamTokenizer.cs
Normal file
20
src/UglyToad.PdfPig/Tokenization/StreamTokenizer.cs
Normal file
@@ -0,0 +1,20 @@
|
||||
namespace UglyToad.PdfPig.Tokenization
|
||||
{
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Tokens;
|
||||
|
||||
/// <summary>
/// Tokenizer for stream objects. At present it only checks that the stream
/// dictionary contains a length entry; no stream data is read.
/// </summary>
internal class StreamTokenizer
{
    /// <summary>
    /// Validates the stream dictionary for the stream at the current input position.
    /// </summary>
    /// <param name="streamDictionary">The dictionary preceding the stream.</param>
    /// <param name="inputBytes">The input to read from; not used yet.</param>
    /// <returns>Always <c>null</c> for now.</returns>
    /// <exception cref="PdfDocumentFormatException">The dictionary has no length entry.</exception>
    public object Tokenize(DictionaryToken streamDictionary, IInputBytes inputBytes)
    {
        var hasLength = streamDictionary.TryGetByName(CosName.LENGTH, out var lengthToken);

        if (hasLength)
        {
            // Reading the stream contents is not implemented yet.
            return null;
        }

        throw new PdfDocumentFormatException("The stream dictionary did not define a length: " + streamDictionary);
    }
}
|
||||
}
|
29
src/UglyToad.PdfPig/Tokenization/Tokens/ObjectToken.cs
Normal file
29
src/UglyToad.PdfPig/Tokenization/Tokens/ObjectToken.cs
Normal file
@@ -0,0 +1,29 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Tokens
|
||||
{
|
||||
using ContentStream;
|
||||
|
||||
/// <summary>
/// A token representing an indirect object: its offset, its object/generation
/// number and the token it contains.
/// </summary>
internal class ObjectToken : IDataToken<IToken>
{
    /// <summary>
    /// Creates an object token from its offset, reference and contained token.
    /// </summary>
    public ObjectToken(long position, IndirectReference number, IToken data)
    {
        Data = data;
        Number = number;
        Position = position;
    }

    /// <summary>
    /// Offset in the file bytes at which the object number begins.
    /// </summary>
    public long Position { get; set; }

    /// <summary>
    /// Object number and generation number identifying the object.
    /// </summary>
    public IndirectReference Number { get; }

    /// <summary>
    /// The token held inside the object.
    /// </summary>
    public IToken Data { get; }
}
|
||||
}
|
Reference in New Issue
Block a user