diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
new file mode 100644
index 00000000..678bafd3
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
@@ -0,0 +1,195 @@
+namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
+{
+    using System;
+    using System.Collections.Generic;
+    using PdfPig.ContentStream;
+    using PdfPig.Cos;
+    using PdfPig.Exceptions;
+    using PdfPig.Tokenization.Scanner;
+    using PdfPig.Tokenization.Tokens;
+    using Xunit;
+
+    public class PdfTokenScannerTests
+    {
+        // NOTE(review): the dictionary's type arguments are assumed to be CosObjectKey/long - confirm against
+        // the CrossReferenceTable constructor.
+        private readonly CrossReferenceTable table = new CrossReferenceTable(CrossReferenceType.Table, new Dictionary<CosObjectKey, long>(),
+            new PdfDictionary());
+
+        [Fact]
+        public void ReadsSimpleObject()
+        {
+            const string s = @"294 0 obj /WDKAAR+CMBX12 endobj";
+
+            var pdfScanner = GetScanner(s);
+
+            Assert.True(pdfScanner.MoveNext());
+
+            var objectToken = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
+
+            var name = Assert.IsType<NameToken>(objectToken.Data);
+
+            Assert.Equal(294, objectToken.Number.ObjectNumber);
+            Assert.Equal(0, objectToken.Number.Generation);
+
+            Assert.Equal("WDKAAR+CMBX12", name.Data.Name);
+
+            Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position));
+        }
+
+        [Fact]
+        public void ReadsNumericObjectWithComment()
+        {
+            const string s = @"%PDF-1.2
+
+% I commented here too, tee hee
+10383384 2 obj
+%and here, I just love comments
+
+45
+
+endobj
+
+%%EOF";
+
+            var pdfScanner = GetScanner(s);
+
+            Assert.True(pdfScanner.MoveNext());
+
+            var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
+
+            var num = Assert.IsType<NumericToken>(obj.Data);
+
+            Assert.Equal(45, num.Int);
+
+            Assert.Equal(10383384, obj.Number.ObjectNumber);
+            Assert.Equal(2, obj.Number.Generation);
+
+            Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position));
+
+            Assert.False(pdfScanner.MoveNext());
+        }
+
+        [Fact]
+        public void ReadsArrayObject()
+        {
+            const string s = @" endobj 295 0 obj [ 676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313 344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313 ] endobj";
+
+            var pdfScanner = GetScanner(s);
+
+            Assert.True(pdfScanner.MoveNext());
+
+            var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
+
+            var array = Assert.IsType<ArrayToken>(obj.Data);
+
+            Assert.Equal(676, ((NumericToken)array.Data[0]).Int);
+
+            Assert.Equal(33, array.Data.Count);
+
+            Assert.Equal(295, obj.Number.ObjectNumber);
+            Assert.Equal(0, obj.Number.Generation);
+
+            Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position));
+
+            Assert.False(pdfScanner.MoveNext());
+        }
+
+        [Fact]
+        public void ReadsDictionaryObjectThenNameThenDictionary()
+        {
+            const string s = @"
+
+274 0 obj << /Type /Pages /Count 2 /Parent 275 0 R /Kids [ 121 0 R 125 0 R ] >> endobj
+ %Other parts...
+310 0 obj /WPXNWT+CMR9 endobj 311 0 obj << /Type /Font /Subtype /Type1 /FirstChar 0 /LastChar 127 /Widths 313 0 R /BaseFont 310 0 R /FontDescriptor 312 0 R >> endobj";
+
+            var scanner = GetScanner(s);
+
+            var tokens = ReadToEnd(scanner);
+
+            var dictionary = Assert.IsType<DictionaryToken>(tokens[0].Data);
+
+            Assert.Equal(4, dictionary.Data.Count);
+            Assert.Equal(274, tokens[0].Number.ObjectNumber);
+            Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position));
+
+            var nameObject = Assert.IsType<NameToken>(tokens[1].Data);
+
+            Assert.Equal("WPXNWT+CMR9", nameObject.Data.Name);
+            Assert.Equal(310, tokens[1].Number.ObjectNumber);
+            Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
+
+            dictionary = Assert.IsType<DictionaryToken>(tokens[2].Data);
+
+            Assert.Equal(7, dictionary.Data.Count);
+            Assert.Equal(311, tokens[2].Number.ObjectNumber);
+            Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position));
+        }
+
+        [Fact]
+        public void ReadsStringObject()
+        {
+            const string s = @"
+
+58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
+";
+
+            var scanner = GetScanner(s);
+
+            var token = ReadToEnd(scanner)[0];
+
+            Assert.Equal(58949797283757L, token.Number.ObjectNumber);
+            Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType<StringToken>(token.Data).Data);
+
+            Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position));
+        }
+
+        [Fact]
+        public void TrailingTokensWithoutObjectReturnFalse()
+        {
+            const string s = @"1 0 obj 5 endobj /Orphan (not inside an object)";
+
+            var pdfScanner = GetScanner(s);
+
+            Assert.True(pdfScanner.MoveNext());
+            Assert.False(pdfScanner.MoveNext());
+        }
+
+        [Fact]
+        public void EmptyObjectThrows()
+        {
+            const string s = @"1 0 obj endobj";
+
+            var pdfScanner = GetScanner(s);
+
+            Assert.Throws<PdfDocumentFormatException>(() => pdfScanner.MoveNext());
+        }
+
+        private PdfTokenScanner GetScanner(string s)
+        {
+            var input = StringBytesTestConverter.Convert(s, false);
+
+            return new PdfTokenScanner(input.Bytes, table);
+        }
+
+        private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
+        {
+            var result = new List<ObjectToken>();
+
+            while (scanner.MoveNext())
+            {
+                if (scanner.CurrentToken is ObjectToken obj)
+                {
+                    result.Add(obj);
+                }
+                else
+                {
+                    throw new InvalidOperationException($"Pdf token scanner produced token which was not an object token: {scanner.CurrentToken}.");
+                }
+            }
+
+            return result;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/StreamTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/StreamTokenizerTests.cs
new file mode 100644
index 00000000..cb76b656
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Tokenization/StreamTokenizerTests.cs
@@ -0,0 +1,13 @@
+namespace UglyToad.PdfPig.Tests.Tokenization
+{
+    using Xunit;
+
+    public class StreamTokenizerTests
+    {
+        [Fact(Skip = "Placeholder: StreamTokenizer does not read streams yet.")]
+        public void ReadsStream()
+        {
+            // TODO: tokenize a stream and assert on the result once StreamTokenizer.Tokenize is implemented.
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Fonts/Parser/Handlers/Type1FontHandler.cs b/src/UglyToad.PdfPig/Fonts/Parser/Handlers/Type1FontHandler.cs
index d2470656..2abe9719 100644
--- a/src/UglyToad.PdfPig/Fonts/Parser/Handlers/Type1FontHandler.cs
+++ b/src/UglyToad.PdfPig/Fonts/Parser/Handlers/Type1FontHandler.cs
@@ -55,7 +55,7 @@
 
             var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
             var name = FontDictionaryAccessHelper.GetName(pdfObjectParser, dictionary, descriptor, reader, isLenientParsing);
-            
+
             CMap toUnicodeCMap = null;
             if (dictionary.TryGetItemOfType(CosName.TO_UNICODE, out CosObject toUnicodeObj))
             {
diff --git a/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
new file mode 100644
index 00000000..e16de0e2
--- /dev/null
+++ b/src/UglyToad.PdfPig/Fonts/Type1/Parser/Type1FontParser.cs
@@ -0,0 +1,16 @@
+namespace UglyToad.PdfPig.Fonts.Type1.Parser
+{
+    /// <summary>
+    /// Placeholder type; presumably the future Type 1 font parser (inferred from the name only).
+    /// </summary>
+    internal class Type1FontParser
+    {
+        /// <summary>
+        /// Not implemented yet: currently does nothing.
+        /// </summary>
+        public void Parse()
+        {
+
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs
index 4eed4932..07563135 100644
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/CoreTokenScanner.cs
@@ -2,6 +2,7 @@
 {
     using System;
     using System.Collections.Generic;
+    using Exceptions;
     using IO;
     using Parser.Parts;
     using Tokens;
@@ -22,6 +23,7 @@
         private readonly List currentBuffer = new List();
         private readonly List<(byte firstByte, ITokenizer tokenizer)> customTokenizers = new List<(byte, ITokenizer)>();
 
+        internal long CurrentTokenStart { get; private set; }
         public IToken CurrentToken { get; private set; }
         public bool TryReadToken(out T token) where T : class, IToken
         {
@@ -150,6 +152,8 @@
                     }
                 }
 
+                CurrentTokenStart = inputBytes.CurrentOffset - 1;
+
                 if (tokenizer == null || !tokenizer.TryTokenize(currentByte, inputBytes, out var token))
                 {
                     isSkippingSymbol = true;
diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
new file mode 100644
index 00000000..c1553ac9
--- /dev/null
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -0,0 +1,170 @@
+namespace UglyToad.PdfPig.Tokenization.Scanner
+{
+    using System;
+    using System.Collections.Generic;
+    using ContentStream;
+    using Cos;
+    using Exceptions;
+    using IO;
+    using Tokens;
+
+    /// <summary>
+    /// Reads indirect objects (object number, generation, obj, data, endobj) using a <see cref="CoreTokenScanner"/>.
+    /// </summary>
+    internal class PdfTokenScanner : ISeekableTokenScanner
+    {
+        private readonly IInputBytes inputBytes;
+        private readonly CrossReferenceTable crossReferenceTable;
+        private readonly CoreTokenScanner coreTokenScanner;
+
+        // The two most recently read non-comment tokens and their start offsets, oldest at index 0.
+        private readonly long[] previousTokenPositions = new long[2];
+        private readonly IToken[] previousTokens = new IToken[2];
+
+        // NOTE(review): type arguments assumed to be IndirectReference/long; the field is not used yet - confirm.
+        private readonly Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
+
+        /// <summary>
+        /// The <see cref="ObjectToken"/> produced by the last call to <see cref="MoveNext"/> which returned true.
+        /// </summary>
+        public IToken CurrentToken { get; private set; }
+
+        public long CurrentPosition => coreTokenScanner.CurrentPosition;
+
+        public PdfTokenScanner(IInputBytes inputBytes, CrossReferenceTable crossReferenceTable)
+        {
+            this.inputBytes = inputBytes;
+            this.crossReferenceTable = crossReferenceTable;
+            coreTokenScanner = new CoreTokenScanner(inputBytes);
+        }
+
+        /// <summary>
+        /// Reads forward to the next "obj" operator and then to its "endobj", storing the object in
+        /// <see cref="CurrentToken"/>.
+        /// </summary>
+        /// <returns>
+        /// True if an object was read. False if the input ended before "obj" or "endobj" was found, or if "obj"
+        /// was preceded by fewer than 2 tokens.
+        /// </returns>
+        /// <exception cref="PdfDocumentFormatException">
+        /// The "obj" operator was not preceded by 2 numbers, or the object contained no data.
+        /// </exception>
+        public bool MoveNext()
+        {
+            var tokensRead = 0;
+            var foundStartObject = false;
+
+            while (coreTokenScanner.MoveNext())
+            {
+                var token = coreTokenScanner.CurrentToken;
+
+                if (token == OperatorToken.StartObject)
+                {
+                    foundStartObject = true;
+                    break;
+                }
+
+                if (token is CommentToken)
+                {
+                    continue;
+                }
+
+                tokensRead++;
+
+                StorePreviousToken(token);
+            }
+
+            // Tokens left over when the input ends (rather than at an "obj" operator) do not start an object, so
+            // they must not be interpreted as an object number and generation.
+            if (!foundStartObject || tokensRead < 2)
+            {
+                return false;
+            }
+
+            var startPosition = previousTokenPositions[0];
+            var objectNumber = previousTokens[0] as NumericToken;
+            var generation = previousTokens[1] as NumericToken;
+
+            if (objectNumber == null || generation == null)
+            {
+                throw new PdfDocumentFormatException("The obj operator (start object) was not preceded by 2 numbers. " +
+                    $"Instead got: {previousTokens[0]} {previousTokens[1]} obj");
+            }
+
+            var data = new List<IToken>();
+            var foundEndObject = false;
+
+            while (coreTokenScanner.MoveNext())
+            {
+                var token = coreTokenScanner.CurrentToken;
+
+                if (token == OperatorToken.EndObject)
+                {
+                    foundEndObject = true;
+                    break;
+                }
+
+                if (token is CommentToken)
+                {
+                    continue;
+                }
+
+                if (token == OperatorToken.StartStream)
+                {
+                    // Read stream.
+                }
+
+                data.Add(token);
+
+                StorePreviousToken(token);
+            }
+
+            if (!foundEndObject)
+            {
+                return false;
+            }
+
+            if (data.Count == 0)
+            {
+                throw new PdfDocumentFormatException($"The object {objectNumber.Long} {generation.Int} contained no data before endobj.");
+            }
+
+            CurrentToken = new ObjectToken(startPosition, new IndirectReference(objectNumber.Long, generation.Int), data[data.Count - 1]);
+
+            return true;
+        }
+
+        public bool TryReadToken<T>(out T token) where T : class, IToken
+        {
+            return coreTokenScanner.TryReadToken(out token);
+        }
+
+        public void Seek(long position)
+        {
+            coreTokenScanner.Seek(position);
+        }
+
+        public void RegisterCustomTokenizer(byte firstByte, ITokenizer tokenizer)
+        {
+            coreTokenScanner.RegisterCustomTokenizer(firstByte, tokenizer);
+        }
+
+        public void DeregisterCustomTokenizer(ITokenizer tokenizer)
+        {
+            coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
+        }
+
+        /// <summary>
+        /// Shifts the newer remembered token into the older slot and remembers <paramref name="token"/>, together
+        /// with the offset at which the core scanner started reading it, as the newest.
+        /// </summary>
+        private void StorePreviousToken(IToken token)
+        {
+            previousTokens[0] = previousTokens[1];
+            previousTokenPositions[0] = previousTokenPositions[1];
+
+            previousTokens[1] = token;
+            previousTokenPositions[1] = coreTokenScanner.CurrentTokenStart;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Tokenization/StreamTokenizer.cs b/src/UglyToad.PdfPig/Tokenization/StreamTokenizer.cs
new file mode 100644
index 00000000..e244abd1
--- /dev/null
+++ b/src/UglyToad.PdfPig/Tokenization/StreamTokenizer.cs
@@ -0,0 +1,28 @@
+namespace UglyToad.PdfPig.Tokenization
+{
+    using Cos;
+    using Exceptions;
+    using IO;
+    using Tokens;
+
+    /// <summary>
+    /// Placeholder for reading the data of a stream; currently it only validates the stream dictionary.
+    /// </summary>
+    internal class StreamTokenizer
+    {
+        /// <summary>
+        /// Checks that <paramref name="streamDictionary"/> defines a length. The stream data is not read yet.
+        /// </summary>
+        /// <returns>Always null until reading the stream is implemented.</returns>
+        /// <exception cref="PdfDocumentFormatException">The stream dictionary has no length entry.</exception>
+        public object Tokenize(DictionaryToken streamDictionary, IInputBytes inputBytes)
+        {
+            if (!streamDictionary.TryGetByName(CosName.LENGTH, out var lengthToken))
+            {
+                throw new PdfDocumentFormatException("The stream dictionary did not define a length: " + streamDictionary);
+            }
+
+            return null;
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Tokenization/Tokens/ObjectToken.cs b/src/UglyToad.PdfPig/Tokenization/Tokens/ObjectToken.cs
new file mode 100644
index 00000000..0e6758db
--- /dev/null
+++ b/src/UglyToad.PdfPig/Tokenization/Tokens/ObjectToken.cs
@@ -0,0 +1,32 @@
+namespace UglyToad.PdfPig.Tokenization.Tokens
+{
+    using ContentStream;
+
+    /// <summary>
+    /// An indirect object read from the file: its offset, its reference and the data it contains.
+    /// </summary>
+    internal class ObjectToken : IDataToken<IToken>
+    {
+        /// <summary>
+        /// The offset of the start of the object number in the file bytes.
+        /// </summary>
+        public long Position { get; set; }
+
+        /// <summary>
+        /// The object and generation number of the object.
+        /// </summary>
+        public IndirectReference Number { get; }
+
+        /// <summary>
+        /// The inner data of the object.
+        /// </summary>
+        public IToken Data { get; }
+
+        public ObjectToken(long position, IndirectReference number, IToken data)
+        {
+            Position = position;
+            Number = number;
+            Data = data;
+        }
+    }
+}