diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs index a0635a84..1ccbc138 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/DictionaryTokenizerTests.cs @@ -39,7 +39,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization var dictionary = AssertDictionaryToken(token); - AssertDictionaryEntry(dictionary, 0, CosName.NAME, "Barry Scott"); + AssertDictionaryEntry(dictionary, CosName.NAME, "Barry Scott"); } [Fact] @@ -53,7 +53,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization var dictionary = AssertDictionaryToken(token); - AssertDictionaryEntry(dictionary, 0, CosName.TYPE, + AssertDictionaryEntry(dictionary, CosName.TYPE, CosName.Create("Example")); } @@ -68,9 +68,9 @@ namespace UglyToad.PdfPig.Tests.Tokenization var dictionary = AssertDictionaryToken(token); - AssertDictionaryEntry(dictionary, 0, CosName.FILTER, CosName.FLATE_DECODE); - AssertDictionaryEntry(dictionary, 1, CosName.S, 36); - AssertDictionaryEntry(dictionary, 2, CosName.LENGTH, 53); + AssertDictionaryEntry(dictionary, CosName.FILTER, CosName.FLATE_DECODE); + AssertDictionaryEntry(dictionary, CosName.S, 36); + AssertDictionaryEntry(dictionary, CosName.LENGTH, 53); } [Fact] @@ -86,8 +86,8 @@ namespace UglyToad.PdfPig.Tests.Tokenization var reference = new IndirectReference(14, 0); - AssertDictionaryEntry(dictionary, 0, CosName.PAGES, reference); - AssertDictionaryEntry(dictionary, 1, CosName.TYPE, CosName.CATALOG); + AssertDictionaryEntry(dictionary, CosName.PAGES, reference); + AssertDictionaryEntry(dictionary, CosName.TYPE, CosName.CATALOG); } [Fact] @@ -114,22 +114,22 @@ namespace UglyToad.PdfPig.Tests.Tokenization var dictionary = AssertDictionaryToken(token); - AssertDictionaryEntry(dictionary, 0, CosName.TYPE, CosName.Create("Example")); - AssertDictionaryEntry(dictionary, 1, CosName.SUBTYPE, CosName.Create("DictionaryExample")); - AssertDictionaryEntry(dictionary, 2, CosName.VERSION, 0.01m); - AssertDictionaryEntry(dictionary, 3, CosName.Create("IntegerItem"), 12m); - AssertDictionaryEntry(dictionary, 4, CosName.Create("StringItem"), "a string"); + AssertDictionaryEntry(dictionary, CosName.TYPE, CosName.Create("Example")); + AssertDictionaryEntry(dictionary, CosName.SUBTYPE, CosName.Create("DictionaryExample")); + AssertDictionaryEntry(dictionary, CosName.VERSION, 0.01m); + AssertDictionaryEntry(dictionary, CosName.Create("IntegerItem"), 12m); + AssertDictionaryEntry(dictionary, CosName.Create("StringItem"), "a string"); var subDictionary = GetIndex(5, dictionary); - Assert.Equal(CosName.Create("Subdictionary"), Assert.IsType(subDictionary.Key).Data); + Assert.Equal("Subdictionary", subDictionary.Key); var subDictionaryValue = Assert.IsType(subDictionary.Value); - AssertDictionaryEntry(subDictionaryValue, 0, CosName.Create("Item1"), 0.4m); - AssertDictionaryEntry(subDictionaryValue, 1, CosName.Create("Item2"), true); - AssertDictionaryEntry(subDictionaryValue, 2, CosName.Create("LastItem"), "not!"); - AssertDictionaryEntry(subDictionaryValue, 3, CosName.Create("VeryLastItem"), "OK"); + AssertDictionaryEntry(subDictionaryValue, CosName.Create("Item1"), 0.4m); + AssertDictionaryEntry(subDictionaryValue, CosName.Create("Item2"), true); + AssertDictionaryEntry(subDictionaryValue, CosName.Create("LastItem"), "not!"); + AssertDictionaryEntry(subDictionaryValue, CosName.Create("VeryLastItem"), "OK"); } [Fact] @@ -147,8 +147,8 @@ endobj var reference = new IndirectReference(69, 0); - AssertDictionaryEntry(dictionary, 0, CosName.PAGES, reference); - AssertDictionaryEntry(dictionary, 1, CosName.TYPE, CosName.CATALOG); + AssertDictionaryEntry(dictionary, CosName.PAGES, reference); + AssertDictionaryEntry(dictionary, CosName.TYPE, CosName.CATALOG); Assert.Equal(2, dictionary.Data.Count); } @@ -164,37 +164,32 @@ endobj var dictionary = AssertDictionaryToken(token); - AssertDictionaryEntry(dictionary, 0, CosName.COUNT, 12); + AssertDictionaryEntry(dictionary, CosName.COUNT, 12); var subDictionaryToken = GetIndex(1, dictionary); - Assert.Equal(CosName.Create("Definition"), Assert.IsType(subDictionaryToken.Key).Data); + Assert.Equal("Definition", subDictionaryToken.Key); var subDictionary = Assert.IsType(subDictionaryToken.Value); - AssertDictionaryEntry(subDictionary, 0, CosName.NAME, "Glorp"); + AssertDictionaryEntry(subDictionary, CosName.NAME, "Glorp"); - AssertDictionaryEntry(dictionary, 2, CosName.TYPE, CosName.CATALOG); + AssertDictionaryEntry(dictionary, CosName.TYPE, CosName.CATALOG); Assert.Equal(3, dictionary.Data.Count); } - private static void AssertDictionaryEntry( - DictionaryToken dictionary, int index, TKeyData key, - TValueData value) where TKey : IDataToken where TValue : IDataToken + private static void AssertDictionaryEntry(DictionaryToken dictionary, CosName key, + TValueData value) where TValue : IDataToken { - KeyValuePair data = GetIndex(index, dictionary); + var result = dictionary.Data[key.Name]; - var keyToken = Assert.IsType(data.Key); - - Assert.Equal(key, keyToken.Data); - - var valueToken = Assert.IsType(data.Value); + var valueToken = Assert.IsType(result); Assert.Equal(value, valueToken.Data); } - private static KeyValuePair GetIndex(int index, DictionaryToken dictionary) + private static KeyValuePair GetIndex(int index, DictionaryToken dictionary) { int i = 0; foreach (var pair in dictionary.Data) diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs index 678bafd3..a8eca719 100644 --- a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs @@ -2,17 +2,14 @@ { using System; using System.Collections.Generic; + using System.Text; using PdfPig.ContentStream; - using PdfPig.Cos; using PdfPig.Tokenization.Scanner; using PdfPig.Tokenization.Tokens; using Xunit; public class PdfTokenScannerTests { - private readonly CrossReferenceTable table = new CrossReferenceTable(CrossReferenceType.Table, new Dictionary(), - new PdfDictionary()); - [Fact] public void ReadsSimpleObject() { @@ -34,6 +31,23 @@ Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position)); } + [Fact] + public void ReadsIndirectReferenceInObject() + { + const string s = @" +15 0 obj +12 7 R +endobj"; + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[0]; + + var reference = Assert.IsType(token.Data); + + Assert.Equal(new IndirectReference(12, 7), reference.Data); + } + [Fact] public void ReadsNumericObjectWithComment() { @@ -141,11 +155,131 @@ endobj Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position)); } - private PdfTokenScanner GetScanner(string s) + [Fact] + public void ReadsStreamObject() + { + const string s = @" +352 0 obj << /S 1273 /Filter /FlateDecode /Length 353 0 R >> stream +H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´ +É² ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs  Ô;€ +À»—ÀF`ÇF@ƒ 4 ˜ï @¥T¨³fY: žw̵;’’Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒ›ß– ¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñ­ím·°gêêb,/,£P§õ^ v¾ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ A¡¬àð‰É©ˆ°‘¼›‚%¥×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú;x#dÃÄ$m ++) )†…±n +9ùyŽA·n\ï»t!=3£½¡:®­µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U‘¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²‚|ê«êªDµ5q°šR¦RÈ£ n¾[è~“}ýƒÝ½SꞦ'æQŽzÝ‚mæ +óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЍ«#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c›©.Œòµ9zz0Ѳ‚B¢«#š-3ªàŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£­ivvv…k2=µZMØ|Úl(ŠZ­V›ÍbI>Ÿl¹œ(â±Äb­ø”Uª ñeü©U*‹’“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0‹Æôz“‰ NÊ,¬‚kÀ°F‚XÛ4&“ÉfÃñÅæûæy=ÆãIðE _¾Èårår/XÞ„/·qò›m¶ìÖ|†óx8Wð¹hºÜÂÕalÎü’˜Ã0^Òòòü¼yÞ¶´´DX + )¨ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„› ”)‰'3¤œ\–™ËN–™ÿe^Ё² y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@ €Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤k¾ hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SЍøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5‚²ìɐ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿i aœ‘ØÏºþÇoäO ôkÆ) + endstream + endobj + 353 0 obj + 1479 + endobj"; + + var locationProvider = new TestObjectLocationProvider(); + // Mark location of "353 0 obj" + locationProvider.Offsets[new IndirectReference(353, 0)] = 1643; + + var scanner = GetScanner(s, locationProvider); + + var tokens = ReadToEnd(scanner); + + Assert.Equal(2, tokens.Count); + + var stream = Assert.IsType(tokens[0].Data); + + var str = Encoding.UTF8.GetString(stream.Data); + + Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str); + + Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]); + } + + [Fact] + public void ReadsSimpleStreamObject() + { + // Length of the bytes as found by Encoding.UTF8.GetBytes is 45 + const string s = @" +574387 0 obj +<< /Length 45 >> +stream +À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú +endstream +endobj"; + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[0]; + + var stream = Assert.IsType(token.Data); + + Assert.Equal(45, stream.Data.Length); + + var outputString = Encoding.UTF8.GetString(stream.Data); + + Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú", outputString); + } + + [Fact] + public void ReadsStreamWithIndirectLength() + { + const string s = @"5 0 obj 52 endobj + + + +12 0 obj + +<< /Length 5 0 R /S 1245 >> + +stream +%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼ +endstream +endobj"; + var locationProvider = new TestObjectLocationProvider(); + + locationProvider.Offsets[new IndirectReference(5, 0)] = 0; + + var scanner = GetScanner(s, locationProvider); + + var token = ReadToEnd(scanner)[1]; + + var stream = Assert.IsType(token.Data); + + Assert.Equal(52, stream.Data.Length); + + var outputString = Encoding.UTF8.GetString(stream.Data); + + Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼", outputString); + } + + [Fact] + public void ReadsStreamWithMissingLength() + { + const string s = @" +12655 0 obj + +<< /S 1245 >> + +stream +%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼ +endstream +endobj"; + + var scanner = GetScanner(s); + + var token = ReadToEnd(scanner)[0]; + + Assert.Equal(12655, token.Number.ObjectNumber); + + var stream = Assert.IsType(token.Data); + + Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString()); + + Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data)); + } + + private PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null) { var input = StringBytesTestConverter.Convert(s, false); - return new PdfTokenScanner(input.Bytes, table); + return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider()); } private static IReadOnlyList ReadToEnd(PdfTokenScanner scanner) diff --git a/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/TestObjectLocationProvider.cs b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/TestObjectLocationProvider.cs new file mode 100644 index 00000000..553ba36c --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/TestObjectLocationProvider.cs @@ -0,0 +1,21 @@ +namespace UglyToad.PdfPig.Tests.Tokenization.Scanner +{ + using System.Collections.Generic; + using PdfPig.ContentStream; + using PdfPig.Tokenization.Scanner; + + internal class TestObjectLocationProvider : IObjectLocationProvider + { + public Dictionary Offsets { get; } = new Dictionary(); + + public bool TryGetOffset(IndirectReference reference, out long offset) + { + return Offsets.TryGetValue(reference, out offset); + } + + public void UpdateOffset(IndirectReference reference, long offset) + { + Offsets[reference] = offset; + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/ContentStream/IndirectReference.cs b/src/UglyToad.PdfPig/ContentStream/IndirectReference.cs index f4e819d1..a786ea5b 100644 --- a/src/UglyToad.PdfPig/ContentStream/IndirectReference.cs +++ b/src/UglyToad.PdfPig/ContentStream/IndirectReference.cs @@ -1,15 +1,57 @@ namespace UglyToad.PdfPig.ContentStream { + /// + /// Used to uniquely identify and refer to objects in the PDF file. + /// internal struct IndirectReference { + /// + /// A positive integer object number. + /// public long ObjectNumber { get; } + /// + /// A non-negative integer generation number which starts as 0 and increases if the file is updated incrementally. + /// public int Generation { get; } + /// + /// Create a new + /// + /// The object number. + /// The generation number. public IndirectReference(long objectNumber, int generation) { ObjectNumber = objectNumber; Generation = generation; } + + public override bool Equals(object obj) + { + if (obj is IndirectReference reference) + { + return reference.ObjectNumber == ObjectNumber + && reference.Generation == Generation; + } + + return false; + } + + public override int GetHashCode() + { + unchecked + { + int hash = 59; + hash = hash * 97 + ObjectNumber.GetHashCode(); + hash = hash * 97 + Generation.GetHashCode(); + + return hash; + } + } + + public override string ToString() + { + return $"{ObjectNumber} {Generation}"; + } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs new file mode 100644 index 00000000..40ba42d9 --- /dev/null +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/IObjectLocationProvider.cs @@ -0,0 +1,11 @@ +namespace UglyToad.PdfPig.Tokenization.Scanner +{ + using ContentStream; + + internal interface IObjectLocationProvider + { + bool TryGetOffset(IndirectReference reference, out long offset); + + void UpdateOffset(IndirectReference reference, long offset); + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs index c1553ac9..14658c9a 100644 --- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs @@ -1,37 +1,44 @@ namespace UglyToad.PdfPig.Tokenization.Scanner { - using System; using System.Collections.Generic; + using System.Diagnostics; + using System.IO; using ContentStream; - using Cos; using Exceptions; using IO; + using Parser.Parts; using Tokens; internal class PdfTokenScanner : ISeekableTokenScanner { private readonly IInputBytes inputBytes; - private readonly CrossReferenceTable crossReferenceTable; + private readonly IObjectLocationProvider objectLocationProvider; private readonly CoreTokenScanner coreTokenScanner; + /// + /// Stores tokens encountered between obj - endobj markers for each call. + /// Cleared after each operation. + /// + private readonly List readTokens = new List(); + + // Store the previous 2 tokens and their positions so we can backtrack to find object numbers and stream dictionaries. private readonly long[] previousTokenPositions = new long[2]; private readonly IToken[] previousTokens = new IToken[2]; - private readonly Dictionary objectOffsets = new Dictionary(); - public IToken CurrentToken { get; private set; } public long CurrentPosition => coreTokenScanner.CurrentPosition; - public PdfTokenScanner(IInputBytes inputBytes, CrossReferenceTable crossReferenceTable) + public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider) { this.inputBytes = inputBytes; - this.crossReferenceTable = crossReferenceTable; + this.objectLocationProvider = objectLocationProvider; coreTokenScanner = new CoreTokenScanner(inputBytes); } public bool MoveNext() { + // Read until we find object-number generation obj, e.g. "69 420 obj". int tokensRead = 0; while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.StartObject) { @@ -49,6 +56,7 @@ previousTokenPositions[1] = coreTokenScanner.CurrentTokenStart; } + // We only read partial tokens. if (tokensRead < 2) { return false; @@ -64,8 +72,7 @@ $"Instead got: {previousTokens[0]} {previousTokens[1]} obj"); } - var data = new List(); - + // Read all tokens between obj and endobj. while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.EndObject) { if (coreTokenScanner.CurrentToken is CommentToken) @@ -73,12 +80,26 @@ continue; } - if (coreTokenScanner.CurrentToken == OperatorToken.StartStream) + if (coreTokenScanner.CurrentToken == OperatorToken.StartObject) { - // Read stream. + // This should never happen. + Debug.Assert(false, "Encountered a start object 'obj' operator before the end of the previous object."); + return false; } - data.Add(coreTokenScanner.CurrentToken); + if (coreTokenScanner.CurrentToken == OperatorToken.StartStream) + { + // Read stream: special case. + if (TryReadStream(coreTokenScanner.CurrentTokenStart, out var stream)) + { + readTokens.Clear(); + readTokens.Add(stream); + } + } + else + { + readTokens.Add(coreTokenScanner.CurrentToken); + } previousTokens[0] = previousTokens[1]; previousTokenPositions[0] = previousTokenPositions[1]; @@ -89,10 +110,322 @@ if (coreTokenScanner.CurrentToken != OperatorToken.EndObject) { + readTokens.Clear(); return false; } - CurrentToken = new ObjectToken(startPosition, new IndirectReference(objectNumber.Long, generation.Int), data[data.Count - 1]); + var reference = new IndirectReference(objectNumber.Long, generation.Int); + + IToken token; + if (readTokens.Count == 3 && readTokens[0] is NumericToken objNum + && readTokens[1] is NumericToken genNum + && readTokens[2] == OperatorToken.R) + { + // I have no idea if this can ever happen. + token = new IndirectReferenceToken(new IndirectReference(objNum.Long, genNum.Int)); + } + else + { + // Just take the last, should only ever be 1 + Debug.Assert(readTokens.Count == 1, "Found more than 1 token in an object."); + + token = readTokens[readTokens.Count - 1]; + } + + CurrentToken = new ObjectToken(startPosition, reference, token); + + objectLocationProvider.UpdateOffset(reference, startPosition); + + readTokens.Clear(); + return true; + } + + private bool TryReadStream(long startStreamTokenOffset, out StreamToken stream) + { + stream = null; + + DictionaryToken streamDictionaryToken = GetStreamDictionary(); + + // Get the expected length from the stream dictionary if present. + long? length = GetStreamLength(streamDictionaryToken); + + // Verify again that we start with "stream" + var hasStartStreamToken = ReadStreamTokenStart(inputBytes, startStreamTokenOffset); + + if (!hasStartStreamToken) + { + return false; + } + + // From the specification: The stream operator should be followed by \r\n or \n, not just \r. + if (!inputBytes.MoveNext()) + { + return false; + } + + if (inputBytes.CurrentByte == '\r') + { + inputBytes.MoveNext(); + } + + if (inputBytes.CurrentByte != '\n') + { + return false; + } + + // Store where we started reading the first byte of data. + long startDataOffset = inputBytes.CurrentOffset; + + // Store how many bytes we have read for checking against Length. + long read = 0; + + // We want to check if we ever read 'endobj' or 'endstream'. + int endObjPosition = 0; + int endStreamPosition = 0; + int commonPartPosition = 0; + + const string commonPart = "end"; + const string streamPart = "stream"; + const string objPart = "obj"; + + // Track any 'endobj' or 'endstream' operators we see. + var observedEndLocations = new List(); + + // Keep track of the previous byte. + byte previousByte = 0; + + // Begin reading the stream. + using (var memoryStream = new MemoryStream()) + using (var binaryWrite = new BinaryWriter(memoryStream)) + { + while (inputBytes.MoveNext()) + { + if (length.HasValue && read == length) + { + // TODO: read ahead and check we're at the end... + // break; + } + + // We are reading 'end' (possibly). + if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition]) + { + if (commonPartPosition == 0 && !ReadHelper.IsWhitespace(previousByte)) + { + // We've just encountered a normal 'e' in the stream. + } + else + { + commonPartPosition++; + } + } + else if (commonPartPosition == commonPart.Length) + { + // We are reading 'stream' after 'end' + if (inputBytes.CurrentByte == streamPart[endStreamPosition]) + { + endObjPosition = 0; + endStreamPosition++; + + // We've finished reading 'endstream', add it to the end tokens we've seen. + if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))) + { + var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream); + + observedEndLocations.Add(token); + + if (length.HasValue && read > length) + { + break; + } + + endStreamPosition = 0; + } + } + else if (inputBytes.CurrentByte == objPart[endObjPosition]) + { + // We are reading 'obj' after 'end' + + endStreamPosition = 0; + endObjPosition++; + + // We have finished reading 'endobj'. + if (endObjPosition == objPart.Length) + { + // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now. + if (observedEndLocations.Count > 0) + { + var lastEndToken = observedEndLocations[observedEndLocations.Count - 1]; + + inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1); + + break; + } + + var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject); + observedEndLocations.Add(token); + + if (read > length) + { + break; + } + } + } + else + { + // We were reading 'end' but then we had a character mismatch. + // Reset all the counters. + + endStreamPosition = 0; + endObjPosition = 0; + commonPartPosition = 0; + } + } + else + { + // For safety reset every counter in case we had a partial read. + + endStreamPosition = 0; + endObjPosition = 0; + commonPartPosition = 0; + } + + previousByte = inputBytes.CurrentByte; + binaryWrite.Write(inputBytes.CurrentByte); + + read++; + } + + binaryWrite.Flush(); + + if (observedEndLocations.Count == 0) + { + return false; + } + + memoryStream.Seek(0, SeekOrigin.Begin); + if (length.HasValue && memoryStream.Length >= length) + { + // Use the declared length to copy just the data we want. + byte[] data = new byte[length.Value]; + + memoryStream.Read(data, 0, (int)length.Value); + + stream = new StreamToken(streamDictionaryToken, data); + } + else + { + // Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that. + var lastEnd = observedEndLocations[observedEndLocations.Count - 1]; + + var dataLength = lastEnd.Offset - startDataOffset; + + var current = inputBytes.CurrentOffset; + + // 3 characters, 'e', '\n' and possibly '\r' + inputBytes.Seek(lastEnd.Offset - 3); + inputBytes.MoveNext(); + + if (inputBytes.CurrentByte == '\r') + { + dataLength -= 3; + } + else + { + dataLength -= 2; + } + + inputBytes.Seek(current); + + byte[] data = new byte[dataLength]; + + memoryStream.Read(data, 0, (int)dataLength); + + stream = new StreamToken(streamDictionaryToken, data); + } + } + + return true; + } + + private DictionaryToken GetStreamDictionary() + { + DictionaryToken streamDictionaryToken; + if (previousTokens[1] is DictionaryToken firstDictionary) + { + streamDictionaryToken = firstDictionary; + } + else if (previousTokens[0] is DictionaryToken secondDictionary) + { + streamDictionaryToken = secondDictionary; + } + else + { + throw new PdfDocumentFormatException("No dictionary token was found prior to the 'stream' operator. Previous tokens were:" + + $" {previousTokens[1]} and {previousTokens[0]}."); + } + + return streamDictionaryToken; + } + + private long? GetStreamLength(DictionaryToken dictionary) + { + if (!dictionary.Data.TryGetValue("Length", out var lengthValue)) + { + return null; + } + + long? length = default(long?); + + // Can either be number in the stream dictionary. + if (lengthValue is NumericToken numeric) + { + return numeric.Long; + } + + long currentOffset = inputBytes.CurrentOffset; + + // Or a reference to another numeric object. + if (lengthValue is IndirectReferenceToken lengthReference) + { + // We can only find it if we know where it is. + if (objectLocationProvider.TryGetOffset(lengthReference.Data, out var offset)) + { + // Move to the length object and read it. + Seek(offset); + + // Keep a copy of the read tokens here since this list must be empty prior to move next. + var oldData = new List(readTokens); + readTokens.Clear(); + if (MoveNext() && ((ObjectToken)CurrentToken).Data is NumericToken lengthToken) + { + length = lengthToken.Long; + } + readTokens.AddRange(oldData); + + // Move back to where we started. + Seek(currentOffset); + } + else + { + // warn, we had a reference to a length object but didn't find it... + } + } + + return length; + } + + private static bool ReadStreamTokenStart(IInputBytes input, long tokenStart) + { + input.Seek(tokenStart); + + for (var i = 0; i < OperatorToken.StartStream.Data.Length; i++) + { + if (!input.MoveNext() || input.CurrentByte != OperatorToken.StartStream.Data[i]) + { + input.Seek(tokenStart); + return false; + } + } return true; } diff --git a/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs b/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs new file mode 100644 index 00000000..5ca0262d --- /dev/null +++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PossibleStreamEndLocation.cs @@ -0,0 +1,35 @@ +namespace UglyToad.PdfPig.Tokenization.Scanner +{ + using System; + using Tokens; + + /// + /// Used internally by the when reading streams to store any occurrences of 'endobj' or 'endstream' observed. + /// + internal struct PossibleStreamEndLocation + { + /// + /// The offset at which the token started in the file. + /// + public long Offset { get; } + + /// + /// The type, one of either or . + /// + public OperatorToken Type { get; } + + /// + /// Create a new + /// + public PossibleStreamEndLocation(long offset, OperatorToken type) + { + Offset = offset; + Type = type ?? throw new ArgumentNullException(nameof(type)); + } + + public override string ToString() + { + return $"{Offset}: {Type}"; + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Tokenization/Tokens/DictionaryToken.cs b/src/UglyToad.PdfPig/Tokenization/Tokens/DictionaryToken.cs index c9161703..d5892730 100644 --- a/src/UglyToad.PdfPig/Tokenization/Tokens/DictionaryToken.cs +++ b/src/UglyToad.PdfPig/Tokenization/Tokens/DictionaryToken.cs @@ -6,14 +6,34 @@ using Cos; using Util.JetBrains.Annotations; - internal class DictionaryToken : IDataToken> + internal class DictionaryToken : IDataToken> { [NotNull] - public IReadOnlyDictionary Data { get; } + public IReadOnlyDictionary Data { get; } public DictionaryToken([NotNull]IReadOnlyDictionary data) { - Data = data ?? throw new ArgumentNullException(nameof(data)); + if (data == null) + { + throw new ArgumentNullException(nameof(data)); + } + + var result = new Dictionary(data.Count); + + foreach (var keyValuePair in data) + { + if (keyValuePair.Key is NameToken name) + { + result[name.Data.Name] = keyValuePair.Value; + } + else + { + // For now: + throw new InvalidOperationException("Key for dictionary token was not a string! " + keyValuePair.Key); + } + } + + Data = result; } public bool TryGetByName(CosName name, out IToken token) @@ -23,19 +43,7 @@ throw new ArgumentNullException(nameof(name)); } - token = null; - - foreach (var keyValuePair in Data) - { - if (keyValuePair.Key is NameToken nameToken && nameToken.Data.Equals(name)) - { - token = keyValuePair.Value; - - return true; - } - } - - return false; + return Data.TryGetValue(name.Name, out token); } public override string ToString() diff --git a/src/UglyToad.PdfPig/Tokenization/Tokens/StreamToken.cs b/src/UglyToad.PdfPig/Tokenization/Tokens/StreamToken.cs new file mode 100644 index 00000000..af02a311 --- /dev/null +++ b/src/UglyToad.PdfPig/Tokenization/Tokens/StreamToken.cs @@ -0,0 +1,15 @@ +namespace UglyToad.PdfPig.Tokenization.Tokens +{ + internal class StreamToken : IDataToken + { + public DictionaryToken StreamDictionary { get; } + + public byte[] Data { get; } + + public StreamToken(DictionaryToken streamDictionary, byte[] data) + { + StreamDictionary = streamDictionary; + Data = data; + } + } +}