Make the PDF object scanner work with streams

This commit is contained in:
Eliot Jones
2018-01-14 10:53:01 +00:00
parent 106cd5f187
commit b19b96604d
9 changed files with 662 additions and 68 deletions

View File

@@ -39,7 +39,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization
var dictionary = AssertDictionaryToken(token); var dictionary = AssertDictionaryToken(token);
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(dictionary, 0, CosName.NAME, "Barry Scott"); AssertDictionaryEntry<StringToken, string>(dictionary, CosName.NAME, "Barry Scott");
} }
[Fact] [Fact]
@@ -53,7 +53,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization
var dictionary = AssertDictionaryToken(token); var dictionary = AssertDictionaryToken(token);
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.TYPE, AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE,
CosName.Create("Example")); CosName.Create("Example"));
} }
@@ -68,9 +68,9 @@ namespace UglyToad.PdfPig.Tests.Tokenization
var dictionary = AssertDictionaryToken(token); var dictionary = AssertDictionaryToken(token);
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.FILTER, CosName.FLATE_DECODE); AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.FILTER, CosName.FLATE_DECODE);
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 1, CosName.S, 36); AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.S, 36);
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 2, CosName.LENGTH, 53); AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.LENGTH, 53);
} }
[Fact] [Fact]
@@ -86,8 +86,8 @@ namespace UglyToad.PdfPig.Tests.Tokenization
var reference = new IndirectReference(14, 0); var reference = new IndirectReference(14, 0);
AssertDictionaryEntry<NameToken, CosName, IndirectReferenceToken, IndirectReference>(dictionary, 0, CosName.PAGES, reference); AssertDictionaryEntry<IndirectReferenceToken, IndirectReference>(dictionary, CosName.PAGES, reference);
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.TYPE, CosName.CATALOG); AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.CATALOG);
} }
[Fact] [Fact]
@@ -114,22 +114,22 @@ namespace UglyToad.PdfPig.Tests.Tokenization
var dictionary = AssertDictionaryToken(token); var dictionary = AssertDictionaryToken(token);
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.TYPE, CosName.Create("Example")); AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.Create("Example"));
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.SUBTYPE, CosName.Create("DictionaryExample")); AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.SUBTYPE, CosName.Create("DictionaryExample"));
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 2, CosName.VERSION, 0.01m); AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.VERSION, 0.01m);
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 3, CosName.Create("IntegerItem"), 12m); AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.Create("IntegerItem"), 12m);
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(dictionary, 4, CosName.Create("StringItem"), "a string"); AssertDictionaryEntry<StringToken, string>(dictionary, CosName.Create("StringItem"), "a string");
var subDictionary = GetIndex(5, dictionary); var subDictionary = GetIndex(5, dictionary);
Assert.Equal(CosName.Create("Subdictionary"), Assert.IsType<NameToken>(subDictionary.Key).Data); Assert.Equal("Subdictionary", subDictionary.Key);
var subDictionaryValue = Assert.IsType<DictionaryToken>(subDictionary.Value); var subDictionaryValue = Assert.IsType<DictionaryToken>(subDictionary.Value);
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(subDictionaryValue, 0, CosName.Create("Item1"), 0.4m); AssertDictionaryEntry<NumericToken, decimal>(subDictionaryValue, CosName.Create("Item1"), 0.4m);
AssertDictionaryEntry<NameToken, CosName, BooleanToken, bool>(subDictionaryValue, 1, CosName.Create("Item2"), true); AssertDictionaryEntry<BooleanToken, bool>(subDictionaryValue, CosName.Create("Item2"), true);
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionaryValue, 2, CosName.Create("LastItem"), "not!"); AssertDictionaryEntry<StringToken, string>(subDictionaryValue, CosName.Create("LastItem"), "not!");
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionaryValue, 3, CosName.Create("VeryLastItem"), "OK"); AssertDictionaryEntry<StringToken, string>(subDictionaryValue, CosName.Create("VeryLastItem"), "OK");
} }
[Fact] [Fact]
@@ -147,8 +147,8 @@ endobj
var reference = new IndirectReference(69, 0); var reference = new IndirectReference(69, 0);
AssertDictionaryEntry<NameToken, CosName, IndirectReferenceToken, IndirectReference>(dictionary, 0, CosName.PAGES, reference); AssertDictionaryEntry<IndirectReferenceToken, IndirectReference>(dictionary, CosName.PAGES, reference);
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.TYPE, CosName.CATALOG); AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.CATALOG);
Assert.Equal(2, dictionary.Data.Count); Assert.Equal(2, dictionary.Data.Count);
} }
@@ -164,37 +164,32 @@ endobj
var dictionary = AssertDictionaryToken(token); var dictionary = AssertDictionaryToken(token);
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 0, CosName.COUNT, 12); AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.COUNT, 12);
var subDictionaryToken = GetIndex(1, dictionary); var subDictionaryToken = GetIndex(1, dictionary);
Assert.Equal(CosName.Create("Definition"), Assert.IsType<NameToken>(subDictionaryToken.Key).Data); Assert.Equal("Definition", subDictionaryToken.Key);
var subDictionary = Assert.IsType<DictionaryToken>(subDictionaryToken.Value); var subDictionary = Assert.IsType<DictionaryToken>(subDictionaryToken.Value);
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionary, 0, CosName.NAME, "Glorp"); AssertDictionaryEntry<StringToken, string>(subDictionary, CosName.NAME, "Glorp");
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 2, CosName.TYPE, CosName.CATALOG); AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.CATALOG);
Assert.Equal(3, dictionary.Data.Count); Assert.Equal(3, dictionary.Data.Count);
} }
private static void AssertDictionaryEntry<TKey, TKeyData, TValue, TValueData>( private static void AssertDictionaryEntry<TValue, TValueData>(DictionaryToken dictionary, CosName key,
DictionaryToken dictionary, int index, TKeyData key, TValueData value) where TValue : IDataToken<TValueData>
TValueData value) where TKey : IDataToken<TKeyData> where TValue : IDataToken<TValueData>
{ {
KeyValuePair<IToken, IToken> data = GetIndex(index, dictionary); var result = dictionary.Data[key.Name];
var keyToken = Assert.IsType<TKey>(data.Key); var valueToken = Assert.IsType<TValue>(result);
Assert.Equal(key, keyToken.Data);
var valueToken = Assert.IsType<TValue>(data.Value);
Assert.Equal(value, valueToken.Data); Assert.Equal(value, valueToken.Data);
} }
private static KeyValuePair<IToken, IToken> GetIndex(int index, DictionaryToken dictionary) private static KeyValuePair<string, IToken> GetIndex(int index, DictionaryToken dictionary)
{ {
int i = 0; int i = 0;
foreach (var pair in dictionary.Data) foreach (var pair in dictionary.Data)

View File

@@ -2,17 +2,14 @@
{ {
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Text;
using PdfPig.ContentStream; using PdfPig.ContentStream;
using PdfPig.Cos;
using PdfPig.Tokenization.Scanner; using PdfPig.Tokenization.Scanner;
using PdfPig.Tokenization.Tokens; using PdfPig.Tokenization.Tokens;
using Xunit; using Xunit;
public class PdfTokenScannerTests public class PdfTokenScannerTests
{ {
private readonly CrossReferenceTable table = new CrossReferenceTable(CrossReferenceType.Table, new Dictionary<CosObjectKey, long>(),
new PdfDictionary());
[Fact] [Fact]
public void ReadsSimpleObject() public void ReadsSimpleObject()
{ {
@@ -34,6 +31,23 @@
Assert.Equal("WDKAAR+CMBX12", name.Data.Name); Assert.Equal("WDKAAR+CMBX12", name.Data.Name);
Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position)); Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position));
}
[Fact]
public void ReadsIndirectReferenceInObject()
{
const string s = @"
15 0 obj
12 7 R
endobj";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
var reference = Assert.IsType<IndirectReferenceToken>(token.Data);
Assert.Equal(new IndirectReference(12, 7), reference.Data);
} }
[Fact] [Fact]
@@ -141,11 +155,131 @@ endobj
Assert.Equal(274, tokens[0].Number.ObjectNumber); Assert.Equal(274, tokens[0].Number.ObjectNumber);
Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position)); Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position));
var nameObject = Assert.IsType<NameToken>(tokens[1].Data); var nameObject = Assert.IsType<NameToken>(tokens[1].Data);
Assert.Equal("WPXNWT+CMR9", nameObject.Data.Name);
Assert.Equal(310, tokens[1].Number.ObjectNumber);
Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
dictionary = Assert.IsType<DictionaryToken>(tokens[2].Data);
Assert.Equal(7, dictionary.Data.Count);
Assert.Equal(311, tokens[2].Number.ObjectNumber);
Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position));
}
[Fact]
public void ReadsStringObject()
{
const string s = @"
58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
Assert.Equal(58949797283757L, token.Number.ObjectNumber);
Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType<StringToken>(token.Data).Data);
Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position));
}
[Fact]
public void ReadsStreamObject()
{
const string s = @"
352 0 obj
<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
stream
H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´
É² ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs  Ô;€
À»—ÀF`ÇF@ƒ 4 ˜ï @¥T¨³fY: žµ;Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒß
¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñ­ím·°gêêb,/,£P§õ^ ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ
A¡¬àð‰É©ˆ°¼×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽú;x#dÃÄ$m
+)
)†…±n
9ùyŽA·n\ï»t!=3£½¡:®­µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²|ê«êªDµ5q°šR¦RÈ£ n¾[è~“}ýƒÝ½SꞦ'æQŽ
óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЍ«#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c©.Œòµ9zz0ѲB¢«#š-3ªà<cš¥¡È¨qµ¦{pìÛ„Ã‡ŽŠ/íO»|áIclSCuo_Oœ\\ï!ª©«­ªƒTþ5әܔóî_9|ýÍ7ø!Ñý|2Goÿ€Î¶Öö…<ðáƒGéGá½G´Ã.®TŠóî=_|þ™‡ƒééFwßà 0æîc_Ó릳|ý|¶®æ„…†G8Òüï€l…\¦RFº:‰ VP𕐝S“Û¶ï V—ø/¿¾Xæ+«««ÖŽ4>ŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£­ivvv…k2=µZMØ|Úl(ŠZ­VÍbI>Ÿl¹œ(â±Äb­ø”Uª ñeü©U*“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0Æôz“‰ NÊ,¬‚kÀ°FXÛ4&“ÉfÃñÅæûæy=ÆãIðE _¾Èårår/XÞ„/·qòìÖ|†óx8Wð¹hºÜÂÕalÎü˜Ã0^Òòòü¼yÞ¶´´DX
ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„ ”)‰'3¤œ\™ËN™ÿe^Ё² y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@ €Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤ hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SЍøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5²ìɐ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿iØÏºþÇoäO ôkÆ)
endstream
endobj
353 0 obj
1479
endobj";
var locationProvider = new TestObjectLocationProvider();
// Mark location of "353 0 obj"
locationProvider.Offsets[new IndirectReference(353, 0)] = 1643;
var scanner = GetScanner(s, locationProvider);
var tokens = ReadToEnd(scanner);
Assert.Equal(2, tokens.Count);
var stream = Assert.IsType<StreamToken>(tokens[0].Data);
var str = Encoding.UTF8.GetString(stream.Data);
Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str);
Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
}
[Fact]
public void ReadsSimpleStreamObject()
{
// Length of the bytes as found by Encoding.UTF8.GetBytes is 45
const string s = @"
574387 0 obj
<< /Length 45 >>
stream
À“Éððr¥8»P£ØêÁi½®Û(éhŽú
endstream
endobj";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
var stream = Assert.IsType<StreamToken>(token.Data);
Assert.Equal(45, stream.Data.Length);
var outputString = Encoding.UTF8.GetString(stream.Data);
Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽú", outputString);
}
[Fact]
public void ReadsStreamWithIndirectLength()
{
const string s = @"5 0 obj 52 endobj
12 0 obj
<< /Length 5 0 R /S 1245 >>
stream
%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾~´¼
endstream
endobj";
var locationProvider = new TestObjectLocationProvider();
locationProvider.Offsets[new IndirectReference(5, 0)] = 0;
var scanner = GetScanner(s, locationProvider);
var token = ReadToEnd(scanner)[1];
var stream = Assert.IsType<StreamToken>(token.Data); var stream = Assert.IsType<StreamToken>(token.Data);
Assert.Equal(52, stream.Data.Length); Assert.Equal(52, stream.Data.Length);
Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
var outputString = Encoding.UTF8.GetString(stream.Data); var outputString = Encoding.UTF8.GetString(stream.Data);
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾~´¼", outputString); Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾~´¼", outputString);

View File

@@ -0,0 +1,21 @@
namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
{
    using System.Collections.Generic;
    using PdfPig.ContentStream;
    using PdfPig.Tokenization.Scanner;

    /// <summary>
    /// An in-memory <see cref="IObjectLocationProvider"/> test double. Tests seed
    /// <see cref="Offsets"/> up front and inspect it afterwards to verify which
    /// object offsets the scanner recorded.
    /// </summary>
    internal class TestObjectLocationProvider : IObjectLocationProvider
    {
        /// <summary>
        /// The backing map from indirect reference to byte offset; exposed so tests can seed and inspect it directly.
        /// </summary>
        public Dictionary<IndirectReference, long> Offsets { get; } = new Dictionary<IndirectReference, long>();

        /// <summary>
        /// Looks the reference up in <see cref="Offsets"/>.
        /// </summary>
        public bool TryGetOffset(IndirectReference reference, out long offset) => Offsets.TryGetValue(reference, out offset);

        /// <summary>
        /// Adds or overwrites the offset recorded for the reference.
        /// </summary>
        public void UpdateOffset(IndirectReference reference, long offset) => Offsets[reference] = offset;
    }
}

View File

@@ -1,15 +1,57 @@
namespace UglyToad.PdfPig.ContentStream namespace UglyToad.PdfPig.ContentStream
{ {
/// <summary>
/// Used to uniquely identify and refer to objects in the PDF file.
/// </summary>
internal struct IndirectReference internal struct IndirectReference
{ {
/// <summary>
/// A positive integer object number.
/// </summary>
public long ObjectNumber { get; } public long ObjectNumber { get; }
/// <summary>
/// A non-negative integer generation number which starts as 0 and increases if the file is updated incrementally.
/// </summary>
public int Generation { get; } public int Generation { get; }
/// <summary>
/// Create a new <see cref="IndirectReference"/>
/// </summary>
/// <param name="objectNumber">The object number.</param>
/// <param name="generation">The generation number.</param>
public IndirectReference(long objectNumber, int generation) public IndirectReference(long objectNumber, int generation)
{ {
ObjectNumber = objectNumber; ObjectNumber = objectNumber;
Generation = generation; Generation = generation;
} }
public override bool Equals(object obj)
{
if (obj is IndirectReference reference)
{
return reference.ObjectNumber == ObjectNumber
&& reference.Generation == Generation;
}
return false;
}
public override int GetHashCode()
{
unchecked
{
int hash = 59;
hash = hash * 97 + ObjectNumber.GetHashCode();
hash = hash * 97 + Generation.GetHashCode();
return hash;
}
}
public override string ToString()
{
return $"{ObjectNumber} {Generation}";
}
} }
} }

View File

@@ -0,0 +1,11 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
using ContentStream;
/// <summary>
/// Maps indirect object references to the byte offsets at which those objects
/// are located in the underlying input. Implementations may be pre-seeded (e.g.
/// from a cross-reference table) and are updated as objects are discovered.
/// </summary>
internal interface IObjectLocationProvider
{
/// <summary>
/// Try to get the byte offset for the object identified by <paramref name="reference"/>.
/// Returns false when the location is not known.
/// </summary>
bool TryGetOffset(IndirectReference reference, out long offset);
/// <summary>
/// Record (or overwrite) the byte offset at which the object identified by
/// <paramref name="reference"/> was found.
/// </summary>
void UpdateOffset(IndirectReference reference, long offset);
}
}

View File

@@ -1,37 +1,44 @@
namespace UglyToad.PdfPig.Tokenization.Scanner namespace UglyToad.PdfPig.Tokenization.Scanner
{ {
using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using ContentStream; using ContentStream;
using Cos;
using Exceptions; using Exceptions;
using IO; using IO;
using Parser.Parts;
using Tokens; using Tokens;
internal class PdfTokenScanner : ISeekableTokenScanner internal class PdfTokenScanner : ISeekableTokenScanner
{ {
private readonly IInputBytes inputBytes; private readonly IInputBytes inputBytes;
private readonly CrossReferenceTable crossReferenceTable; private readonly IObjectLocationProvider objectLocationProvider;
private readonly CoreTokenScanner coreTokenScanner; private readonly CoreTokenScanner coreTokenScanner;
/// <summary>
/// Stores tokens encountered between obj - endobj markers for each <see cref="MoveNext"/> call.
/// Cleared after each operation.
/// </summary>
private readonly List<IToken> readTokens = new List<IToken>();
// Store the previous 2 tokens and their positions so we can backtrack to find object numbers and stream dictionaries.
private readonly long[] previousTokenPositions = new long[2]; private readonly long[] previousTokenPositions = new long[2];
private readonly IToken[] previousTokens = new IToken[2]; private readonly IToken[] previousTokens = new IToken[2];
private readonly Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
public IToken CurrentToken { get; private set; } public IToken CurrentToken { get; private set; }
public long CurrentPosition => coreTokenScanner.CurrentPosition; public long CurrentPosition => coreTokenScanner.CurrentPosition;
public PdfTokenScanner(IInputBytes inputBytes, CrossReferenceTable crossReferenceTable) public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider)
{ {
this.inputBytes = inputBytes; this.inputBytes = inputBytes;
this.crossReferenceTable = crossReferenceTable; this.objectLocationProvider = objectLocationProvider;
coreTokenScanner = new CoreTokenScanner(inputBytes); coreTokenScanner = new CoreTokenScanner(inputBytes);
} }
public bool MoveNext() public bool MoveNext()
{ {
// Read until we find object-number generation obj, e.g. "69 420 obj".
int tokensRead = 0; int tokensRead = 0;
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.StartObject) while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.StartObject)
{ {
@@ -49,6 +56,7 @@
previousTokenPositions[1] = coreTokenScanner.CurrentTokenStart; previousTokenPositions[1] = coreTokenScanner.CurrentTokenStart;
} }
// We only read partial tokens.
if (tokensRead < 2) if (tokensRead < 2)
{ {
return false; return false;
@@ -64,8 +72,7 @@
$"Instead got: {previousTokens[0]} {previousTokens[1]} obj"); $"Instead got: {previousTokens[0]} {previousTokens[1]} obj");
} }
var data = new List<IToken>(); // Read all tokens between obj and endobj.
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.EndObject) while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.EndObject)
{ {
if (coreTokenScanner.CurrentToken is CommentToken) if (coreTokenScanner.CurrentToken is CommentToken)
@@ -73,12 +80,26 @@
continue; continue;
} }
if (coreTokenScanner.CurrentToken == OperatorToken.StartStream) if (coreTokenScanner.CurrentToken == OperatorToken.StartObject)
{ {
// Read stream. // This should never happen.
Debug.Assert(false, "Encountered a start object 'obj' operator before the end of the previous object.");
return false;
} }
data.Add(coreTokenScanner.CurrentToken); if (coreTokenScanner.CurrentToken == OperatorToken.StartStream)
{
// Read stream: special case.
if (TryReadStream(coreTokenScanner.CurrentTokenStart, out var stream))
{
readTokens.Clear();
readTokens.Add(stream);
}
}
else
{
readTokens.Add(coreTokenScanner.CurrentToken);
}
previousTokens[0] = previousTokens[1]; previousTokens[0] = previousTokens[1];
previousTokenPositions[0] = previousTokenPositions[1]; previousTokenPositions[0] = previousTokenPositions[1];
@@ -89,10 +110,322 @@
if (coreTokenScanner.CurrentToken != OperatorToken.EndObject) if (coreTokenScanner.CurrentToken != OperatorToken.EndObject)
{ {
readTokens.Clear();
return false; return false;
} }
CurrentToken = new ObjectToken(startPosition, new IndirectReference(objectNumber.Long, generation.Int), data[data.Count - 1]); var reference = new IndirectReference(objectNumber.Long, generation.Int);
IToken token;
if (readTokens.Count == 3 && readTokens[0] is NumericToken objNum
&& readTokens[1] is NumericToken genNum
&& readTokens[2] == OperatorToken.R)
{
// I have no idea if this can ever happen.
token = new IndirectReferenceToken(new IndirectReference(objNum.Long, genNum.Int));
}
else
{
// Just take the last, should only ever be 1
Debug.Assert(readTokens.Count == 1, "Found more than 1 token in an object.");
token = readTokens[readTokens.Count - 1];
}
CurrentToken = new ObjectToken(startPosition, reference, token);
objectLocationProvider.UpdateOffset(reference, startPosition);
readTokens.Clear();
return true;
}
/// <summary>
/// Reads the raw bytes of a PDF stream following a 'stream' keyword, using the declared
/// /Length from the preceding dictionary when available and otherwise scanning for the
/// 'endstream'/'endobj' keywords to locate the end of the data.
/// </summary>
/// <param name="startStreamTokenOffset">The byte offset at which the 'stream' keyword starts.</param>
/// <param name="stream">The resulting stream token, or null when reading failed.</param>
/// <returns>True if a stream was successfully read, false otherwise.</returns>
private bool TryReadStream(long startStreamTokenOffset, out StreamToken stream)
{
stream = null;
DictionaryToken streamDictionaryToken = GetStreamDictionary();
// Get the expected length from the stream dictionary if present.
long? length = GetStreamLength(streamDictionaryToken);
// Verify again that we start with "stream"
var hasStartStreamToken = ReadStreamTokenStart(inputBytes, startStreamTokenOffset);
if (!hasStartStreamToken)
{
return false;
}
// From the specification: The stream operator should be followed by \r\n or \n, not just \r.
if (!inputBytes.MoveNext())
{
return false;
}
if (inputBytes.CurrentByte == '\r')
{
inputBytes.MoveNext();
}
if (inputBytes.CurrentByte != '\n')
{
return false;
}
// Store where we started reading the first byte of data.
long startDataOffset = inputBytes.CurrentOffset;
// Store how many bytes we have read for checking against Length.
long read = 0;
// We want to check if we ever read 'endobj' or 'endstream'.
// These positions track how far into each candidate keyword we have matched so far.
int endObjPosition = 0;
int endStreamPosition = 0;
int commonPartPosition = 0;
const string commonPart = "end";
const string streamPart = "stream";
const string objPart = "obj";
// Track any 'endobj' or 'endstream' operators we see.
var observedEndLocations = new List<PossibleStreamEndLocation>();
// Keep track of the previous byte.
byte previousByte = 0;
// Begin reading the stream.
using (var memoryStream = new MemoryStream())
using (var binaryWrite = new BinaryWriter(memoryStream))
{
while (inputBytes.MoveNext())
{
// NOTE(review): this length check is currently a no-op (the break is commented out),
// so the loop always scans for end keywords even when /Length is satisfied.
if (length.HasValue && read == length)
{
// TODO: read ahead and check we're at the end...
// break;
}
// We are reading 'end' (possibly).
if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
{
// 'end' must be preceded by whitespace to count as a keyword start.
if (commonPartPosition == 0 && !ReadHelper.IsWhitespace(previousByte))
{
// We've just encountered a normal 'e' in the stream.
}
else
{
commonPartPosition++;
}
}
else if (commonPartPosition == commonPart.Length)
{
// We are reading 'stream' after 'end'
if (inputBytes.CurrentByte == streamPart[endStreamPosition])
{
endObjPosition = 0;
endStreamPosition++;
// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
{
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
observedEndLocations.Add(token);
// Only stop early once we have at least the declared number of bytes.
if (length.HasValue && read > length)
{
break;
}
endStreamPosition = 0;
}
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'
endStreamPosition = 0;
endObjPosition++;
// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
{
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (observedEndLocations.Count > 0)
{
var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];
inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
break;
}
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
observedEndLocations.Add(token);
// NOTE(review): lifted comparison — when length is null this is false, so we keep scanning.
if (read > length)
{
break;
}
}
}
else
{
// We were reading 'end' but then we had a character mismatch.
// Reset all the counters.
endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = 0;
}
}
else
{
// For safety reset every counter in case we had a partial read.
endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = 0;
}
previousByte = inputBytes.CurrentByte;
binaryWrite.Write(inputBytes.CurrentByte);
read++;
}
binaryWrite.Flush();
// Without at least one observed 'endstream'/'endobj' we cannot delimit the data.
if (observedEndLocations.Count == 0)
{
return false;
}
memoryStream.Seek(0, SeekOrigin.Begin);
if (length.HasValue && memoryStream.Length >= length)
{
// Use the declared length to copy just the data we want.
byte[] data = new byte[length.Value];
memoryStream.Read(data, 0, (int)length.Value);
stream = new StreamToken(streamDictionaryToken, data);
}
else
{
// Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
var lastEnd = observedEndLocations[observedEndLocations.Count - 1];
var dataLength = lastEnd.Offset - startDataOffset;
var current = inputBytes.CurrentOffset;
// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Offset - 3);
inputBytes.MoveNext();
if (inputBytes.CurrentByte == '\r')
{
dataLength -= 3;
}
else
{
dataLength -= 2;
}
// Restore the scanner's position after peeking backwards.
inputBytes.Seek(current);
byte[] data = new byte[dataLength];
memoryStream.Read(data, 0, (int)dataLength);
stream = new StreamToken(streamDictionaryToken, data);
}
}
return true;
}
/// <summary>
/// Retrieves the stream dictionary from the tokens read immediately before the
/// 'stream' keyword, preferring the most recently read token.
/// </summary>
/// <exception cref="PdfDocumentFormatException">Thrown when neither preceding token is a dictionary.</exception>
private DictionaryToken GetStreamDictionary()
{
    if (previousTokens[1] is DictionaryToken mostRecent)
    {
        return mostRecent;
    }

    if (previousTokens[0] is DictionaryToken earlier)
    {
        return earlier;
    }

    throw new PdfDocumentFormatException("No dictionary token was found prior to the 'stream' operator. Previous tokens were:" +
        $" {previousTokens[1]} and {previousTokens[0]}.");
}
/// <summary>
/// Resolves the /Length entry of a stream dictionary. The value may be a direct
/// numeric token or an indirect reference to a numeric object elsewhere in the file;
/// in the latter case this method seeks to, reads, and returns from that object.
/// </summary>
/// <param name="dictionary">The stream dictionary to read the /Length from.</param>
/// <returns>The declared stream length in bytes, or null when it is absent or cannot be resolved.</returns>
private long? GetStreamLength(DictionaryToken dictionary)
{
if (!dictionary.Data.TryGetValue("Length", out var lengthValue))
{
return null;
}
long? length = default(long?);
// Can either be number in the stream dictionary.
if (lengthValue is NumericToken numeric)
{
return numeric.Long;
}
// Remember where we are so we can return after resolving the reference.
long currentOffset = inputBytes.CurrentOffset;
// Or a reference to another numeric object.
if (lengthValue is IndirectReferenceToken lengthReference)
{
// We can only find it if we know where it is.
if (objectLocationProvider.TryGetOffset(lengthReference.Data, out var offset))
{
// Move to the length object and read it.
Seek(offset);
// Keep a copy of the read tokens here since this list must be empty prior to move next.
var oldData = new List<IToken>(readTokens);
readTokens.Clear();
// Re-entrant call: MoveNext parses the referenced object in place.
if (MoveNext() && ((ObjectToken)CurrentToken).Data is NumericToken lengthToken)
{
length = lengthToken.Long;
}
readTokens.AddRange(oldData);
// Move back to where we started.
Seek(currentOffset);
}
else
{
// warn, we had a reference to a length object but didn't find it...
}
}
return length;
}
private static bool ReadStreamTokenStart(IInputBytes input, long tokenStart)
{
input.Seek(tokenStart);
for (var i = 0; i < OperatorToken.StartStream.Data.Length; i++)
{
if (!input.MoveNext() || input.CurrentByte != OperatorToken.StartStream.Data[i])
{
input.Seek(tokenStart);
return false;
}
}
return true; return true;
} }

View File

@@ -0,0 +1,35 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
    using System;
    using Tokens;

    /// <summary>
    /// Used internally by the <see cref="PdfTokenScanner"/> when reading streams to store any occurrences of 'endobj' or 'endstream' observed.
    /// </summary>
    internal struct PossibleStreamEndLocation
    {
        /// <summary>
        /// The offset at which the token started in the file.
        /// </summary>
        public long Offset { get; }

        /// <summary>
        /// The type, one of either <see cref="OperatorToken.EndObject"/> or <see cref="OperatorToken.EndStream"/>.
        /// </summary>
        public OperatorToken Type { get; }

        /// <summary>
        /// Create a new <see cref="PossibleStreamEndLocation"/>
        /// </summary>
        /// <param name="offset">The file offset at which the token began.</param>
        /// <param name="type">The end token type; must not be null.</param>
        public PossibleStreamEndLocation(long offset, OperatorToken type)
        {
            if (type == null)
            {
                throw new ArgumentNullException(nameof(type));
            }

            Offset = offset;
            Type = type;
        }

        /// <summary>
        /// The offset followed by the token type, e.g. "1234: endstream".
        /// </summary>
        public override string ToString() => $"{Offset}: {Type}";
    }
}

View File

@@ -6,14 +6,34 @@
using Cos; using Cos;
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;
internal class DictionaryToken : IDataToken<IReadOnlyDictionary<IToken, IToken>> internal class DictionaryToken : IDataToken<IReadOnlyDictionary<string, IToken>>
{ {
[NotNull] [NotNull]
public IReadOnlyDictionary<IToken, IToken> Data { get; } public IReadOnlyDictionary<string, IToken> Data { get; }
public DictionaryToken([NotNull]IReadOnlyDictionary<IToken, IToken> data) public DictionaryToken([NotNull]IReadOnlyDictionary<IToken, IToken> data)
{ {
Data = data ?? throw new ArgumentNullException(nameof(data)); if (data == null)
{
throw new ArgumentNullException(nameof(data));
}
var result = new Dictionary<string, IToken>(data.Count);
foreach (var keyValuePair in data)
{
if (keyValuePair.Key is NameToken name)
{
result[name.Data.Name] = keyValuePair.Value;
}
else
{
// For now:
throw new InvalidOperationException("Key for dictionary token was not a string! " + keyValuePair.Key);
}
}
Data = result;
} }
public bool TryGetByName(CosName name, out IToken token) public bool TryGetByName(CosName name, out IToken token)
@@ -23,19 +43,7 @@
throw new ArgumentNullException(nameof(name)); throw new ArgumentNullException(nameof(name));
} }
token = null; return Data.TryGetValue(name.Name, out token);
foreach (var keyValuePair in Data)
{
if (keyValuePair.Key is NameToken nameToken && nameToken.Data.Equals(name))
{
token = keyValuePair.Value;
return true;
}
}
return false;
} }
public override string ToString() public override string ToString()

View File

@@ -0,0 +1,15 @@
namespace UglyToad.PdfPig.Tokenization.Tokens
{
    using System;

    /// <summary>
    /// A token representing a PDF stream: the stream's dictionary together with the raw
    /// bytes read from the file between the 'stream' and 'endstream' keywords.
    /// </summary>
    internal class StreamToken : IDataToken<byte[]>
    {
        /// <summary>
        /// The dictionary that preceded the 'stream' keyword, e.g. declaring /Length or /Filter.
        /// </summary>
        public DictionaryToken StreamDictionary { get; }

        /// <summary>
        /// The raw bytes of the stream data as read from the file.
        /// </summary>
        public byte[] Data { get; }

        /// <summary>
        /// Create a new <see cref="StreamToken"/>.
        /// </summary>
        /// <param name="streamDictionary">The dictionary associated with this stream.</param>
        /// <param name="data">The raw stream bytes.</param>
        /// <exception cref="ArgumentNullException">Thrown when either argument is null.</exception>
        public StreamToken(DictionaryToken streamDictionary, byte[] data)
        {
            // Guard against nulls for consistency with the other token types in this
            // project (e.g. DictionaryToken), which throw ArgumentNullException.
            StreamDictionary = streamDictionary ?? throw new ArgumentNullException(nameof(streamDictionary));
            Data = data ?? throw new ArgumentNullException(nameof(data));
        }
    }
}