mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-20 11:37:57 +08:00
make the pdf object scanner work with streams
This commit is contained in:
@@ -39,7 +39,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization
|
||||
|
||||
var dictionary = AssertDictionaryToken(token);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(dictionary, 0, CosName.NAME, "Barry Scott");
|
||||
AssertDictionaryEntry<StringToken, string>(dictionary, CosName.NAME, "Barry Scott");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -53,7 +53,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization
|
||||
|
||||
var dictionary = AssertDictionaryToken(token);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.TYPE,
|
||||
AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE,
|
||||
CosName.Create("Example"));
|
||||
}
|
||||
|
||||
@@ -68,9 +68,9 @@ namespace UglyToad.PdfPig.Tests.Tokenization
|
||||
|
||||
var dictionary = AssertDictionaryToken(token);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.FILTER, CosName.FLATE_DECODE);
|
||||
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 1, CosName.S, 36);
|
||||
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 2, CosName.LENGTH, 53);
|
||||
AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.FILTER, CosName.FLATE_DECODE);
|
||||
AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.S, 36);
|
||||
AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.LENGTH, 53);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -86,8 +86,8 @@ namespace UglyToad.PdfPig.Tests.Tokenization
|
||||
|
||||
var reference = new IndirectReference(14, 0);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, IndirectReferenceToken, IndirectReference>(dictionary, 0, CosName.PAGES, reference);
|
||||
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.TYPE, CosName.CATALOG);
|
||||
AssertDictionaryEntry<IndirectReferenceToken, IndirectReference>(dictionary, CosName.PAGES, reference);
|
||||
AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.CATALOG);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -114,22 +114,22 @@ namespace UglyToad.PdfPig.Tests.Tokenization
|
||||
|
||||
var dictionary = AssertDictionaryToken(token);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 0, CosName.TYPE, CosName.Create("Example"));
|
||||
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.SUBTYPE, CosName.Create("DictionaryExample"));
|
||||
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 2, CosName.VERSION, 0.01m);
|
||||
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 3, CosName.Create("IntegerItem"), 12m);
|
||||
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(dictionary, 4, CosName.Create("StringItem"), "a string");
|
||||
AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.Create("Example"));
|
||||
AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.SUBTYPE, CosName.Create("DictionaryExample"));
|
||||
AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.VERSION, 0.01m);
|
||||
AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.Create("IntegerItem"), 12m);
|
||||
AssertDictionaryEntry<StringToken, string>(dictionary, CosName.Create("StringItem"), "a string");
|
||||
|
||||
var subDictionary = GetIndex(5, dictionary);
|
||||
|
||||
Assert.Equal(CosName.Create("Subdictionary"), Assert.IsType<NameToken>(subDictionary.Key).Data);
|
||||
Assert.Equal("Subdictionary", subDictionary.Key);
|
||||
|
||||
var subDictionaryValue = Assert.IsType<DictionaryToken>(subDictionary.Value);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(subDictionaryValue, 0, CosName.Create("Item1"), 0.4m);
|
||||
AssertDictionaryEntry<NameToken, CosName, BooleanToken, bool>(subDictionaryValue, 1, CosName.Create("Item2"), true);
|
||||
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionaryValue, 2, CosName.Create("LastItem"), "not!");
|
||||
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionaryValue, 3, CosName.Create("VeryLastItem"), "OK");
|
||||
AssertDictionaryEntry<NumericToken, decimal>(subDictionaryValue, CosName.Create("Item1"), 0.4m);
|
||||
AssertDictionaryEntry<BooleanToken, bool>(subDictionaryValue, CosName.Create("Item2"), true);
|
||||
AssertDictionaryEntry<StringToken, string>(subDictionaryValue, CosName.Create("LastItem"), "not!");
|
||||
AssertDictionaryEntry<StringToken, string>(subDictionaryValue, CosName.Create("VeryLastItem"), "OK");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -147,8 +147,8 @@ endobj
|
||||
|
||||
var reference = new IndirectReference(69, 0);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, IndirectReferenceToken, IndirectReference>(dictionary, 0, CosName.PAGES, reference);
|
||||
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 1, CosName.TYPE, CosName.CATALOG);
|
||||
AssertDictionaryEntry<IndirectReferenceToken, IndirectReference>(dictionary, CosName.PAGES, reference);
|
||||
AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.CATALOG);
|
||||
|
||||
Assert.Equal(2, dictionary.Data.Count);
|
||||
}
|
||||
@@ -164,37 +164,32 @@ endobj
|
||||
|
||||
var dictionary = AssertDictionaryToken(token);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, NumericToken, decimal>(dictionary, 0, CosName.COUNT, 12);
|
||||
AssertDictionaryEntry<NumericToken, decimal>(dictionary, CosName.COUNT, 12);
|
||||
|
||||
var subDictionaryToken = GetIndex(1, dictionary);
|
||||
|
||||
Assert.Equal(CosName.Create("Definition"), Assert.IsType<NameToken>(subDictionaryToken.Key).Data);
|
||||
Assert.Equal("Definition", subDictionaryToken.Key);
|
||||
|
||||
var subDictionary = Assert.IsType<DictionaryToken>(subDictionaryToken.Value);
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, StringToken, string>(subDictionary, 0, CosName.NAME, "Glorp");
|
||||
AssertDictionaryEntry<StringToken, string>(subDictionary, CosName.NAME, "Glorp");
|
||||
|
||||
AssertDictionaryEntry<NameToken, CosName, NameToken, CosName>(dictionary, 2, CosName.TYPE, CosName.CATALOG);
|
||||
AssertDictionaryEntry<NameToken, CosName>(dictionary, CosName.TYPE, CosName.CATALOG);
|
||||
|
||||
Assert.Equal(3, dictionary.Data.Count);
|
||||
}
|
||||
|
||||
private static void AssertDictionaryEntry<TKey, TKeyData, TValue, TValueData>(
|
||||
DictionaryToken dictionary, int index, TKeyData key,
|
||||
TValueData value) where TKey : IDataToken<TKeyData> where TValue : IDataToken<TValueData>
|
||||
private static void AssertDictionaryEntry<TValue, TValueData>(DictionaryToken dictionary, CosName key,
|
||||
TValueData value) where TValue : IDataToken<TValueData>
|
||||
{
|
||||
KeyValuePair<IToken, IToken> data = GetIndex(index, dictionary);
|
||||
var result = dictionary.Data[key.Name];
|
||||
|
||||
var keyToken = Assert.IsType<TKey>(data.Key);
|
||||
|
||||
Assert.Equal(key, keyToken.Data);
|
||||
|
||||
var valueToken = Assert.IsType<TValue>(data.Value);
|
||||
var valueToken = Assert.IsType<TValue>(result);
|
||||
|
||||
Assert.Equal(value, valueToken.Data);
|
||||
}
|
||||
|
||||
private static KeyValuePair<IToken, IToken> GetIndex(int index, DictionaryToken dictionary)
|
||||
private static KeyValuePair<string, IToken> GetIndex(int index, DictionaryToken dictionary)
|
||||
{
|
||||
int i = 0;
|
||||
foreach (var pair in dictionary.Data)
|
||||
|
@@ -2,17 +2,14 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
using PdfPig.ContentStream;
|
||||
using PdfPig.Cos;
|
||||
using PdfPig.Tokenization.Scanner;
|
||||
using PdfPig.Tokenization.Tokens;
|
||||
using Xunit;
|
||||
|
||||
public class PdfTokenScannerTests
|
||||
{
|
||||
private readonly CrossReferenceTable table = new CrossReferenceTable(CrossReferenceType.Table, new Dictionary<CosObjectKey, long>(),
|
||||
new PdfDictionary());
|
||||
|
||||
[Fact]
|
||||
public void ReadsSimpleObject()
|
||||
{
|
||||
@@ -34,6 +31,23 @@
|
||||
Assert.Equal("WDKAAR+CMBX12", name.Data.Name);
|
||||
|
||||
Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position));
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks that an object whose entire content is an indirect reference ("12 7 R")
/// is returned by the scanner as an <see cref="IndirectReferenceToken"/> carrying
/// object number 12 and generation 7.
/// </summary>
[Fact]
|
||||
public void ReadsIndirectReferenceInObject()
|
||||
{
|
||||
const string s = @"
|
||||
15 0 obj
|
||||
12 7 R
|
||||
endobj";
|
||||
|
||||
var scanner = GetScanner(s);
|
||||
|
||||
var token = ReadToEnd(scanner)[0];
|
||||
|
||||
var reference = Assert.IsType<IndirectReferenceToken>(token.Data);
|
||||
|
||||
Assert.Equal(new IndirectReference(12, 7), reference.Data);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -141,11 +155,131 @@ endobj
|
||||
Assert.Equal(274, tokens[0].Number.ObjectNumber);
|
||||
Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position));
|
||||
|
||||
var nameObject = Assert.IsType<NameToken>(tokens[1].Data);
|
||||
var nameObject = Assert.IsType<NameToken>(tokens[1].Data);
|
||||
|
||||
Assert.Equal("WPXNWT+CMR9", nameObject.Data.Name);
|
||||
Assert.Equal(310, tokens[1].Number.ObjectNumber);
|
||||
Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
|
||||
|
||||
dictionary = Assert.IsType<DictionaryToken>(tokens[2].Data);
|
||||
|
||||
Assert.Equal(7, dictionary.Data.Count);
|
||||
Assert.Equal(311, tokens[2].Number.ObjectNumber);
|
||||
Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position));
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks that an object containing a single literal string is read correctly:
/// the object number (compared as a long, it is larger than Int32.MaxValue) is preserved,
/// the string content is returned without the surrounding parentheses, and the reported
/// position points at the start of the "N G obj" header in the input.
/// </summary>
[Fact]
|
||||
public void ReadsStringObject()
|
||||
{
|
||||
const string s = @"
|
||||
|
||||
58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
|
||||
";
|
||||
|
||||
var scanner = GetScanner(s);
|
||||
|
||||
var token = ReadToEnd(scanner)[0];
|
||||
|
||||
Assert.Equal(58949797283757L, token.Number.ObjectNumber);
|
||||
Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType<StringToken>(token.Data).Data);
|
||||
|
||||
Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsStreamObject()
|
||||
{
|
||||
const string s = @"
|
||||
352 0 obj
|
||||
<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
|
||||
stream
|
||||
H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´
|
||||
ɲ ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs Ô;€
|
||||
À»—ÀF`ÇF@ƒ4˜ï @¥T¨³fY: žw̵;’’Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒ›ß–
|
||||
¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñím·°gêêb,/,£P§õ^v¾ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ
|
||||
A¡¬àð‰É©ˆ°‘¼›‚%¥×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú;x#dÃÄ$m
|
||||
+)
|
||||
)†…±n
|
||||
9ùyŽA·n\ï»t!=3£½¡:®µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U‘¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²‚|ê«êªDµ5q°šR¦RÈ£n¾[è~“}ýƒÝ½SꞦ'æQŽzÝ‚mæ
|
||||
óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЫ#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c›©.Œòµ9zz0Ѳ‚B¢«#š-3ªà<cš¥’¡È¨qµ¦{pìÛ„Ã‡ŽŠ/íO»|áIclSCuo_Oœ\\ï!ª©«ªƒTþ5Ó‹™Ü”óî_9|ýÍ7ø!Ñý|2Goÿ€Î¶Öö…<ðáƒGéGá½G´Ã.®TŠóî=_|þ™‡ƒééFwßà 0æîc_Ó릳|ý|¶®æ„…†G8Òüï€l…\¦RFº:‰ VPð•S“Û¶ï V—ø/¿¾Xæ+«««ÖŽ4>ŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£ivvv…k2=µZMØ|Úl(ŠZV›ÍbI>Ÿl¹œ(â±Äbø”Uªñeü©U*‹’“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0‹Æôz“‰ NÊ,¬‚kÀ°F‚XÛ4&“ÉfÃñÅæûæy=ÆãIðE_¾Èårår/XÞ„/·qò›m¶ìÖ|†óx8Wð¹hºÜÂÕalÎü’˜Ã0^Òòòü¼yÞ¶´´DX
|
||||
)¨ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„› ”)‰'3¤œ\–™ËN–™ÿe^в y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@€Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤k¾hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SÐøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5‚²ìÉ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿i aœ‘ØÏºþÇoäO ôkÆ)
|
||||
endstream
|
||||
endobj
|
||||
353 0 obj
|
||||
1479
|
||||
endobj";
|
||||
|
||||
var locationProvider = new TestObjectLocationProvider();
|
||||
// Mark location of "353 0 obj"
|
||||
locationProvider.Offsets[new IndirectReference(353, 0)] = 1643;
|
||||
|
||||
var scanner = GetScanner(s, locationProvider);
|
||||
|
||||
var tokens = ReadToEnd(scanner);
|
||||
|
||||
Assert.Equal(2, tokens.Count);
|
||||
|
||||
var stream = Assert.IsType<StreamToken>(tokens[0].Data);
|
||||
|
||||
var str = Encoding.UTF8.GetString(stream.Data);
|
||||
|
||||
Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str);
|
||||
|
||||
Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsSimpleStreamObject()
|
||||
{
|
||||
// Length of the bytes as found by Encoding.UTF8.GetBytes is 45
|
||||
const string s = @"
|
||||
574387 0 obj
|
||||
<< /Length 45 >>
|
||||
stream
|
||||
À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú
|
||||
endstream
|
||||
endobj";
|
||||
|
||||
var scanner = GetScanner(s);
|
||||
|
||||
var token = ReadToEnd(scanner)[0];
|
||||
|
||||
var stream = Assert.IsType<StreamToken>(token.Data);
|
||||
|
||||
Assert.Equal(45, stream.Data.Length);
|
||||
|
||||
var outputString = Encoding.UTF8.GetString(stream.Data);
|
||||
|
||||
Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽ‘ú", outputString);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsStreamWithIndirectLength()
|
||||
{
|
||||
const string s = @"5 0 obj 52 endobj
|
||||
|
||||
|
||||
|
||||
12 0 obj
|
||||
|
||||
<< /Length 5 0 R /S 1245 >>
|
||||
|
||||
stream
|
||||
%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼
|
||||
endstream
|
||||
endobj";
|
||||
var locationProvider = new TestObjectLocationProvider();
|
||||
|
||||
locationProvider.Offsets[new IndirectReference(5, 0)] = 0;
|
||||
|
||||
var scanner = GetScanner(s, locationProvider);
|
||||
|
||||
var token = ReadToEnd(scanner)[1];
|
||||
|
||||
var stream = Assert.IsType<StreamToken>(token.Data);
|
||||
|
||||
Assert.Equal(52, stream.Data.Length);
|
||||
Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
|
||||
|
||||
var outputString = Encoding.UTF8.GetString(stream.Data);
|
||||
|
||||
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾–~´¼", outputString);
|
||||
|
@@ -0,0 +1,21 @@
|
||||
namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
{
    using System.Collections.Generic;
    using PdfPig.ContentStream;
    using PdfPig.Tokenization.Scanner;

    /// <summary>
    /// In-memory <see cref="IObjectLocationProvider"/> for tests. The backing dictionary is exposed
    /// so a test can seed known offsets up front and inspect the offsets recorded afterwards.
    /// </summary>
    internal class TestObjectLocationProvider : IObjectLocationProvider
    {
        /// <summary>
        /// The offsets currently known, keyed by object reference.
        /// </summary>
        public Dictionary<IndirectReference, long> Offsets { get; } = new Dictionary<IndirectReference, long>();

        /// <summary>
        /// Looks the reference up in <see cref="Offsets"/>.
        /// </summary>
        public bool TryGetOffset(IndirectReference reference, out long offset)
            => Offsets.TryGetValue(reference, out offset);

        /// <summary>
        /// Stores the offset in <see cref="Offsets"/>, replacing any previous value for the reference.
        /// </summary>
        public void UpdateOffset(IndirectReference reference, long offset)
            => Offsets[reference] = offset;
    }
}
|
@@ -1,15 +1,57 @@
|
||||
namespace UglyToad.PdfPig.ContentStream
{
    using System;

    /// <summary>
    /// Used to uniquely identify and refer to objects in the PDF file.
    /// </summary>
    /// <remarks>
    /// Implements <see cref="IEquatable{T}"/> so that hash-based collections keyed by this struct
    /// compare values through the strongly typed <see cref="Equals(IndirectReference)"/> instead of
    /// boxing each key to call <see cref="Equals(object)"/>.
    /// </remarks>
    internal struct IndirectReference : IEquatable<IndirectReference>
    {
        /// <summary>
        /// A positive integer object number.
        /// </summary>
        public long ObjectNumber { get; }

        /// <summary>
        /// A non-negative integer generation number which starts as 0 and increases if the file is updated incrementally.
        /// </summary>
        public int Generation { get; }

        /// <summary>
        /// Create a new <see cref="IndirectReference"/>
        /// </summary>
        /// <param name="objectNumber">The object number.</param>
        /// <param name="generation">The generation number.</param>
        public IndirectReference(long objectNumber, int generation)
        {
            ObjectNumber = objectNumber;
            Generation = generation;
        }

        /// <summary>
        /// Two references are equal when both the object number and the generation match.
        /// </summary>
        /// <param name="other">The reference to compare with.</param>
        /// <returns><c>true</c> if both components are equal.</returns>
        public bool Equals(IndirectReference other)
        {
            return other.ObjectNumber == ObjectNumber
                   && other.Generation == Generation;
        }

        /// <summary>
        /// Returns <c>true</c> only when <paramref name="obj"/> is an <see cref="IndirectReference"/>
        /// with the same object number and generation.
        /// </summary>
        public override bool Equals(object obj)
        {
            return obj is IndirectReference reference && Equals(reference);
        }

        /// <summary>
        /// Combines the hash codes of both components. Overflow is expected and allowed to wrap.
        /// </summary>
        public override int GetHashCode()
        {
            unchecked
            {
                int hash = 59;
                hash = hash * 97 + ObjectNumber.GetHashCode();
                hash = hash * 97 + Generation.GetHashCode();

                return hash;
            }
        }

        /// <summary>
        /// Value equality, see <see cref="Equals(IndirectReference)"/>.
        /// </summary>
        public static bool operator ==(IndirectReference left, IndirectReference right)
        {
            return left.Equals(right);
        }

        /// <summary>
        /// Value inequality, see <see cref="Equals(IndirectReference)"/>.
        /// </summary>
        public static bool operator !=(IndirectReference left, IndirectReference right)
        {
            return !left.Equals(right);
        }

        /// <summary>
        /// Formats the reference as "objectNumber generation".
        /// </summary>
        public override string ToString()
        {
            return $"{ObjectNumber} {Generation}";
        }
    }
}
|
@@ -0,0 +1,11 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
|
||||
{
|
||||
using ContentStream;
|
||||
|
||||
/// <summary>
/// Maps indirect object references to the offsets at which those objects start in the input.
/// The PDF token scanner reads from it to locate referenced objects (for example an indirect
/// stream Length) and writes to it each time it finishes reading an object.
/// </summary>
internal interface IObjectLocationProvider
|
||||
{
|
||||
/// <summary>
/// Try to find the offset of the object with the given reference.
/// </summary>
/// <param name="reference">The object number and generation to look up.</param>
/// <param name="offset">The offset of the object when this method returns <c>true</c>.</param>
/// <returns><c>true</c> if an offset is known for the reference.</returns>
bool TryGetOffset(IndirectReference reference, out long offset);
|
||||
|
||||
/// <summary>
/// Record the offset of the object with the given reference.
/// </summary>
/// <param name="reference">The object number and generation of the object.</param>
/// <param name="offset">The offset at which the object was found.</param>
void UpdateOffset(IndirectReference reference, long offset);
|
||||
}
|
||||
}
|
@@ -1,37 +1,44 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
|
||||
internal class PdfTokenScanner : ISeekableTokenScanner
|
||||
{
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly CrossReferenceTable crossReferenceTable;
|
||||
private readonly IObjectLocationProvider objectLocationProvider;
|
||||
private readonly CoreTokenScanner coreTokenScanner;
|
||||
|
||||
/// <summary>
|
||||
/// Stores tokens encountered between obj - endobj markers for each <see cref="MoveNext"/> call.
|
||||
/// Cleared after each operation.
|
||||
/// </summary>
|
||||
private readonly List<IToken> readTokens = new List<IToken>();
|
||||
|
||||
// Store the previous 2 tokens and their positions so we can backtrack to find object numbers and stream dictionaries.
|
||||
private readonly long[] previousTokenPositions = new long[2];
|
||||
private readonly IToken[] previousTokens = new IToken[2];
|
||||
|
||||
private readonly Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
|
||||
|
||||
public IToken CurrentToken { get; private set; }
|
||||
|
||||
public long CurrentPosition => coreTokenScanner.CurrentPosition;
|
||||
|
||||
public PdfTokenScanner(IInputBytes inputBytes, CrossReferenceTable crossReferenceTable)
|
||||
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider)
|
||||
{
|
||||
this.inputBytes = inputBytes;
|
||||
this.crossReferenceTable = crossReferenceTable;
|
||||
this.objectLocationProvider = objectLocationProvider;
|
||||
coreTokenScanner = new CoreTokenScanner(inputBytes);
|
||||
}
|
||||
|
||||
public bool MoveNext()
|
||||
{
|
||||
// Read until we find object-number generation obj, e.g. "69 420 obj".
|
||||
int tokensRead = 0;
|
||||
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.StartObject)
|
||||
{
|
||||
@@ -49,6 +56,7 @@
|
||||
previousTokenPositions[1] = coreTokenScanner.CurrentTokenStart;
|
||||
}
|
||||
|
||||
// We only read partial tokens.
|
||||
if (tokensRead < 2)
|
||||
{
|
||||
return false;
|
||||
@@ -64,8 +72,7 @@
|
||||
$"Instead got: {previousTokens[0]} {previousTokens[1]} obj");
|
||||
}
|
||||
|
||||
var data = new List<IToken>();
|
||||
|
||||
// Read all tokens between obj and endobj.
|
||||
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.EndObject)
|
||||
{
|
||||
if (coreTokenScanner.CurrentToken is CommentToken)
|
||||
@@ -73,12 +80,26 @@
|
||||
continue;
|
||||
}
|
||||
|
||||
if (coreTokenScanner.CurrentToken == OperatorToken.StartStream)
|
||||
if (coreTokenScanner.CurrentToken == OperatorToken.StartObject)
|
||||
{
|
||||
// Read stream.
|
||||
// This should never happen.
|
||||
Debug.Assert(false, "Encountered a start object 'obj' operator before the end of the previous object.");
|
||||
return false;
|
||||
}
|
||||
|
||||
data.Add(coreTokenScanner.CurrentToken);
|
||||
if (coreTokenScanner.CurrentToken == OperatorToken.StartStream)
|
||||
{
|
||||
// Read stream: special case.
|
||||
if (TryReadStream(coreTokenScanner.CurrentTokenStart, out var stream))
|
||||
{
|
||||
readTokens.Clear();
|
||||
readTokens.Add(stream);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
readTokens.Add(coreTokenScanner.CurrentToken);
|
||||
}
|
||||
|
||||
previousTokens[0] = previousTokens[1];
|
||||
previousTokenPositions[0] = previousTokenPositions[1];
|
||||
@@ -89,10 +110,322 @@
|
||||
|
||||
if (coreTokenScanner.CurrentToken != OperatorToken.EndObject)
|
||||
{
|
||||
readTokens.Clear();
|
||||
return false;
|
||||
}
|
||||
|
||||
CurrentToken = new ObjectToken(startPosition, new IndirectReference(objectNumber.Long, generation.Int), data[data.Count - 1]);
|
||||
var reference = new IndirectReference(objectNumber.Long, generation.Int);
|
||||
|
||||
IToken token;
|
||||
if (readTokens.Count == 3 && readTokens[0] is NumericToken objNum
|
||||
&& readTokens[1] is NumericToken genNum
|
||||
&& readTokens[2] == OperatorToken.R)
|
||||
{
|
||||
// I have no idea if this can ever happen.
|
||||
token = new IndirectReferenceToken(new IndirectReference(objNum.Long, genNum.Int));
|
||||
}
|
||||
else
|
||||
{
|
||||
// Just take the last, should only ever be 1
|
||||
Debug.Assert(readTokens.Count == 1, "Found more than 1 token in an object.");
|
||||
|
||||
token = readTokens[readTokens.Count - 1];
|
||||
}
|
||||
|
||||
CurrentToken = new ObjectToken(startPosition, reference, token);
|
||||
|
||||
objectLocationProvider.UpdateOffset(reference, startPosition);
|
||||
|
||||
readTokens.Clear();
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads the raw bytes of a stream object, starting from the 'stream' operator.
/// </summary>
/// <remarks>
/// The bytes following the operator are buffered while looking for whitespace-preceded
/// 'endstream' and 'endobj' keywords. If the stream dictionary declares a usable Length and at
/// least that many bytes were buffered, exactly Length bytes are returned; otherwise the data
/// is cut just before the last end keyword that was recorded.
/// </remarks>
/// <param name="startStreamTokenOffset">The offset at which the 'stream' operator starts.</param>
/// <param name="stream">The stream dictionary and data when the read succeeds, otherwise <c>null</c>.</param>
/// <returns>
/// <c>false</c> if the 'stream' operator or the line break after it is missing, or if no end
/// keyword was found before the input ran out; otherwise <c>true</c>.
/// </returns>
private bool TryReadStream(long startStreamTokenOffset, out StreamToken stream)
{
    stream = null;

    DictionaryToken streamDictionaryToken = GetStreamDictionary();

    // Get the expected length from the stream dictionary if present.
    long? length = GetStreamLength(streamDictionaryToken);

    // Verify again that we start with "stream"
    if (!ReadStreamTokenStart(inputBytes, startStreamTokenOffset))
    {
        return false;
    }

    // From the specification: The stream operator should be followed by \r\n or \n, not just \r.
    if (!inputBytes.MoveNext())
    {
        return false;
    }

    if (inputBytes.CurrentByte == '\r')
    {
        inputBytes.MoveNext();
    }

    if (inputBytes.CurrentByte != '\n')
    {
        return false;
    }

    // Store where we started reading the first byte of data.
    long startDataOffset = inputBytes.CurrentOffset;

    // Store how many bytes we have read for checking against Length.
    long read = 0;

    // Progress through 'end' and then through either 'stream' or 'obj'.
    // At most one of endStreamPosition and endObjPosition is non-zero at any time.
    int endObjPosition = 0;
    int endStreamPosition = 0;
    int commonPartPosition = 0;

    const string commonPart = "end";
    const string streamPart = "stream";
    const string objPart = "obj";

    // Track any 'endobj' or 'endstream' operators we see.
    var observedEndLocations = new List<PossibleStreamEndLocation>();

    // Keep track of the previous byte.
    byte previousByte = 0;

    // Begin reading the stream.
    using (var memoryStream = new MemoryStream())
    using (var binaryWrite = new BinaryWriter(memoryStream))
    {
        while (inputBytes.MoveNext())
        {
            // TODO: once read == length, read ahead and check we're at the end...

            // We are reading 'end' (possibly).
            if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
            {
                if (commonPartPosition == 0 && !ReadHelper.IsWhitespace(previousByte))
                {
                    // We've just encountered a normal 'e' in the stream.
                }
                else
                {
                    commonPartPosition++;
                }
            }
            else if (commonPartPosition == commonPart.Length)
            {
                // Only continue matching 'stream' if we have not already started matching 'obj',
                // otherwise mixtures such as 'endobstream' would be accepted.
                if (endObjPosition == 0 && inputBytes.CurrentByte == streamPart[endStreamPosition])
                {
                    // We are reading 'stream' after 'end'
                    endStreamPosition++;

                    if (endStreamPosition == streamPart.Length)
                    {
                        // The keyword only counts if it is followed by whitespace or the end of the input.
                        // NOTE(review): this look-ahead advances the input before the final 'm' has been
                        // buffered, so when the loop continues that byte is missing from the buffer.
                        // Confirm whether this matters for data which itself contains 'endstream'.
                        if (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte))
                        {
                            // We've finished reading 'endstream', add it to the end tokens we've seen.
                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);

                            observedEndLocations.Add(token);

                            if (length.HasValue && read > length.Value)
                            {
                                break;
                            }
                        }

                        // Whether or not this was a real terminator, start matching from scratch.
                        // Leaving endStreamPosition at streamPart.Length would index past the end of
                        // 'stream' on the next byte, and leaving commonPartPosition at the end of 'end'
                        // would stop an 'endobj' which follows a single whitespace byte being recognised.
                        endStreamPosition = 0;
                        commonPartPosition = 0;
                    }
                }
                else if (endStreamPosition == 0 && inputBytes.CurrentByte == objPart[endObjPosition])
                {
                    // We are reading 'obj' after 'end'
                    endObjPosition++;

                    // We have finished reading 'endobj'.
                    if (endObjPosition == objPart.Length)
                    {
                        // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
                        if (observedEndLocations.Count > 0)
                        {
                            var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];

                            inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);

                            break;
                        }

                        var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
                        observedEndLocations.Add(token);

                        if (length.HasValue && read > length.Value)
                        {
                            break;
                        }

                        // Start matching from scratch, leaving endObjPosition at objPart.Length
                        // would index past the end of 'obj' on the next byte.
                        endObjPosition = 0;
                        commonPartPosition = 0;
                    }
                }
                else
                {
                    // We were reading 'end' but then we had a character mismatch.
                    // Reset all the counters.
                    endStreamPosition = 0;
                    endObjPosition = 0;
                    commonPartPosition = 0;
                }
            }
            else
            {
                // For safety reset every counter in case we had a partial read.
                endStreamPosition = 0;
                endObjPosition = 0;
                commonPartPosition = 0;
            }

            previousByte = inputBytes.CurrentByte;
            binaryWrite.Write(inputBytes.CurrentByte);

            read++;
        }

        binaryWrite.Flush();

        if (observedEndLocations.Count == 0)
        {
            return false;
        }

        memoryStream.Seek(0, SeekOrigin.Begin);

        // A negative declared length cannot be honoured, fall back to the observed end keyword instead.
        if (length.HasValue && length.Value >= 0 && memoryStream.Length >= length.Value)
        {
            // Use the declared length to copy just the data we want.
            byte[] data = new byte[length.Value];

            memoryStream.Read(data, 0, (int)length.Value);

            stream = new StreamToken(streamDictionaryToken, data);
        }
        else
        {
            // Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
            var lastEnd = observedEndLocations[observedEndLocations.Count - 1];

            var dataLength = lastEnd.Offset - startDataOffset;

            var current = inputBytes.CurrentOffset;

            // 3 characters, 'e', '\n' and possibly '\r'
            inputBytes.Seek(lastEnd.Offset - 3);
            inputBytes.MoveNext();

            if (inputBytes.CurrentByte == '\r')
            {
                dataLength -= 3;
            }
            else
            {
                dataLength -= 2;
            }

            inputBytes.Seek(current);

            // An empty stream can make the subtraction above go negative, treat that as no data.
            if (dataLength < 0)
            {
                dataLength = 0;
            }

            byte[] data = new byte[dataLength];

            memoryStream.Read(data, 0, (int)dataLength);

            stream = new StreamToken(streamDictionaryToken, data);
        }
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Finds the stream dictionary among the two tokens stored in <c>previousTokens</c>.
/// Index 1 is checked before index 0.
/// </summary>
/// <returns>The dictionary token found.</returns>
/// <exception cref="PdfDocumentFormatException">Neither of the two stored tokens is a dictionary.</exception>
private DictionaryToken GetStreamDictionary()
{
    if (previousTokens[1] is DictionaryToken latest)
    {
        return latest;
    }

    if (previousTokens[0] is DictionaryToken earlier)
    {
        return earlier;
    }

    throw new PdfDocumentFormatException("No dictionary token was found prior to the 'stream' operator. Previous tokens were:" +
                                         $" {previousTokens[1]} and {previousTokens[0]}.");
}
|
||||
|
||||
/// <summary>
/// Works out the declared length of a stream from the "Length" entry of its dictionary.
/// </summary>
/// <remarks>
/// The entry may be a number, which is returned directly, or an indirect reference to a numeric
/// object. A reference can only be followed when the location provider knows where that object
/// is; in that case the scanner temporarily jumps to the object, reads it with
/// <see cref="MoveNext"/> and then returns to where it was.
/// </remarks>
/// <param name="dictionary">The stream dictionary.</param>
/// <returns>The length, or <c>null</c> if it is absent or cannot be resolved to a number.</returns>
private long? GetStreamLength(DictionaryToken dictionary)
{
    if (!dictionary.Data.TryGetValue("Length", out var lengthValue))
    {
        return null;
    }

    // The simple case, the number is stored directly in the stream dictionary.
    if (lengthValue is NumericToken numeric)
    {
        return numeric.Long;
    }

    // Remember where we are so we can come back after following a reference.
    long currentOffset = inputBytes.CurrentOffset;

    // Otherwise the only supported form is a reference to another numeric object.
    if (!(lengthValue is IndirectReferenceToken lengthReference))
    {
        return null;
    }

    // We can only find it if we know where it is.
    if (!objectLocationProvider.TryGetOffset(lengthReference.Data, out var lengthObjectOffset))
    {
        // warn, we had a reference to a length object but didn't find it...
        return null;
    }

    // Move to the length object and read it.
    Seek(lengthObjectOffset);

    // MoveNext requires the list of read tokens to be empty, so park the current contents.
    var parkedTokens = new List<IToken>(readTokens);
    readTokens.Clear();

    long? resolvedLength = null;

    if (MoveNext())
    {
        var lengthObject = (ObjectToken)CurrentToken;

        if (lengthObject.Data is NumericToken lengthToken)
        {
            resolvedLength = lengthToken.Long;
        }
    }

    readTokens.AddRange(parkedTokens);

    // Move back to where we started.
    Seek(currentOffset);

    return resolvedLength;
}
|
||||
|
||||
/// <summary>
/// Checks that the bytes following <paramref name="tokenStart"/> spell out the 'stream' operator.
/// </summary>
/// <param name="input">The bytes to read from.</param>
/// <param name="tokenStart">The offset to seek to before reading.</param>
/// <returns>
/// <c>true</c> if every character of the operator was read, leaving the input positioned on its
/// last character; <c>false</c> if the input ended or differed, in which case the input is moved
/// back to <paramref name="tokenStart"/>.
/// </returns>
private static bool ReadStreamTokenStart(IInputBytes input, long tokenStart)
{
    input.Seek(tokenStart);

    foreach (var expected in OperatorToken.StartStream.Data)
    {
        var matched = input.MoveNext() && input.CurrentByte == expected;

        if (!matched)
        {
            input.Seek(tokenStart);
            return false;
        }
    }

    return true;
}
|
||||
|
@@ -0,0 +1,35 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
{
    using System;
    using Tokens;

    /// <summary>
    /// A candidate position for the end of a stream: records one occurrence of 'endobj' or 'endstream'
    /// seen by the <see cref="PdfTokenScanner"/> while it was reading stream data.
    /// </summary>
    internal struct PossibleStreamEndLocation
    {
        /// <summary>
        /// The offset at which the token started in the file.
        /// </summary>
        public long Offset { get; }

        /// <summary>
        /// Which keyword was seen, either <see cref="OperatorToken.EndObject"/> or <see cref="OperatorToken.EndStream"/>.
        /// </summary>
        public OperatorToken Type { get; }

        /// <summary>
        /// Create a new <see cref="PossibleStreamEndLocation"/>
        /// </summary>
        /// <param name="offset">The offset at which the keyword started.</param>
        /// <param name="type">The keyword which was seen, must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="type"/> is null.</exception>
        public PossibleStreamEndLocation(long offset, OperatorToken type)
        {
            if (object.ReferenceEquals(type, null))
            {
                throw new ArgumentNullException(nameof(type));
            }

            Offset = offset;
            Type = type;
        }

        /// <summary>
        /// Formats the location as "offset: type".
        /// </summary>
        public override string ToString() => string.Format("{0}: {1}", Offset, Type);
    }
}
|
@@ -6,14 +6,34 @@
|
||||
using Cos;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
internal class DictionaryToken : IDataToken<IReadOnlyDictionary<IToken, IToken>>
|
||||
internal class DictionaryToken : IDataToken<IReadOnlyDictionary<string, IToken>>
|
||||
{
|
||||
[NotNull]
|
||||
public IReadOnlyDictionary<IToken, IToken> Data { get; }
|
||||
public IReadOnlyDictionary<string, IToken> Data { get; }
|
||||
|
||||
public DictionaryToken([NotNull]IReadOnlyDictionary<IToken, IToken> data)
|
||||
{
|
||||
Data = data ?? throw new ArgumentNullException(nameof(data));
|
||||
if (data == null)
|
||||
{
|
||||
throw new ArgumentNullException(nameof(data));
|
||||
}
|
||||
|
||||
var result = new Dictionary<string, IToken>(data.Count);
|
||||
|
||||
foreach (var keyValuePair in data)
|
||||
{
|
||||
if (keyValuePair.Key is NameToken name)
|
||||
{
|
||||
result[name.Data.Name] = keyValuePair.Value;
|
||||
}
|
||||
else
|
||||
{
|
||||
// For now:
|
||||
throw new InvalidOperationException("Key for dictionary token was not a string! " + keyValuePair.Key);
|
||||
}
|
||||
}
|
||||
|
||||
Data = result;
|
||||
}
|
||||
|
||||
public bool TryGetByName(CosName name, out IToken token)
|
||||
@@ -23,19 +43,7 @@
|
||||
throw new ArgumentNullException(nameof(name));
|
||||
}
|
||||
|
||||
token = null;
|
||||
|
||||
foreach (var keyValuePair in Data)
|
||||
{
|
||||
if (keyValuePair.Key is NameToken nameToken && nameToken.Data.Equals(name))
|
||||
{
|
||||
token = keyValuePair.Value;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
return Data.TryGetValue(name.Name, out token);
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
|
15
src/UglyToad.PdfPig/Tokenization/Tokens/StreamToken.cs
Normal file
15
src/UglyToad.PdfPig/Tokenization/Tokens/StreamToken.cs
Normal file
@@ -0,0 +1,15 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Tokens
{
    using System;

    /// <summary>
    /// A stream object: the dictionary describing the stream together with the bytes read
    /// between the 'stream' and 'endstream' operators.
    /// </summary>
    internal class StreamToken : IDataToken<byte[]>
    {
        /// <summary>
        /// The dictionary which preceded the 'stream' operator.
        /// </summary>
        public DictionaryToken StreamDictionary { get; }

        /// <summary>
        /// The bytes of the stream exactly as they were read from the input.
        /// </summary>
        public byte[] Data { get; }

        /// <summary>
        /// Create a new <see cref="StreamToken"/>.
        /// </summary>
        /// <param name="streamDictionary">The stream dictionary, must not be null.</param>
        /// <param name="data">The stream bytes, must not be null (may be empty).</param>
        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
        public StreamToken(DictionaryToken streamDictionary, byte[] data)
        {
            // Fail at construction rather than with a NullReferenceException when the token is used later.
            StreamDictionary = streamDictionary ?? throw new ArgumentNullException(nameof(streamDictionary));
            Data = data ?? throw new ArgumentNullException(nameof(data));
        }
    }
}
|
Reference in New Issue
Block a user