continue migrating code to tokenizer

This commit is contained in:
Eliot Jones
2018-01-20 18:42:29 +00:00
parent 3d2a66cbf9
commit 7d90f4858a
30 changed files with 542 additions and 368 deletions

View File

@@ -5,6 +5,7 @@
using System.Text;
using IO;
using PdfPig.ContentStream;
using PdfPig.Filters;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokenization.Tokens;
using PdfPig.Util;
@@ -298,7 +299,7 @@ endobj
stream
%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼
endstream
endobj";
endobj";
var scanner = GetScanner(s);
@@ -310,7 +311,8 @@ endobj";
Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString());
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data));
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data));
}
[Fact]
public void ReadsStreamWithoutBreakBeforeEndstream()
@@ -332,4 +334,22 @@ endobj";
16
endobj";
var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
var scanner = new PdfTokenScanner(inputBytes, new TestObjectLocationProvider(), new TestFilterProvider());
var token = ReadToEnd(scanner)[1];
Assert.Equal(7, token.Number.ObjectNumber);
}
/// <summary>
/// Builds a <see cref="PdfTokenScanner"/> over the bytes of the given string for use in tests.
/// </summary>
/// <param name="s">The text to convert to input bytes for the scanner.</param>
/// <param name="locationProvider">Optional object location provider; a new <see cref="TestObjectLocationProvider"/> is used when null.</param>
/// <returns>A scanner reading from the converted input, using a <see cref="TestFilterProvider"/>.</returns>
private PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null)
{
var converted = StringBytesTestConverter.Convert(s, false);
var provider = locationProvider ?? new TestObjectLocationProvider();
var filters = new TestFilterProvider();

return new PdfTokenScanner(converted.Bytes, provider, filters);
}
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
{

View File

@@ -3,6 +3,7 @@
using System.Collections.Generic;
using PdfPig.ContentStream;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokenization.Tokens;
internal class TestObjectLocationProvider : IObjectLocationProvider
{
@@ -17,5 +18,15 @@
{
Offsets[reference] = offset;
}
/// <summary>
/// Cache lookup stub for tests: always reports a miss and sets the output to null.
/// </summary>
/// <param name="reference">The reference to look up (unused).</param>
/// <param name="objectToken">Always set to null.</param>
/// <returns>Always false.</returns>
public bool TryGetCached(IndirectReference reference, out ObjectToken objectToken)
{
const bool found = false;
objectToken = default(ObjectToken);
return found;
}
/// <summary>
/// Cache stub for tests: the supplied token is ignored and nothing is stored.
/// </summary>
/// <param name="objectToken">The token which would be cached (unused).</param>
public void Cache(ObjectToken objectToken)
{
// Intentionally empty.
}
}
}

View File

@@ -16,14 +16,14 @@
private readonly IPageFactory pageFactory;
private readonly IRandomAccessRead reader;
private readonly bool isLenientParsing;
private readonly IPdfObjectScanner pdfScanner;
private readonly IPdfTokenScanner pdfScanner;
private readonly DictionaryToken rootPageDictionary;
private readonly Dictionary<int, DictionaryToken> locatedPages = new Dictionary<int, DictionaryToken>();
public int Count { get; }
internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory,
IRandomAccessRead reader, bool isLenientParsing, IPdfObjectScanner pdfScanner)
IRandomAccessRead reader, bool isLenientParsing, IPdfTokenScanner pdfScanner)
{
if (catalog == null)
{

View File

@@ -10,13 +10,13 @@
internal class ResourceContainer : IResourceStore
{
private readonly IPdfObjectScanner scanner;
private readonly IPdfTokenScanner scanner;
private readonly IFontFactory fontFactory;
private readonly Dictionary<IndirectReference, IFont> loadedFonts = new Dictionary<IndirectReference, IFont>();
private readonly Dictionary<NameToken, IndirectReference> currentResourceState = new Dictionary<NameToken, IndirectReference>();
public ResourceContainer(IPdfObjectScanner scanner, IFontFactory fontFactory)
public ResourceContainer(IPdfTokenScanner scanner, IFontFactory fontFactory)
{
this.scanner = scanner;
this.fontFactory = fontFactory;

View File

@@ -1,5 +1,7 @@
namespace UglyToad.PdfPig.ContentStream
{
using System.Diagnostics;
/// <summary>
/// Used to uniquely identify and refer to objects in the PDF file.
/// </summary>
@@ -20,6 +22,7 @@
/// </summary>
/// <param name="objectNumber">The object number.</param>
/// <param name="generation">The generation number.</param>
[DebuggerStepThrough]
public IndirectReference(long objectNumber, int generation)
{
ObjectNumber = objectNumber;

View File

@@ -3,6 +3,7 @@
using System;
using System.Collections.Generic;
using ContentStream;
using Tokenization.Tokens;
using Util.JetBrains.Annotations;
internal class CrossReferenceTable
@@ -15,9 +16,9 @@
public IReadOnlyDictionary<IndirectReference, long> ObjectOffsets => objectOffsets;
[NotNull]
public PdfDictionary Dictionary { get; }
public DictionaryToken Dictionary { get; }
public CrossReferenceTable(CrossReferenceType type, IReadOnlyDictionary<IndirectReference, long> objectOffsets, PdfDictionary dictionary)
public CrossReferenceTable(CrossReferenceType type, IReadOnlyDictionary<IndirectReference, long> objectOffsets, DictionaryToken dictionary)
{
if (objectOffsets == null)
{

View File

@@ -4,8 +4,8 @@
using System.Collections.Generic;
using System.Linq;
using ContentStream;
using ContentStream.TypedAccessors;
using Logging;
using Tokenization.Tokens;
/// <summary>
///
@@ -31,7 +31,7 @@
public CrossReferenceTable Build(long startXrefOffset, ILog log)
{
CrossReferenceType type = CrossReferenceType.Table;
PdfDictionary trailerDictionary = new PdfDictionary();
DictionaryToken trailerDictionary = new DictionaryToken(new Dictionary<IToken, IToken>());
Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
List<long> xrefSeqBytePos = new List<long>();
@@ -59,7 +59,7 @@
while (currentPart.Dictionary != null)
{
long prevBytePos = currentPart.Dictionary.GetLongOrDefault(CosName.PREV, -1L);
long prevBytePos = currentPart.GetPreviousOffset();
if (prevBytePos == -1)
{
break;
@@ -91,16 +91,16 @@
var currentObject = parts.First(x => x.Offset == bPos);
if (currentObject.Dictionary != null)
{
foreach (var entry in currentObject.Dictionary)
foreach (var entry in currentObject.Dictionary.Data)
{
/*
* If we're at a second trailer, we have a linearized pdf file, meaning that the first Size entry represents
* all of the objects so we don't need to grab the second.
*/
if (!entry.Key.Name.Equals("Size")
|| !trailerDictionary.ContainsKey(CosName.Create("Size")))
if (!entry.Key.Equals("Size", StringComparison.OrdinalIgnoreCase)
|| !trailerDictionary.ContainsKey(NameToken.Size))
{
trailerDictionary.Set(entry.Key, entry.Value);
trailerDictionary = trailerDictionary.With(entry.Key, entry.Value);
}
}
}

View File

@@ -2,7 +2,7 @@
{
using System.Collections.Generic;
using ContentStream;
using ContentStream.TypedAccessors;
using Tokenization.Tokens;
/// <summary>
///
@@ -29,11 +29,11 @@
public long Previous { get; }
public PdfDictionary Dictionary { get; }
public DictionaryToken Dictionary { get; private set; }
public CrossReferenceType Type { get; }
public CrossReferenceTablePart(IReadOnlyDictionary<IndirectReference, long> objectOffsets, long offset, long previous, PdfDictionary dictionary, CrossReferenceType type)
public CrossReferenceTablePart(IReadOnlyDictionary<IndirectReference, long> objectOffsets, long offset, long previous, DictionaryToken dictionary, CrossReferenceType type)
{
ObjectOffsets = objectOffsets;
Offset = offset;
@@ -45,7 +45,17 @@
public void FixOffset(long offset)
{
Offset = offset;
Dictionary.SetLong(CosName.PREV, offset);
Dictionary = Dictionary.With(NameToken.Prev, new NumericToken(offset));
}
/// <summary>
/// Reads the Prev entry from this part's dictionary.
/// </summary>
/// <returns>
/// The value of the Prev entry when it is present and numeric; otherwise -1.
/// Also returns -1 when this part has no dictionary.
/// </returns>
public long GetPreviousOffset()
{
// Consumers of this type null-check Dictionary before using it, so treat a missing
// dictionary the same as a missing Prev entry rather than throwing a NullReferenceException.
if (Dictionary != null
&& Dictionary.TryGet(NameToken.Prev, out var token)
&& token is NumericToken numeric)
{
return numeric.Long;
}

return -1;
}
}
}

View File

@@ -2,6 +2,7 @@
{
using System.Collections.Generic;
using ContentStream;
using Tokenization.Tokens;
internal class CrossReferenceTablePartBuilder
{
@@ -11,7 +12,7 @@
public long Previous { get; set; }
public PdfDictionary Dictionary { get; set; }
public DictionaryToken Dictionary { get; set; }
public CrossReferenceType XRefType { get; set; }

View File

@@ -10,9 +10,9 @@
internal class EncodingReader : IEncodingReader
{
private readonly IPdfObjectScanner pdfScanner;
private readonly IPdfTokenScanner pdfScanner;
public EncodingReader(IPdfObjectScanner pdfScanner)
public EncodingReader(IPdfTokenScanner pdfScanner)
{
this.pdfScanner = pdfScanner;
}

View File

@@ -28,7 +28,7 @@
return number.Int;
}
public static decimal[] GetWidths(IPdfObjectScanner pdfScanner, DictionaryToken dictionary, bool isLenientParsing)
public static decimal[] GetWidths(IPdfTokenScanner pdfScanner, DictionaryToken dictionary, bool isLenientParsing)
{
if (!dictionary.TryGet(NameToken.Widths, out var token))
{
@@ -53,7 +53,7 @@
return result;
}
public static FontDescriptor GetFontDescriptor(IPdfObjectScanner pdfScanner, FontDescriptorFactory fontDescriptorFactory, DictionaryToken dictionary,
public static FontDescriptor GetFontDescriptor(IPdfTokenScanner pdfScanner, FontDescriptorFactory fontDescriptorFactory, DictionaryToken dictionary,
bool isLenientParsing)
{
if (!dictionary.TryGet(NameToken.FontDesc, out var obj))
@@ -68,7 +68,7 @@
return descriptor;
}
public static NameToken GetName(IPdfObjectScanner pdfScanner, DictionaryToken dictionary, FontDescriptor descriptor, bool isLenientParsing)
public static NameToken GetName(IPdfTokenScanner pdfScanner, DictionaryToken dictionary, FontDescriptor descriptor, bool isLenientParsing)
{
if (dictionary.TryGet(NameToken.BaseFont, out var nameBase))
{

View File

@@ -20,9 +20,9 @@
private readonly FontDescriptorFactory fontDescriptorFactory;
private readonly TrueTypeFontParser trueTypeFontParser;
private readonly IEncodingReader encodingReader;
private readonly IPdfObjectScanner pdfScanner;
private readonly IPdfTokenScanner pdfScanner;
public TrueTypeFontHandler(IPdfObjectScanner pdfScanner, IFilterProvider filterProvider,
public TrueTypeFontHandler(IPdfTokenScanner pdfScanner, IFilterProvider filterProvider,
CMapCache cMapCache,
FontDescriptorFactory fontDescriptorFactory,
TrueTypeFontParser trueTypeFontParser,

View File

@@ -18,10 +18,10 @@
private readonly CidFontFactory cidFontFactory;
private readonly CMapCache cMapCache;
private readonly IFilterProvider filterProvider;
private readonly IPdfObjectScanner scanner;
private readonly IPdfTokenScanner scanner;
public Type0FontHandler(CidFontFactory cidFontFactory, CMapCache cMapCache, IFilterProvider filterProvider,
IPdfObjectScanner scanner)
IPdfTokenScanner scanner)
{
this.cidFontFactory = cidFontFactory;
this.cMapCache = cMapCache;

View File

@@ -16,14 +16,14 @@
internal class Type1FontHandler : IFontHandler
{
private readonly IPdfObjectScanner pdfScanner;
private readonly IPdfTokenScanner pdfScanner;
private readonly CMapCache cMapCache;
private readonly IFilterProvider filterProvider;
private readonly FontDescriptorFactory fontDescriptorFactory;
private readonly IEncodingReader encodingReader;
private readonly Type1FontParser type1FontParser;
public Type1FontHandler(IPdfObjectScanner pdfScanner, CMapCache cMapCache, IFilterProvider filterProvider,
public Type1FontHandler(IPdfTokenScanner pdfScanner, CMapCache cMapCache, IFilterProvider filterProvider,
FontDescriptorFactory fontDescriptorFactory,
IEncodingReader encodingReader,
Type1FontParser type1FontParser)

View File

@@ -18,9 +18,9 @@
private readonly CMapCache cMapCache;
private readonly IFilterProvider filterProvider;
private readonly IEncodingReader encodingReader;
private readonly IPdfObjectScanner scanner;
private readonly IPdfTokenScanner scanner;
public Type3FontHandler(IPdfObjectScanner scanner, CMapCache cMapCache, IFilterProvider filterProvider,
public Type3FontHandler(IPdfTokenScanner scanner, CMapCache cMapCache, IFilterProvider filterProvider,
IEncodingReader encodingReader)
{
this.cMapCache = cMapCache;

View File

@@ -19,9 +19,9 @@
private readonly FontDescriptorFactory descriptorFactory;
private readonly TrueTypeFontParser trueTypeFontParser;
private readonly IFilterProvider filterProvider;
private readonly IPdfObjectScanner pdfScanner;
private readonly IPdfTokenScanner pdfScanner;
public CidFontFactory(IPdfObjectScanner pdfScanner, FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser,
public CidFontFactory(IPdfTokenScanner pdfScanner, FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser,
IFilterProvider filterProvider)
{
this.descriptorFactory = descriptorFactory;

View File

@@ -11,9 +11,9 @@
internal class CatalogFactory
{
private readonly IPdfObjectScanner scanner;
private readonly IPdfTokenScanner scanner;
public CatalogFactory(IPdfObjectScanner scanner)
public CatalogFactory(IPdfTokenScanner scanner)
{
this.scanner = scanner;
}

View File

@@ -1,44 +1,48 @@
namespace UglyToad.PdfPig.Parser
{
using Content;
using ContentStream;
using Cos;
using IO;
using Parts;
using Tokenization.Scanner;
using Tokenization.Tokens;
internal class DocumentInformationFactory
{
public DocumentInformation Create(IPdfObjectParser pdfObjectParser,
PdfDictionary rootDictionary, IRandomAccessRead reader,
bool isLenientParsing)
public DocumentInformation Create(IPdfTokenScanner pdfTokenScanner, DictionaryToken rootDictionary)
{
if (!rootDictionary.TryGetItemOfType(CosName.INFO, out CosObject infoBase))
if (!rootDictionary.TryGet(NameToken.Info, out var infoBase))
{
return DocumentInformation.Default;
}
var infoParsed = pdfObjectParser.Parse(infoBase.ToIndirectReference(), reader, isLenientParsing);
var infoParsed = DirectObjectFinder.Get<DictionaryToken>(infoBase, pdfTokenScanner);
if (!(infoParsed is PdfDictionary infoDictionary))
{
return DocumentInformation.Default;
}
var title = GetEntryOrDefault(infoDictionary, CosName.TITLE);
var author = GetEntryOrDefault(infoDictionary, CosName.AUTHOR);
var subject = GetEntryOrDefault(infoDictionary, CosName.SUBJECT);
var keywords = GetEntryOrDefault(infoDictionary, CosName.KEYWORDS);
var creator = GetEntryOrDefault(infoDictionary, CosName.CREATOR);
var producer = GetEntryOrDefault(infoDictionary, CosName.PRODUCER);
var title = GetEntryOrDefault(infoParsed, NameToken.Title);
var author = GetEntryOrDefault(infoParsed, NameToken.Author);
var subject = GetEntryOrDefault(infoParsed, NameToken.Subject);
var keywords = GetEntryOrDefault(infoParsed, NameToken.Keywords);
var creator = GetEntryOrDefault(infoParsed, NameToken.Creator);
var producer = GetEntryOrDefault(infoParsed, NameToken.Producer);
return new DocumentInformation(title, author, subject,
keywords, creator, producer);
}
private static string GetEntryOrDefault(PdfDictionary infoDictionary, CosName key)
private static string GetEntryOrDefault(DictionaryToken infoDictionary, NameToken key)
{
if (infoDictionary.TryGetItemOfType(key, out CosString str))
if (!infoDictionary.TryGet(key, out var value))
{
return str.GetAscii();
return null;
}
if (value is StringToken str)
{
return str.Data;
}
if (value is HexToken hex)
{
return hex.Data;
}
return null;

View File

@@ -2,8 +2,6 @@
{
using System;
using System.Collections.Generic;
using ContentStream;
using ContentStream.TypedAccessors;
using Cos;
using Exceptions;
using IO;
@@ -15,122 +13,93 @@
internal class CrossReferenceParser
{
private const int X = 'x';
private readonly ILog log;
private readonly CosDictionaryParser dictionaryParser;
private readonly CosBaseParser baseParser;
private readonly CosStreamParser streamParser;
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
private readonly CrossReferenceTableParser crossReferenceTableParser;
private readonly OldCrossReferenceTableParser oldCrossReferenceTableParser;
private readonly XrefCosOffsetChecker xrefCosChecker;
public CrossReferenceParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
CosStreamParser streamParser,
CrossReferenceStreamParser crossReferenceStreamParser,
CrossReferenceTableParser crossReferenceTableParser,
OldCrossReferenceTableParser oldCrossReferenceTableParser)
CrossReferenceTableParser crossReferenceTableParser)
{
this.log = log;
this.dictionaryParser = dictionaryParser;
this.baseParser = baseParser;
this.streamParser = streamParser;
this.crossReferenceStreamParser = crossReferenceStreamParser;
this.crossReferenceTableParser = crossReferenceTableParser;
this.oldCrossReferenceTableParser = oldCrossReferenceTableParser;
}
public CrossReferenceTable ParseNew(long crossReferenceLocation, ISeekableTokenScanner scanner,
bool isLenientParsing)
{
var previousLocation = crossReferenceLocation;
var visitedCrossReferences = new HashSet<long>();
while (previousLocation >= 0)
{
scanner.Seek(crossReferenceLocation);
scanner.MoveNext();
if (scanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
{
var table = crossReferenceTableParser.Parse(scanner, crossReferenceLocation, isLenientParsing);
previousLocation = table.Dictionary.GetLongOrDefault(CosName.PREV, -1);
}
else if (scanner.CurrentToken is NumericToken streamObjectNumberToken)
{
break;
}
else
{
throw new PdfDocumentFormatException($"The xref object was not a stream or a table, was instead: {scanner.CurrentToken}.");
}
}
return null;
xrefCosChecker = new XrefCosOffsetChecker();
}
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
CosObjectPool pool)
CosObjectPool pool, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
{
var xrefOffsetValidator = new XrefOffsetValidator(log, reader, dictionaryParser, baseParser, pool);
var xrefCosChecker = new XrefCosOffsetChecker();
long fixedOffset = xrefOffsetValidator.CheckXRefOffset(xrefLocation, isLenientParsing);
if (fixedOffset > -1)
{
xrefLocation = fixedOffset;
log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
}
var table = new CrossReferenceTableBuilder();
var prevSet = new HashSet<long>();
long previousCrossReferenceLocation = xrefLocation;
// ---- parse whole chain of xref tables/object streams using PREV reference
HashSet<long> prevSet = new HashSet<long>();
// Parse all cross reference tables and streams.
while (previousCrossReferenceLocation > 0)
{
log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");
// seek to xref table
reader.Seek(previousCrossReferenceLocation);
tokenScanner.Seek(previousCrossReferenceLocation);
ReadHelper.SkipSpaces(reader);
tokenScanner.MoveNext();
var isTable = reader.Peek() == X;
// -- parse xref
if (isTable)
if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
{
// xref table and trailer
// use existing parser to parse xref table
if (!oldCrossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
{
throw new InvalidOperationException($"Expected trailer object at position: {reader.GetPosition()}");
}
log.Debug("Element was cross reference table.");
CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner,
previousCrossReferenceLocation, isLenientParsing);
previousCrossReferenceLocation = tablePart.GetPreviousOffset();
DictionaryToken tableDictionary = tablePart.Dictionary;
PdfDictionary trailer = tableBuilder.Dictionary;
CrossReferenceTablePart streamPart = null;
// check for a XRef stream, it may contain some object ids of compressed objects
if (trailer.ContainsKey(CosName.XREF_STM))
if (tableDictionary.ContainsKey(NameToken.XrefStm))
{
int streamOffset = trailer.GetIntOrDefault(CosName.XREF_STM);
log.Debug("Cross reference table contained referenced to stream. Reading the stream.");
int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;
// check the xref stream reference
fixedOffset = xrefOffsetValidator.CheckXRefOffset(streamOffset, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != streamOffset)
{
log.Warn("/XRefStm offset " + streamOffset + " is incorrect, corrected to " + fixedOffset);
log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");
streamOffset = (int)fixedOffset;
trailer.SetInt(CosName.XREF_STM, streamOffset);
tableBuilder.Offset = streamOffset;
// Update the cross reference table to be a stream instead.
tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
tablePart.Previous, tableDictionary, tablePart.Type);
}
// Read the stream from the table.
if (streamOffset > 0)
{
reader.Seek(streamOffset);
ReadHelper.SkipSpaces(reader);
try
{
streamPart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
streamPart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
}
catch (InvalidOperationException ex)
{
@@ -140,7 +109,7 @@
}
else
{
throw ex;
throw;
}
}
}
@@ -152,35 +121,27 @@
}
else
{
throw new InvalidOperationException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
}
}
}
previousCrossReferenceLocation = trailer.GetLongOrDefault(CosName.PREV);
if (previousCrossReferenceLocation > 0)
{
// check the xref table reference
fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
{
previousCrossReferenceLocation = fixedOffset;
trailer.SetLong(CosName.PREV, previousCrossReferenceLocation);
}
}
tableBuilder.Previous = tableBuilder.Dictionary.GetLongOrDefault(CosName.PREV);
table.Add(tableBuilder.Build());
table.Add(tablePart);
if (streamPart != null)
{
table.Add(streamPart);
}
}
else
else if (tokenScanner.CurrentToken is NumericToken)
{
log.Debug("Element was cross reference stream.");
// Unread the numeric token.
tokenScanner.Seek(previousCrossReferenceLocation);
// parse xref stream
var tablePart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
table.Add(tablePart);
previousCrossReferenceLocation = tablePart.Previous;
@@ -195,10 +156,19 @@
}
}
}
else
{
log.Debug("Element was invalid.");
throw new PdfDocumentFormatException("The cross reference found at this location was not a " +
$"table or a stream: Location - {previousCrossReferenceLocation}, {tokenScanner.CurrentPosition}.");
}
if (prevSet.Contains(previousCrossReferenceLocation))
{
throw new InvalidOperationException("/Prev loop at offset " + previousCrossReferenceLocation);
throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
}
prevSet.Add(previousCrossReferenceLocation);
}
@@ -210,19 +180,20 @@
return resolved;
}
private CrossReferenceTablePart ParseCrossReferenceStream(IRandomAccessRead reader, long objByteOffset, CosObjectPool pool,
bool isLenientParsing)
private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
{
// ---- parse indirect object head
ObjectHelper.ReadObjectNumber(reader);
ObjectHelper.ReadGenerationNumber(reader);
pdfScanner.Seek(objByteOffset);
ReadHelper.ReadExpectedString(reader, "obj", true);
pdfScanner.MoveNext();
PdfDictionary dict = dictionaryParser.Parse(reader, baseParser, pool);
var streamObjectToken = (ObjectToken)pdfScanner.CurrentToken;
PdfRawStream xrefStream = streamParser.Parse(reader, dict, isLenientParsing, null);
CrossReferenceTablePart xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, xrefStream);
if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
{
throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");
}
CrossReferenceTablePart xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);
return xrefTablePart;
}

View File

@@ -2,7 +2,6 @@
{
using System.Collections.Generic;
using System.Linq;
using ContentStream;
using Cos;
using Exceptions;
using Parts.CrossReference;
@@ -183,7 +182,7 @@
return objectCount;
}
private static PdfDictionary ParseTrailer(ISeekableTokenScanner scanner, bool isLenientParsing)
private static DictionaryToken ParseTrailer(ISeekableTokenScanner scanner, bool isLenientParsing)
{
if (scanner.CurrentToken is OperatorToken trailerToken && trailerToken.Data == "trailer")
{
@@ -192,7 +191,7 @@
throw new PdfDocumentFormatException($"Expected to find a dictionary in the trailer but instead found: {scanner.CurrentToken}.");
}
return PdfDictionary.FromDictionaryToken(trailerDictionary);
return trailerDictionary;
}
if (isLenientParsing)
@@ -210,7 +209,7 @@
if (foundTrailer && scanner.TryReadToken(out DictionaryToken trailerDictionary))
{
return PdfDictionary.FromDictionaryToken(trailerDictionary);
return trailerDictionary;
}
}

View File

@@ -146,7 +146,7 @@
throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
}
builder.Dictionary = trailer;
//builder.Dictionary = trailer;
builder.Previous = trailer.GetLongOrDefault(CosName.PREV);
return true;

View File

@@ -20,9 +20,9 @@
private readonly IResourceStore resourceStore;
private readonly IFilterProvider filterProvider;
private readonly IPageContentParser pageContentParser;
private readonly IPdfObjectScanner pdfScanner;
private readonly IPdfTokenScanner pdfScanner;
public PageFactory(IPdfObjectScanner pdfScanner, IResourceStore resourceStore, IFilterProvider filterProvider,
public PageFactory(IPdfTokenScanner pdfScanner, IResourceStore resourceStore, IFilterProvider filterProvider,
IPageContentParser pageContentParser)
{
this.resourceStore = resourceStore;

View File

@@ -0,0 +1,57 @@
namespace UglyToad.PdfPig.Parser.Parts.CrossReference
{
using System;
using Exceptions;
using Tokenization.Tokens;
using Util;
/// <summary>
/// The sizes, in bytes, of the three fields making up each entry (line) of a cross reference stream,
/// as read from the W array of the stream dictionary.
/// </summary>
internal class CrossReferenceStreamFieldSize
{
    /// <summary>
    /// The size in bytes of the first field, which holds the type of the entry.
    /// A size of zero means the type field is absent.
    /// </summary>
    public int Field1Size { get; }

    /// <summary>
    /// The size in bytes of the second field. For type 1 entries the field is the byte offset of the object,
    /// for type 2 entries it is the object number of the containing object stream.
    /// </summary>
    public int Field2Size { get; }

    /// <summary>
    /// The size in bytes of the third field. For type 1 entries the field is the generation number.
    /// For type 2 entries it is the index of the object within the object stream.
    /// </summary>
    public int Field3Size { get; }

    /// <summary>
    /// How many bytes are in a line (the sum of the three field sizes). Always greater than zero.
    /// </summary>
    public int LineLength { get; }

    /// <summary>
    /// Reads the field sizes from the W entry of a cross reference stream dictionary.
    /// </summary>
    /// <param name="dictionary">The stream dictionary containing the W array.</param>
    /// <exception cref="ArgumentNullException">The dictionary was null.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// The W entry was missing, was not an array, had fewer than 3 entries, contained a negative size
    /// or described a line with no bytes.
    /// </exception>
    public CrossReferenceStreamFieldSize(DictionaryToken dictionary)
    {
        if (dictionary == null)
        {
            throw new ArgumentNullException(nameof(dictionary));
        }

        if (!dictionary.TryGet(NameToken.W, out var token) || !(token is ArrayToken wArray))
        {
            throw new PdfDocumentFormatException($"The W entry for the stream dictionary was not an array: {token}.");
        }

        if (wArray.Data.Count < 3)
        {
            throw new PdfDocumentFormatException($"There must be at least 3 entries in a W entry for a stream dictionary: {wArray}.");
        }

        var field1 = wArray.GetNumeric(0).Int;
        var field2 = wArray.GetNumeric(1).Int;
        var field3 = wArray.GetNumeric(2).Int;

        if (field1 < 0 || field2 < 0 || field3 < 0)
        {
            throw new PdfDocumentFormatException($"The W entry for a stream dictionary cannot contain negative field sizes: {wArray}.");
        }

        // Sum as long so that very large sizes cannot silently overflow to a bogus line length.
        var total = (long)field1 + field2 + field3;

        // A zero length line would cause a division by zero when the caller calculates the number of lines.
        if (total <= 0 || total > int.MaxValue)
        {
            throw new PdfDocumentFormatException($"The W entry for a stream dictionary described an invalid line length of {total} bytes: {wArray}.");
        }

        Field1Size = field1;
        Field2Size = field2;
        Field3Size = field3;
        LineLength = (int)total;
    }
}
}

View File

@@ -1,11 +1,11 @@
namespace UglyToad.PdfPig.Parser.Parts.CrossReference
{
using System.Collections.Generic;
using System.IO;
using ContentStream;
using ContentStream.TypedAccessors;
using Cos;
using Exceptions;
using Filters;
using Tokenization.Tokens;
using Util;
internal class CrossReferenceStreamParser
{
@@ -19,93 +19,91 @@
/// <summary>
/// Decodes the cross reference stream and builds a cross reference table part from its entries.
/// </summary>
public CrossReferenceTablePart Parse(long streamOffset, PdfRawStream stream)
public CrossReferenceTablePart Parse(long streamOffset, StreamToken stream)
{
var w = stream.Dictionary.GetDictionaryObject(CosName.W);
if (!(w is COSArray format))
byte[] decoded = stream.Decode(filterProvider);
var fieldSizes = new CrossReferenceStreamFieldSize(stream.StreamDictionary);
var lineCount = decoded.Length / fieldSizes.LineLength;
long previousOffset = -1;
if (stream.StreamDictionary.TryGet(NameToken.Prev, out var prevToken) && prevToken is NumericToken prevNumeric)
{
throw new IOException("/W array is missing in Xref stream");
previousOffset = prevNumeric.Long;
}
var objNums = GetObjectNumbers(stream);
/*
* Calculating the size of the line in bytes
*/
int w0 = format.getInt(0);
int w1 = format.getInt(1);
int w2 = format.getInt(2);
int lineSize = w0 + w1 + w2;
var decoded = stream.Decode(filterProvider);
var lineCount = decoded.Length / lineSize;
var lineNumber = 0;
var builder = new CrossReferenceTablePartBuilder
{
Offset = streamOffset,
Previous = stream.Dictionary.GetLongOrDefault(CosName.PREV),
Dictionary = stream.Dictionary,
Previous = previousOffset,
Dictionary = stream.StreamDictionary,
XRefType = CrossReferenceType.Stream
};
using (IEnumerator<long> objIter = objNums.GetEnumerator())
{
var currLine = new byte[lineSize];
var objectNumbers = GetObjectNumbers(stream.StreamDictionary);
while (lineNumber < lineCount && objIter.MoveNext())
var lineNumber = 0;
var lineBuffer = new byte[fieldSizes.LineLength];
foreach (var objectNumber in objectNumbers)
{
var byteOffset = lineNumber * lineSize;
for (int i = 0; i < lineSize; i++)
if (lineNumber >= lineCount)
{
currLine[i] = decoded[byteOffset + i];
break;
}
var byteOffset = lineNumber * fieldSizes.LineLength;
for (var i = 0; i < fieldSizes.LineLength; i++)
{
lineBuffer[i] = decoded[byteOffset + i];
}
int type;
if (w0 == 0)
if (fieldSizes.Field1Size == 0)
{
// "If the first element is zero,
// the type field shall not be present, and shall default to type 1"
type = 1;
}
else
{
type = 0;
/*
* Grabs the number of bytes specified for the first column in
* the W array and stores it.
*/
for (int i = 0; i < w0; i++)
for (var i = 0; i < fieldSizes.Field1Size; i++)
{
type += (currLine[i] & 0x00ff) << ((w0 - i - 1) * 8);
type += (lineBuffer[i] & 0x00ff) << ((fieldSizes.Field1Size - i - 1) * 8);
}
}
//Need to remember the current objID
long objectId = objIter.Current;
/*
* 3 different types of entries.
*/
ReadNextStreamObject(type, objectNumber, fieldSizes, builder, lineBuffer);
lineNumber++;
}
return builder.Build();
}
private static void ReadNextStreamObject(int type, long objectNumber, CrossReferenceStreamFieldSize fieldSizes,
CrossReferenceTablePartBuilder builder, byte[] lineBuffer)
{
switch (type)
{
case 0:
/*
* Skipping free objects
*/
// Ignore free objects.
break;
case 1:
// Non object stream entries.
int offset = 0;
for (int i = 0; i < w1; i++)
for (int i = 0; i < fieldSizes.Field2Size; i++)
{
offset += (currLine[i + w0] & 0x00ff) << ((w1 - i - 1) * 8);
offset += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
}
int genNum = 0;
for (int i = 0; i < w2; i++)
for (int i = 0; i < fieldSizes.Field3Size; i++)
{
genNum += (currLine[i + w0 + w1] & 0x00ff) << ((w2 - i - 1) * 8);
genNum += (lineBuffer[i + fieldSizes.Field1Size + fieldSizes.Field2Size] & 0x00ff) << ((fieldSizes.Field3Size - i - 1) * 8);
}
builder.Add(objectId, genNum, offset);
builder.Add(objectNumber, genNum, offset);
break;
case 2:
@@ -125,48 +123,43 @@
* distinguish from file offsets
*/
int objstmObjNr = 0;
for (int i = 0; i < w1; i++)
for (int i = 0; i < fieldSizes.Field2Size; i++)
{
objstmObjNr += (currLine[i + w0] & 0x00ff) << ((w1 - i - 1) * 8);
objstmObjNr += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
}
builder.Add(objectId, 0, -objstmObjNr);
builder.Add(objectNumber, 0, -objstmObjNr);
break;
}
lineNumber++;
}
}
return builder.Build();
}
private static List<long> GetObjectNumbers(PdfRawStream stream)
private static List<long> GetObjectNumbers(DictionaryToken dictionary)
{
var indexArray = (COSArray) stream.Dictionary.GetDictionaryObject(CosName.INDEX);
// If Index doesn't exist, we will use the default values.
if (indexArray == null)
if (!dictionary.TryGet(NameToken.Size, out var sizeToken) || !(sizeToken is NumericToken sizeNumeric))
{
indexArray = new COSArray();
indexArray.add(CosInt.Zero);
indexArray.add(stream.Dictionary.GetDictionaryObject(CosName.SIZE));
throw new PdfDocumentFormatException($"The stream dictionary must contain a numeric size value: {dictionary}.");
}
var indexArray = new[] { 0, sizeNumeric.Int };
if (dictionary.TryGet(NameToken.Index, out var indexToken) && indexToken is ArrayToken indexArrayToken)
{
indexArray = new[]
{
indexArrayToken.GetNumeric(0).Int,
indexArrayToken.GetNumeric(1).Int
};
}
List<long> objNums = new List<long>();
// Populates objNums with all object numbers available
var firstObjectNumber = indexArray[0];
var size = indexArray[1];
for (int i = 0; i < indexArray.Count; i+=2)
for (var i = 0; i < size; i++)
{
var longId = ((CosInt) indexArray.get(i)).AsLong();
var size = ((CosInt)indexArray.get(i + 1)).AsInt();
for (int j = 0; j < size; j++)
{
objNums.Add(longId + j);
}
objNums.Add(firstObjectNumber + i);
}
return objNums;

View File

@@ -32,7 +32,7 @@
throw new InvalidOperationException($"Could not find the object {baseObject.ToIndirectReference()} with type {typeof(T).Name}.");
}
public static T Get<T>(IToken token, IPdfObjectScanner scanner) where T : IToken
public static T Get<T>(IToken token, IPdfTokenScanner scanner) where T : IToken
{
if (token is T result)
{

View File

@@ -53,33 +53,33 @@
private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
{
var log = container.Get<ILog>();
var filterProvider = container.Get<IFilterProvider>();
var bruteForceSearcher = new BruteForceSearcher(reader);
var pool = new CosObjectPool();
CrossReferenceTable crossReferenceTable = null;
// We're ok with this since our intent is to lazily load the cross reference table.
// ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, pool, bruteForceSearcher);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider);
var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
var pool = new CosObjectPool();
// TODO: make this use the scanner.
var validator = new CrossReferenceOffsetValidator(new XrefOffsetValidator(log, reader, container.Get<CosDictionaryParser>(),
container.Get<CosBaseParser>(), pool));
crossReferenceOffset = validator.Validate(crossReferenceOffset, isLenientParsing);
var crossReferenceTable = container.Get<CrossReferenceParser>()
.Parse(reader, isLenientParsing, crossReferenceOffset, pool);
// container.Get<CrossReferenceParser>().ParseNew(crossReferenceOffset, scanner, isLenientParsing);
var filterProvider = container.Get<IFilterProvider>();
var bruteForceSearcher = new BruteForceSearcher(reader);
var pdfObjectParser = new PdfObjectParser(container.Get<ILog>(), container.Get<CosBaseParser>(),
container.Get<CosStreamParser>(), crossReferenceTable, bruteForceSearcher, pool, container.Get<ObjectStreamParser>());
crossReferenceTable = container.Get<CrossReferenceParser>()
.Parse(reader, isLenientParsing, crossReferenceOffset, pool, pdfScanner, scanner);
var trueTypeFontParser = new TrueTypeFontParser();
var fontDescriptorFactory = new FontDescriptorFactory();
var pdfScanner = new PdfTokenScanner(inputBytes, new ObjectLocationProvider(crossReferenceTable, pool, bruteForceSearcher));
var cidFontFactory = new CidFontFactory(pdfScanner, fontDescriptorFactory, trueTypeFontParser, filterProvider);
var encodingReader = new EncodingReader(pdfScanner);
@@ -92,57 +92,37 @@
new Type1FontHandler(pdfScanner, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, new Type1FontParser()),
new Type3FontHandler(pdfScanner, cMapCache, filterProvider, encodingReader));
var dynamicParser = container.Get<DynamicParser>();
var resourceContainer = new ResourceContainer(pdfScanner, fontFactory);
var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
var informationFactory = new DocumentInformationFactory();
var catalogFactory = new CatalogFactory(pdfScanner);
var rootDictionary = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool,
isLenientParsing, pdfScanner);
var rootDictionary = ParseTrailer(crossReferenceTable, isLenientParsing, pdfScanner);
var information = informationFactory.Create(pdfObjectParser, crossReferenceTable.Dictionary, reader, isLenientParsing);
var information = informationFactory.Create(pdfScanner, crossReferenceTable.Dictionary);
var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
pdfScanner);
}
private static DictionaryToken ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,
DynamicParser dynamicParser, BruteForceSearcher bruteForceSearcher, CosObjectPool pool, bool isLenientParsing, IPdfObjectScanner pdfObjectScanner)
private static DictionaryToken ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner)
{
if (crossReferenceTable.Dictionary.ContainsKey(CosName.ENCRYPT))
if (crossReferenceTable.Dictionary.ContainsKey(NameToken.Encrypt))
{
throw new NotSupportedException("Cannot currently parse a document using encryption: " + crossReferenceTable.Dictionary);
}
foreach (var keyValuePair in crossReferenceTable.Dictionary)
if (!crossReferenceTable.Dictionary.TryGet(NameToken.Root, out var rootToken))
{
if (keyValuePair.Value is CosObject temporaryObject && !keyValuePair.Key.Equals(CosName.ROOT))
{
// Loads these objects into the object pool for access later.
dynamicParser.Parse(reader, temporaryObject, pool, crossReferenceTable, bruteForceSearcher,
isLenientParsing, false);
}
throw new PdfDocumentFormatException($"Missing root object specification in trailer: {crossReferenceTable.Dictionary}.");
}
CosObject root = (CosObject)crossReferenceTable.Dictionary.GetItemOrDefault(CosName.ROOT);
if (root == null)
{
throw new InvalidOperationException("Missing root object specification in trailer.");
}
var obj = pdfObjectScanner.Get(root.ToIndirectReference());
if (!(obj.Data is DictionaryToken rootDictionary))
{
throw new PdfDocumentFormatException($"Could not find the root dictionary, instead found: {obj.Data}");
}
var rootDictionary = DirectObjectFinder.Get<DictionaryToken>(rootToken, pdfTokenScanner);
if (!rootDictionary.ContainsKey(NameToken.Type) && isLenientParsing)
{

View File

@@ -27,7 +27,7 @@
[NotNull]
private readonly ParsingCachingProviders cachingProviders;
private readonly IPdfObjectScanner pdfScanner;
private readonly IPdfTokenScanner pdfScanner;
[NotNull]
internal Catalog Catalog { get; }
@@ -56,7 +56,7 @@
ParsingCachingProviders cachingProviders,
IPageFactory pageFactory,
Catalog catalog,
DocumentInformation information, IPdfObjectScanner pdfScanner)
DocumentInformation information, IPdfTokenScanner pdfScanner)
{
this.log = log;
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));

View File

@@ -1,40 +1,65 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using ContentStream;
using Cos;
using Parser.Parts;
using Tokens;
/// <summary>
/// Tracks where indirect objects can be found and holds objects which have already been read.
/// </summary>
internal interface IObjectLocationProvider
{
/// <summary>
/// Tries to find the offset recorded for the object with the given reference.
/// </summary>
/// <param name="reference">The reference of the object to locate.</param>
/// <param name="offset">
/// The recorded offset when found. The token scanner treats a negative value as the (negated) number
/// of the object stream containing the object rather than a position in the file.
/// </param>
/// <returns><see langword="true"/> if an offset was found for the reference.</returns>
bool TryGetOffset(IndirectReference reference, out long offset);
/// <summary>
/// Records (or replaces) the offset for the object with the given reference.
/// </summary>
/// <param name="reference">The reference of the object.</param>
/// <param name="offset">The offset to store for the reference.</param>
void UpdateOffset(IndirectReference reference, long offset);
/// <summary>
/// Tries to retrieve a previously cached object for the given reference.
/// </summary>
/// <param name="reference">The reference of the object to retrieve.</param>
/// <param name="objectToken">The cached object when one exists for the reference.</param>
/// <returns><see langword="true"/> if a cached object was found.</returns>
bool TryGetCached(IndirectReference reference, out ObjectToken objectToken);
/// <summary>
/// Stores an object so that later calls to <see cref="TryGetCached"/> can return it.
/// </summary>
/// <param name="objectToken">The object to cache.</param>
void Cache(ObjectToken objectToken);
}
internal class ObjectLocationProvider : IObjectLocationProvider
{
private readonly CrossReferenceTable crossReferenceTable;
private readonly Dictionary<IndirectReference, ObjectToken> cache = new Dictionary<IndirectReference, ObjectToken>();
/// <summary>
/// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
/// </summary>
private readonly Func<CrossReferenceTable> crossReferenceTable;
private readonly CosObjectPool pool;
private readonly BruteForceSearcher searcher;
/// <summary>
/// Indicates whether we now have a cross reference table.
/// </summary>
private bool loadedFromTable;
private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();
public ObjectLocationProvider(CrossReferenceTable crossReferenceTable, CosObjectPool pool, BruteForceSearcher searcher)
public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, CosObjectPool pool, BruteForceSearcher searcher)
{
this.crossReferenceTable = crossReferenceTable;
foreach (var offset in crossReferenceTable.ObjectOffsets)
{
offsets[offset.Key] = offset.Value;
}
this.pool = pool;
this.searcher = searcher;
}
public bool TryGetOffset(IndirectReference reference, out long offset)
{
if (!loadedFromTable)
{
var table = crossReferenceTable.Invoke();
if (table != null)
{
foreach (var objectOffset in table.ObjectOffsets)
{
offsets[objectOffset.Key] = objectOffset.Value;
}
loadedFromTable = true;
}
}
if (offsets.TryGetValue(reference, out offset))
{
return true;
@@ -54,5 +79,20 @@
{
offsets[reference] = offset;
}
/// <summary>
/// Looks up a previously cached object by its indirect reference.
/// </summary>
/// <param name="reference">The reference of the object to retrieve.</param>
/// <param name="objectToken">The cached object if the cache contains the reference.</param>
/// <returns><see langword="true"/> if the cache contained an entry for the reference.</returns>
public bool TryGetCached(IndirectReference reference, out ObjectToken objectToken)
    => cache.TryGetValue(reference, out objectToken);
/// <summary>
/// Stores the object in the cache keyed by its own object number, replacing any existing entry for that number.
/// </summary>
/// <param name="objectToken">The object to cache.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="objectToken"/> is null.</exception>
public void Cache(ObjectToken objectToken)
{
    if (objectToken == null)
    {
        // Name the offending parameter so the failure is diagnosable from the exception alone.
        throw new ArgumentNullException(nameof(objectToken));
    }

    cache[objectToken.Number] = objectToken;
}
}
}

View File

@@ -6,19 +6,22 @@
using System.IO;
using ContentStream;
using Exceptions;
using Filters;
using IO;
using Parser.Parts;
using Tokens;
using Util;
internal interface IPdfObjectScanner : ISeekableTokenScanner
internal interface IPdfTokenScanner : ISeekableTokenScanner
{
ObjectToken Get(IndirectReference reference);
}
internal class PdfTokenScanner : IPdfObjectScanner
internal class PdfTokenScanner : IPdfTokenScanner
{
private readonly IInputBytes inputBytes;
private readonly IObjectLocationProvider objectLocationProvider;
private readonly IFilterProvider filterProvider;
private readonly CoreTokenScanner coreTokenScanner;
/// <summary>
@@ -35,10 +38,11 @@
public long CurrentPosition => coreTokenScanner.CurrentPosition;
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider)
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, IFilterProvider filterProvider)
{
this.inputBytes = inputBytes;
this.objectLocationProvider = objectLocationProvider;
this.filterProvider = filterProvider;
coreTokenScanner = new CoreTokenScanner(inputBytes);
}
@@ -46,7 +50,7 @@
{
// Read until we find object-number generation obj, e.g. "69 420 obj".
int tokensRead = 0;
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.StartObject)
while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.StartObject))
{
if (coreTokenScanner.CurrentToken is CommentToken)
{
@@ -79,21 +83,21 @@
}
// Read all tokens between obj and endobj.
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.EndObject)
while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
{
if (coreTokenScanner.CurrentToken is CommentToken)
{
continue;
}
if (coreTokenScanner.CurrentToken == OperatorToken.StartObject)
if (ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.StartObject))
{
// This should never happen.
Debug.Assert(false, "Encountered a start object 'obj' operator before the end of the previous object.");
return false;
}
if (coreTokenScanner.CurrentToken == OperatorToken.StartStream)
if (ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.StartStream))
{
// Read stream: special case.
if (TryReadStream(coreTokenScanner.CurrentTokenStart, out var stream))
@@ -114,7 +118,7 @@
previousTokenPositions[1] = coreTokenScanner.CurrentPosition;
}
if (coreTokenScanner.CurrentToken != OperatorToken.EndObject)
if (!ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
{
readTokens.Clear();
return false;
@@ -125,7 +129,7 @@
IToken token;
if (readTokens.Count == 3 && readTokens[0] is NumericToken objNum
&& readTokens[1] is NumericToken genNum
&& readTokens[2] == OperatorToken.R)
&& ReferenceEquals(readTokens[2], OperatorToken.R))
{
// I have no idea if this can ever happen.
token = new IndirectReferenceToken(new IndirectReference(objNum.Long, genNum.Int));
@@ -197,9 +201,6 @@
// Track any 'endobj' or 'endstream' operators we see.
var observedEndLocations = new List<PossibleStreamEndLocation>();
// Keep track of the previous byte.
byte previousByte = 0;
// Begin reading the stream.
using (var memoryStream = new MemoryStream())
using (var binaryWrite = new BinaryWriter(memoryStream))
@@ -288,7 +289,6 @@
commonPartPosition = 0;
}
previousByte = inputBytes.CurrentByte;
binaryWrite.Write(inputBytes.CurrentByte);
read++;
@@ -451,11 +451,24 @@
public ObjectToken Get(IndirectReference reference)
{
if (objectLocationProvider.TryGetCached(reference, out var objectToken))
{
return objectToken;
}
if (!objectLocationProvider.TryGetOffset(reference, out var offset))
{
throw new InvalidOperationException($"Could not find the object with reference: {reference}.");
}
// Negative offsets refer to a stream with that number.
if (offset < 0)
{
var result = GetObjectFromStream(reference, offset);
return result;
}
Seek(offset);
if (!MoveNext())
@@ -465,5 +478,79 @@
return (ObjectToken)CurrentToken;
}
/// <summary>
/// Resolves an object which lives inside an object stream: loads the containing stream object,
/// parses every object it holds into the location provider's cache and returns the requested one.
/// </summary>
/// <param name="reference">The reference of the object being requested.</param>
/// <param name="offset">The negative offset recorded for the object; its negation is the containing stream's object number.</param>
/// <returns>The object matching <paramref name="reference"/>.</returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the containing object is not a stream or the stream does not contain the requested object.
/// </exception>
private ObjectToken GetObjectFromStream(IndirectReference reference, long offset)
{
    // The sign is flipped to recover the object number of the containing stream.
    var containerNumber = -offset;

    var container = Get(new IndirectReference(containerNumber, 0));

    if (container.Data is StreamToken objectStream)
    {
        // Cache everything in the stream, not only the requested object.
        foreach (var parsed in ParseObjectStream(objectStream, offset))
        {
            objectLocationProvider.Cache(parsed);
        }

        if (objectLocationProvider.TryGetCached(reference, out var found))
        {
            return found;
        }

        throw new PdfDocumentFormatException($"Could not find the object {reference} in the stream {containerNumber}.");
    }

    throw new PdfDocumentFormatException("Requested a stream object by reference but the requested stream object " +
        $"was not a stream: {reference}, {container.Data}.");
}
/// <summary>
/// Reads every object contained in an object stream.
/// </summary>
/// <remarks>
/// The decoded stream starts with N pairs of integers (object number, byte offset) followed by the objects themselves.
/// The byte offsets are read but not used for positioning; objects are read sequentially in header order.
/// </remarks>
/// <param name="stream">The object stream; its dictionary must contain numeric N and First entries.</param>
/// <param name="offset">The value passed through as the first constructor argument of each resulting <see cref="ObjectToken"/>.</param>
/// <returns>One <see cref="ObjectToken"/> per header entry, each with generation 0.</returns>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the dictionary lacks a numeric N or First entry, when the header does not consist of numeric pairs,
/// or when the stream ends before all objects have been read.
/// </exception>
private IReadOnlyList<ObjectToken> ParseObjectStream(StreamToken stream, long offset)
{
    if (!stream.StreamDictionary.TryGet(NameToken.N, out var numberToken)
        || !(numberToken is NumericToken numberOfObjects))
    {
        throw new PdfDocumentFormatException($"Object stream dictionary did not provide number of objects {stream.StreamDictionary}.");
    }

    if (!stream.StreamDictionary.TryGet(NameToken.First, out var firstToken)
        || !(firstToken is NumericToken))
    {
        throw new PdfDocumentFormatException($"Object stream dictionary did not provide first object offset {stream.StreamDictionary}.");
    }

    var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider));

    var scanner = new CoreTokenScanner(bytes);

    // Read the N (object number, byte offset) pairs which make up the header.
    var objects = new List<Tuple<long, long>>();

    for (var i = 0; i < numberOfObjects.Int; i++)
    {
        // Validate instead of casting blindly so malformed input is reported as a format error.
        if (!scanner.MoveNext() || !(scanner.CurrentToken is NumericToken objectNumber))
        {
            throw new PdfDocumentFormatException($"Object stream header did not contain a numeric object number at index {i}: {stream.StreamDictionary}.");
        }

        if (!scanner.MoveNext() || !(scanner.CurrentToken is NumericToken byteOffset))
        {
            throw new PdfDocumentFormatException($"Object stream header did not contain a numeric byte offset at index {i}: {stream.StreamDictionary}.");
        }

        objects.Add(Tuple.Create(objectNumber.Long, byteOffset.Long));
    }

    var results = new List<ObjectToken>(objects.Count);

    foreach (var obj in objects)
    {
        // Running out of tokens previously went unnoticed and produced stale or null data.
        if (!scanner.MoveNext())
        {
            throw new PdfDocumentFormatException($"Object stream ended before object {obj.Item1} could be read: {stream.StreamDictionary}.");
        }

        results.Add(new ObjectToken(offset, new IndirectReference(obj.Item1, 0), scanner.CurrentToken));
    }

    return results;
}
}
}

View File

@@ -38,10 +38,8 @@
var filterProvider = new MemoryFilterProvider(new DecodeParameterResolver(logger), new PngPredictor(), logger);
var crossReferenceParser = new CrossReferenceStreamParser(filterProvider);
var objectStreamParser = new ObjectStreamParser(logger, filterProvider, baseParser);
var dynamicParser = new DynamicParser(logger, baseParser, streamParser, objectStreamParser);
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, streamParser, crossReferenceParser, new CrossReferenceTableParser(),
new OldCrossReferenceTableParser(logger, dictionaryParser, baseParser));
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, crossReferenceParser, new CrossReferenceTableParser());
var cmapParser = new CMapParser();
var afmParser = new AdobeFontMetricsParser();
@@ -55,7 +53,6 @@
container.Register(streamParser);
container.Register(crossReferenceParser);
container.Register(crossReferenceTableParser);
container.Register(dynamicParser);
container.Register(objectStreamParser);
container.Register(filterProvider);
container.Register(cmapParser);