mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-12-21 19:29:51 +08:00
continue migrating code to tokenizer
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
using System.Text;
|
||||
using IO;
|
||||
using PdfPig.ContentStream;
|
||||
using PdfPig.Filters;
|
||||
using PdfPig.Tokenization.Scanner;
|
||||
using PdfPig.Tokenization.Tokens;
|
||||
using PdfPig.Util;
|
||||
@@ -298,7 +299,7 @@ endobj
|
||||
stream
|
||||
%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼
|
||||
endstream
|
||||
endobj";
|
||||
endobj";
|
||||
|
||||
var scanner = GetScanner(s);
|
||||
|
||||
@@ -310,7 +311,8 @@ endobj";
|
||||
|
||||
Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString());
|
||||
|
||||
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data));
|
||||
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReadsStreamWithoutBreakBeforeEndstream()
|
||||
@@ -332,4 +334,22 @@ endobj";
|
||||
16
|
||||
endobj";
|
||||
var inputBytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
|
||||
|
||||
var scanner = new PdfTokenScanner(inputBytes, new TestObjectLocationProvider(), new TestFilterProvider());
|
||||
|
||||
var token = ReadToEnd(scanner)[1];
|
||||
|
||||
Assert.Equal(7, token.Number.ObjectNumber);
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a <see cref="PdfTokenScanner"/> over the bytes of the supplied text for use in a test.
/// </summary>
/// <param name="s">The text the scanner should read.</param>
/// <param name="locationProvider">
/// The object location provider to use; a new <see cref="TestObjectLocationProvider"/> is created when this is null.
/// </param>
/// <returns>A scanner using the converted bytes, the location provider and a new <see cref="TestFilterProvider"/>.</returns>
private PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null)
{
    var converted = StringBytesTestConverter.Convert(s, false);

    // Fall back to an empty provider when the caller did not supply one.
    if (locationProvider == null)
    {
        locationProvider = new TestObjectLocationProvider();
    }

    var filters = new TestFilterProvider();

    return new PdfTokenScanner(converted.Bytes, locationProvider, filters);
}
|
||||
|
||||
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
|
||||
{
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
using System.Collections.Generic;
|
||||
using PdfPig.ContentStream;
|
||||
using PdfPig.Tokenization.Scanner;
|
||||
using PdfPig.Tokenization.Tokens;
|
||||
|
||||
internal class TestObjectLocationProvider : IObjectLocationProvider
|
||||
{
|
||||
@@ -17,5 +18,15 @@
|
||||
{
|
||||
Offsets[reference] = offset;
|
||||
}
|
||||
|
||||
/// <summary>
/// Test implementation which never has a cached object: always reports a miss.
/// </summary>
/// <param name="reference">The reference being looked up (not used).</param>
/// <param name="objectToken">Always set to null.</param>
/// <returns>Always <c>false</c>.</returns>
public bool TryGetCached(IndirectReference reference, out ObjectToken objectToken)
{
    const bool found = false;

    objectToken = default(ObjectToken);

    return found;
}
|
||||
|
||||
/// <summary>
/// Test implementation which does not store anything: the token is ignored.
/// </summary>
/// <param name="objectToken">The token offered for caching (not used).</param>
public void Cache(ObjectToken objectToken)
{
    // Intentionally empty.
    return;
}
|
||||
}
|
||||
}
|
||||
@@ -16,14 +16,14 @@
|
||||
private readonly IPageFactory pageFactory;
|
||||
private readonly IRandomAccessRead reader;
|
||||
private readonly bool isLenientParsing;
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly DictionaryToken rootPageDictionary;
|
||||
private readonly Dictionary<int, DictionaryToken> locatedPages = new Dictionary<int, DictionaryToken>();
|
||||
|
||||
public int Count { get; }
|
||||
|
||||
internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory,
|
||||
IRandomAccessRead reader, bool isLenientParsing, IPdfObjectScanner pdfScanner)
|
||||
IRandomAccessRead reader, bool isLenientParsing, IPdfTokenScanner pdfScanner)
|
||||
{
|
||||
if (catalog == null)
|
||||
{
|
||||
|
||||
@@ -10,13 +10,13 @@
|
||||
|
||||
internal class ResourceContainer : IResourceStore
|
||||
{
|
||||
private readonly IPdfObjectScanner scanner;
|
||||
private readonly IPdfTokenScanner scanner;
|
||||
private readonly IFontFactory fontFactory;
|
||||
|
||||
private readonly Dictionary<IndirectReference, IFont> loadedFonts = new Dictionary<IndirectReference, IFont>();
|
||||
private readonly Dictionary<NameToken, IndirectReference> currentResourceState = new Dictionary<NameToken, IndirectReference>();
|
||||
|
||||
public ResourceContainer(IPdfObjectScanner scanner, IFontFactory fontFactory)
|
||||
public ResourceContainer(IPdfTokenScanner scanner, IFontFactory fontFactory)
|
||||
{
|
||||
this.scanner = scanner;
|
||||
this.fontFactory = fontFactory;
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
namespace UglyToad.PdfPig.ContentStream
|
||||
{
|
||||
using System.Diagnostics;
|
||||
|
||||
/// <summary>
|
||||
/// Used to uniquely identify and refer to objects in the PDF file.
|
||||
/// </summary>
|
||||
@@ -20,6 +22,7 @@
|
||||
/// </summary>
|
||||
/// <param name="objectNumber">The object number.</param>
|
||||
/// <param name="generation">The generation number.</param>
|
||||
[DebuggerStepThrough]
|
||||
public IndirectReference(long objectNumber, int generation)
|
||||
{
|
||||
ObjectNumber = objectNumber;
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using Tokenization.Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
internal class CrossReferenceTable
|
||||
@@ -15,9 +16,9 @@
|
||||
public IReadOnlyDictionary<IndirectReference, long> ObjectOffsets => objectOffsets;
|
||||
|
||||
[NotNull]
|
||||
public PdfDictionary Dictionary { get; }
|
||||
public DictionaryToken Dictionary { get; }
|
||||
|
||||
public CrossReferenceTable(CrossReferenceType type, IReadOnlyDictionary<IndirectReference, long> objectOffsets, PdfDictionary dictionary)
|
||||
public CrossReferenceTable(CrossReferenceType type, IReadOnlyDictionary<IndirectReference, long> objectOffsets, DictionaryToken dictionary)
|
||||
{
|
||||
if (objectOffsets == null)
|
||||
{
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Logging;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
/// <summary>
|
||||
///
|
||||
@@ -31,7 +31,7 @@
|
||||
public CrossReferenceTable Build(long startXrefOffset, ILog log)
|
||||
{
|
||||
CrossReferenceType type = CrossReferenceType.Table;
|
||||
PdfDictionary trailerDictionary = new PdfDictionary();
|
||||
DictionaryToken trailerDictionary = new DictionaryToken(new Dictionary<IToken, IToken>());
|
||||
Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
|
||||
|
||||
List<long> xrefSeqBytePos = new List<long>();
|
||||
@@ -59,7 +59,7 @@
|
||||
|
||||
while (currentPart.Dictionary != null)
|
||||
{
|
||||
long prevBytePos = currentPart.Dictionary.GetLongOrDefault(CosName.PREV, -1L);
|
||||
long prevBytePos = currentPart.GetPreviousOffset();
|
||||
if (prevBytePos == -1)
|
||||
{
|
||||
break;
|
||||
@@ -91,16 +91,16 @@
|
||||
var currentObject = parts.First(x => x.Offset == bPos);
|
||||
if (currentObject.Dictionary != null)
|
||||
{
|
||||
foreach (var entry in currentObject.Dictionary)
|
||||
foreach (var entry in currentObject.Dictionary.Data)
|
||||
{
|
||||
/*
|
||||
* If we're at a second trailer, we have a linearized pdf file, meaning that the first Size entry represents
|
||||
* all of the objects so we don't need to grab the second.
|
||||
*/
|
||||
if (!entry.Key.Name.Equals("Size")
|
||||
|| !trailerDictionary.ContainsKey(CosName.Create("Size")))
|
||||
if (!entry.Key.Equals("Size", StringComparison.OrdinalIgnoreCase)
|
||||
|| !trailerDictionary.ContainsKey(NameToken.Size))
|
||||
{
|
||||
trailerDictionary.Set(entry.Key, entry.Value);
|
||||
trailerDictionary = trailerDictionary.With(entry.Key, entry.Value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
/// <summary>
|
||||
///
|
||||
@@ -29,11 +29,11 @@
|
||||
|
||||
public long Previous { get; }
|
||||
|
||||
public PdfDictionary Dictionary { get; }
|
||||
public DictionaryToken Dictionary { get; private set; }
|
||||
|
||||
public CrossReferenceType Type { get; }
|
||||
|
||||
public CrossReferenceTablePart(IReadOnlyDictionary<IndirectReference, long> objectOffsets, long offset, long previous, PdfDictionary dictionary, CrossReferenceType type)
|
||||
public CrossReferenceTablePart(IReadOnlyDictionary<IndirectReference, long> objectOffsets, long offset, long previous, DictionaryToken dictionary, CrossReferenceType type)
|
||||
{
|
||||
ObjectOffsets = objectOffsets;
|
||||
Offset = offset;
|
||||
@@ -45,7 +45,17 @@
|
||||
public void FixOffset(long offset)
|
||||
{
|
||||
Offset = offset;
|
||||
Dictionary.SetLong(CosName.PREV, offset);
|
||||
Dictionary = Dictionary.With(NameToken.Prev, new NumericToken(offset));
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads the Prev entry from <see cref="Dictionary"/>.
/// </summary>
/// <returns>
/// The value of the Prev entry as a long when it is present and numeric, otherwise -1.
/// </returns>
public long GetPreviousOffset()
{
    const long missing = -1;

    if (!Dictionary.TryGet(NameToken.Prev, out var previous))
    {
        return missing;
    }

    // A Prev entry which is not a number is treated the same as an absent one.
    var previousNumber = previous as NumericToken;

    return previousNumber != null ? previousNumber.Long : missing;
}
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,7 @@
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class CrossReferenceTablePartBuilder
|
||||
{
|
||||
@@ -11,7 +12,7 @@
|
||||
|
||||
public long Previous { get; set; }
|
||||
|
||||
public PdfDictionary Dictionary { get; set; }
|
||||
public DictionaryToken Dictionary { get; set; }
|
||||
|
||||
public CrossReferenceType XRefType { get; set; }
|
||||
|
||||
|
||||
@@ -10,9 +10,9 @@
|
||||
|
||||
internal class EncodingReader : IEncodingReader
|
||||
{
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
public EncodingReader(IPdfObjectScanner pdfScanner)
|
||||
public EncodingReader(IPdfTokenScanner pdfScanner)
|
||||
{
|
||||
this.pdfScanner = pdfScanner;
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
return number.Int;
|
||||
}
|
||||
|
||||
public static decimal[] GetWidths(IPdfObjectScanner pdfScanner, DictionaryToken dictionary, bool isLenientParsing)
|
||||
public static decimal[] GetWidths(IPdfTokenScanner pdfScanner, DictionaryToken dictionary, bool isLenientParsing)
|
||||
{
|
||||
if (!dictionary.TryGet(NameToken.Widths, out var token))
|
||||
{
|
||||
@@ -53,7 +53,7 @@
|
||||
return result;
|
||||
}
|
||||
|
||||
public static FontDescriptor GetFontDescriptor(IPdfObjectScanner pdfScanner, FontDescriptorFactory fontDescriptorFactory, DictionaryToken dictionary,
|
||||
public static FontDescriptor GetFontDescriptor(IPdfTokenScanner pdfScanner, FontDescriptorFactory fontDescriptorFactory, DictionaryToken dictionary,
|
||||
bool isLenientParsing)
|
||||
{
|
||||
if (!dictionary.TryGet(NameToken.FontDesc, out var obj))
|
||||
@@ -68,7 +68,7 @@
|
||||
return descriptor;
|
||||
}
|
||||
|
||||
public static NameToken GetName(IPdfObjectScanner pdfScanner, DictionaryToken dictionary, FontDescriptor descriptor, bool isLenientParsing)
|
||||
public static NameToken GetName(IPdfTokenScanner pdfScanner, DictionaryToken dictionary, FontDescriptor descriptor, bool isLenientParsing)
|
||||
{
|
||||
if (dictionary.TryGet(NameToken.BaseFont, out var nameBase))
|
||||
{
|
||||
|
||||
@@ -20,9 +20,9 @@
|
||||
private readonly FontDescriptorFactory fontDescriptorFactory;
|
||||
private readonly TrueTypeFontParser trueTypeFontParser;
|
||||
private readonly IEncodingReader encodingReader;
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
public TrueTypeFontHandler(IPdfObjectScanner pdfScanner, IFilterProvider filterProvider,
|
||||
public TrueTypeFontHandler(IPdfTokenScanner pdfScanner, IFilterProvider filterProvider,
|
||||
CMapCache cMapCache,
|
||||
FontDescriptorFactory fontDescriptorFactory,
|
||||
TrueTypeFontParser trueTypeFontParser,
|
||||
|
||||
@@ -18,10 +18,10 @@
|
||||
private readonly CidFontFactory cidFontFactory;
|
||||
private readonly CMapCache cMapCache;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly IPdfObjectScanner scanner;
|
||||
private readonly IPdfTokenScanner scanner;
|
||||
|
||||
public Type0FontHandler(CidFontFactory cidFontFactory, CMapCache cMapCache, IFilterProvider filterProvider,
|
||||
IPdfObjectScanner scanner)
|
||||
IPdfTokenScanner scanner)
|
||||
{
|
||||
this.cidFontFactory = cidFontFactory;
|
||||
this.cMapCache = cMapCache;
|
||||
|
||||
@@ -16,14 +16,14 @@
|
||||
|
||||
internal class Type1FontHandler : IFontHandler
|
||||
{
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly CMapCache cMapCache;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly FontDescriptorFactory fontDescriptorFactory;
|
||||
private readonly IEncodingReader encodingReader;
|
||||
private readonly Type1FontParser type1FontParser;
|
||||
|
||||
public Type1FontHandler(IPdfObjectScanner pdfScanner, CMapCache cMapCache, IFilterProvider filterProvider,
|
||||
public Type1FontHandler(IPdfTokenScanner pdfScanner, CMapCache cMapCache, IFilterProvider filterProvider,
|
||||
FontDescriptorFactory fontDescriptorFactory,
|
||||
IEncodingReader encodingReader,
|
||||
Type1FontParser type1FontParser)
|
||||
|
||||
@@ -18,9 +18,9 @@
|
||||
private readonly CMapCache cMapCache;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly IEncodingReader encodingReader;
|
||||
private readonly IPdfObjectScanner scanner;
|
||||
private readonly IPdfTokenScanner scanner;
|
||||
|
||||
public Type3FontHandler(IPdfObjectScanner scanner, CMapCache cMapCache, IFilterProvider filterProvider,
|
||||
public Type3FontHandler(IPdfTokenScanner scanner, CMapCache cMapCache, IFilterProvider filterProvider,
|
||||
IEncodingReader encodingReader)
|
||||
{
|
||||
this.cMapCache = cMapCache;
|
||||
|
||||
@@ -19,9 +19,9 @@
|
||||
private readonly FontDescriptorFactory descriptorFactory;
|
||||
private readonly TrueTypeFontParser trueTypeFontParser;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
public CidFontFactory(IPdfObjectScanner pdfScanner, FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser,
|
||||
public CidFontFactory(IPdfTokenScanner pdfScanner, FontDescriptorFactory descriptorFactory, TrueTypeFontParser trueTypeFontParser,
|
||||
IFilterProvider filterProvider)
|
||||
{
|
||||
this.descriptorFactory = descriptorFactory;
|
||||
|
||||
@@ -11,9 +11,9 @@
|
||||
|
||||
internal class CatalogFactory
|
||||
{
|
||||
private readonly IPdfObjectScanner scanner;
|
||||
private readonly IPdfTokenScanner scanner;
|
||||
|
||||
public CatalogFactory(IPdfObjectScanner scanner)
|
||||
public CatalogFactory(IPdfTokenScanner scanner)
|
||||
{
|
||||
this.scanner = scanner;
|
||||
}
|
||||
|
||||
@@ -1,44 +1,48 @@
|
||||
namespace UglyToad.PdfPig.Parser
|
||||
{
|
||||
using Content;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Parts;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class DocumentInformationFactory
|
||||
{
|
||||
public DocumentInformation Create(IPdfObjectParser pdfObjectParser,
|
||||
PdfDictionary rootDictionary, IRandomAccessRead reader,
|
||||
bool isLenientParsing)
|
||||
public DocumentInformation Create(IPdfTokenScanner pdfTokenScanner, DictionaryToken rootDictionary)
|
||||
{
|
||||
if (!rootDictionary.TryGetItemOfType(CosName.INFO, out CosObject infoBase))
|
||||
if (!rootDictionary.TryGet(NameToken.Info, out var infoBase))
|
||||
{
|
||||
return DocumentInformation.Default;
|
||||
}
|
||||
|
||||
var infoParsed = pdfObjectParser.Parse(infoBase.ToIndirectReference(), reader, isLenientParsing);
|
||||
var infoParsed = DirectObjectFinder.Get<DictionaryToken>(infoBase, pdfTokenScanner);
|
||||
|
||||
if (!(infoParsed is PdfDictionary infoDictionary))
|
||||
{
|
||||
return DocumentInformation.Default;
|
||||
}
|
||||
|
||||
var title = GetEntryOrDefault(infoDictionary, CosName.TITLE);
|
||||
var author = GetEntryOrDefault(infoDictionary, CosName.AUTHOR);
|
||||
var subject = GetEntryOrDefault(infoDictionary, CosName.SUBJECT);
|
||||
var keywords = GetEntryOrDefault(infoDictionary, CosName.KEYWORDS);
|
||||
var creator = GetEntryOrDefault(infoDictionary, CosName.CREATOR);
|
||||
var producer = GetEntryOrDefault(infoDictionary, CosName.PRODUCER);
|
||||
var title = GetEntryOrDefault(infoParsed, NameToken.Title);
|
||||
var author = GetEntryOrDefault(infoParsed, NameToken.Author);
|
||||
var subject = GetEntryOrDefault(infoParsed, NameToken.Subject);
|
||||
var keywords = GetEntryOrDefault(infoParsed, NameToken.Keywords);
|
||||
var creator = GetEntryOrDefault(infoParsed, NameToken.Creator);
|
||||
var producer = GetEntryOrDefault(infoParsed, NameToken.Producer);
|
||||
|
||||
return new DocumentInformation(title, author, subject,
|
||||
keywords, creator, producer);
|
||||
}
|
||||
|
||||
private static string GetEntryOrDefault(PdfDictionary infoDictionary, CosName key)
|
||||
private static string GetEntryOrDefault(DictionaryToken infoDictionary, NameToken key)
|
||||
{
|
||||
if (infoDictionary.TryGetItemOfType(key, out CosString str))
|
||||
if (!infoDictionary.TryGet(key, out var value))
|
||||
{
|
||||
return str.GetAscii();
|
||||
return null;
|
||||
}
|
||||
|
||||
if (value is StringToken str)
|
||||
{
|
||||
return str.Data;
|
||||
}
|
||||
|
||||
if (value is HexToken hex)
|
||||
{
|
||||
return hex.Data;
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
@@ -15,122 +13,93 @@
|
||||
|
||||
internal class CrossReferenceParser
|
||||
{
|
||||
private const int X = 'x';
|
||||
|
||||
private readonly ILog log;
|
||||
private readonly CosDictionaryParser dictionaryParser;
|
||||
private readonly CosBaseParser baseParser;
|
||||
private readonly CosStreamParser streamParser;
|
||||
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
|
||||
private readonly CrossReferenceTableParser crossReferenceTableParser;
|
||||
private readonly OldCrossReferenceTableParser oldCrossReferenceTableParser;
|
||||
private readonly XrefCosOffsetChecker xrefCosChecker;
|
||||
|
||||
public CrossReferenceParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
|
||||
CosStreamParser streamParser,
|
||||
CrossReferenceStreamParser crossReferenceStreamParser,
|
||||
CrossReferenceTableParser crossReferenceTableParser,
|
||||
OldCrossReferenceTableParser oldCrossReferenceTableParser)
|
||||
CrossReferenceTableParser crossReferenceTableParser)
|
||||
{
|
||||
this.log = log;
|
||||
this.dictionaryParser = dictionaryParser;
|
||||
this.baseParser = baseParser;
|
||||
this.streamParser = streamParser;
|
||||
this.crossReferenceStreamParser = crossReferenceStreamParser;
|
||||
this.crossReferenceTableParser = crossReferenceTableParser;
|
||||
this.oldCrossReferenceTableParser = oldCrossReferenceTableParser;
|
||||
}
|
||||
|
||||
public CrossReferenceTable ParseNew(long crossReferenceLocation, ISeekableTokenScanner scanner,
|
||||
bool isLenientParsing)
|
||||
{
|
||||
var previousLocation = crossReferenceLocation;
|
||||
|
||||
var visitedCrossReferences = new HashSet<long>();
|
||||
|
||||
while (previousLocation >= 0)
|
||||
{
|
||||
scanner.Seek(crossReferenceLocation);
|
||||
|
||||
scanner.MoveNext();
|
||||
|
||||
if (scanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
|
||||
{
|
||||
var table = crossReferenceTableParser.Parse(scanner, crossReferenceLocation, isLenientParsing);
|
||||
|
||||
previousLocation = table.Dictionary.GetLongOrDefault(CosName.PREV, -1);
|
||||
|
||||
|
||||
}
|
||||
else if (scanner.CurrentToken is NumericToken streamObjectNumberToken)
|
||||
{
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new PdfDocumentFormatException($"The xref object was not a stream or a table, was instead: {scanner.CurrentToken}.");
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
xrefCosChecker = new XrefCosOffsetChecker();
|
||||
}
|
||||
|
||||
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
|
||||
CosObjectPool pool)
|
||||
CosObjectPool pool, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
|
||||
{
|
||||
var xrefOffsetValidator = new XrefOffsetValidator(log, reader, dictionaryParser, baseParser, pool);
|
||||
var xrefCosChecker = new XrefCosOffsetChecker();
|
||||
long fixedOffset = xrefOffsetValidator.CheckXRefOffset(xrefLocation, isLenientParsing);
|
||||
if (fixedOffset > -1)
|
||||
{
|
||||
xrefLocation = fixedOffset;
|
||||
|
||||
log.Debug($"Found the first cross reference table or stream at {fixedOffset}.");
|
||||
}
|
||||
|
||||
var table = new CrossReferenceTableBuilder();
|
||||
|
||||
var prevSet = new HashSet<long>();
|
||||
long previousCrossReferenceLocation = xrefLocation;
|
||||
// ---- parse whole chain of xref tables/object streams using PREV reference
|
||||
HashSet<long> prevSet = new HashSet<long>();
|
||||
|
||||
// Parse all cross reference tables and streams.
|
||||
while (previousCrossReferenceLocation > 0)
|
||||
{
|
||||
log.Debug($"Reading cross reference table or stream at {previousCrossReferenceLocation}.");
|
||||
|
||||
// seek to xref table
|
||||
reader.Seek(previousCrossReferenceLocation);
|
||||
tokenScanner.Seek(previousCrossReferenceLocation);
|
||||
|
||||
ReadHelper.SkipSpaces(reader);
|
||||
tokenScanner.MoveNext();
|
||||
|
||||
var isTable = reader.Peek() == X;
|
||||
|
||||
// -- parse xref
|
||||
if (isTable)
|
||||
if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
|
||||
{
|
||||
// xref table and trailer
|
||||
// use existing parser to parse xref table
|
||||
if (!oldCrossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
|
||||
{
|
||||
throw new InvalidOperationException($"Expected trailer object at position: {reader.GetPosition()}");
|
||||
}
|
||||
log.Debug("Element was cross reference table.");
|
||||
|
||||
CrossReferenceTablePart tablePart = crossReferenceTableParser.Parse(tokenScanner,
|
||||
previousCrossReferenceLocation, isLenientParsing);
|
||||
|
||||
previousCrossReferenceLocation = tablePart.GetPreviousOffset();
|
||||
|
||||
DictionaryToken tableDictionary = tablePart.Dictionary;
|
||||
|
||||
PdfDictionary trailer = tableBuilder.Dictionary;
|
||||
CrossReferenceTablePart streamPart = null;
|
||||
|
||||
// check for a XRef stream, it may contain some object ids of compressed objects
|
||||
if (trailer.ContainsKey(CosName.XREF_STM))
|
||||
if (tableDictionary.ContainsKey(NameToken.XrefStm))
|
||||
{
|
||||
int streamOffset = trailer.GetIntOrDefault(CosName.XREF_STM);
|
||||
log.Debug("Cross reference table contained referenced to stream. Reading the stream.");
|
||||
|
||||
int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;
|
||||
|
||||
// check the xref stream reference
|
||||
fixedOffset = xrefOffsetValidator.CheckXRefOffset(streamOffset, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != streamOffset)
|
||||
{
|
||||
log.Warn("/XRefStm offset " + streamOffset + " is incorrect, corrected to " + fixedOffset);
|
||||
log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");
|
||||
|
||||
streamOffset = (int)fixedOffset;
|
||||
trailer.SetInt(CosName.XREF_STM, streamOffset);
|
||||
tableBuilder.Offset = streamOffset;
|
||||
|
||||
// Update the cross reference table to be a stream instead.
|
||||
tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
|
||||
tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
|
||||
tablePart.Previous, tableDictionary, tablePart.Type);
|
||||
}
|
||||
|
||||
// Read the stream from the table.
|
||||
if (streamOffset > 0)
|
||||
{
|
||||
reader.Seek(streamOffset);
|
||||
ReadHelper.SkipSpaces(reader);
|
||||
try
|
||||
{
|
||||
streamPart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
|
||||
streamPart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
@@ -140,7 +109,7 @@
|
||||
}
|
||||
else
|
||||
{
|
||||
throw ex;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -152,35 +121,27 @@
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
|
||||
throw new PdfDocumentFormatException("Skipped XRef stream due to a corrupt offset:" + streamOffset);
|
||||
}
|
||||
}
|
||||
}
|
||||
previousCrossReferenceLocation = trailer.GetLongOrDefault(CosName.PREV);
|
||||
if (previousCrossReferenceLocation > 0)
|
||||
{
|
||||
// check the xref table reference
|
||||
fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
|
||||
{
|
||||
previousCrossReferenceLocation = fixedOffset;
|
||||
trailer.SetLong(CosName.PREV, previousCrossReferenceLocation);
|
||||
}
|
||||
}
|
||||
|
||||
tableBuilder.Previous = tableBuilder.Dictionary.GetLongOrDefault(CosName.PREV);
|
||||
|
||||
table.Add(tableBuilder.Build());
|
||||
table.Add(tablePart);
|
||||
|
||||
if (streamPart != null)
|
||||
{
|
||||
table.Add(streamPart);
|
||||
}
|
||||
}
|
||||
else
|
||||
else if (tokenScanner.CurrentToken is NumericToken)
|
||||
{
|
||||
log.Debug("Element was cross reference stream.");
|
||||
|
||||
// Unread the numeric token.
|
||||
tokenScanner.Seek(previousCrossReferenceLocation);
|
||||
|
||||
// parse xref stream
|
||||
var tablePart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
|
||||
var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
|
||||
table.Add(tablePart);
|
||||
|
||||
previousCrossReferenceLocation = tablePart.Previous;
|
||||
@@ -195,10 +156,19 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log.Debug("Element was invalid.");
|
||||
|
||||
throw new PdfDocumentFormatException("The cross reference found at this location was not a " +
|
||||
$"table or a stream: Location - {previousCrossReferenceLocation}, {tokenScanner.CurrentPosition}.");
|
||||
}
|
||||
|
||||
if (prevSet.Contains(previousCrossReferenceLocation))
|
||||
{
|
||||
throw new InvalidOperationException("/Prev loop at offset " + previousCrossReferenceLocation);
|
||||
throw new PdfDocumentFormatException("The cross references formed an infinite loop.");
|
||||
}
|
||||
|
||||
prevSet.Add(previousCrossReferenceLocation);
|
||||
}
|
||||
|
||||
@@ -210,19 +180,20 @@
|
||||
return resolved;
|
||||
}
|
||||
|
||||
private CrossReferenceTablePart ParseCrossReferenceStream(IRandomAccessRead reader, long objByteOffset, CosObjectPool pool,
|
||||
bool isLenientParsing)
|
||||
private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
|
||||
{
|
||||
// ---- parse indirect object head
|
||||
ObjectHelper.ReadObjectNumber(reader);
|
||||
ObjectHelper.ReadGenerationNumber(reader);
|
||||
pdfScanner.Seek(objByteOffset);
|
||||
|
||||
ReadHelper.ReadExpectedString(reader, "obj", true);
|
||||
pdfScanner.MoveNext();
|
||||
|
||||
PdfDictionary dict = dictionaryParser.Parse(reader, baseParser, pool);
|
||||
var streamObjectToken = (ObjectToken)pdfScanner.CurrentToken;
|
||||
|
||||
PdfRawStream xrefStream = streamParser.Parse(reader, dict, isLenientParsing, null);
|
||||
CrossReferenceTablePart xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, xrefStream);
|
||||
if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
|
||||
{
|
||||
throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");
|
||||
}
|
||||
|
||||
CrossReferenceTablePart xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);
|
||||
|
||||
return xrefTablePart;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using Parts.CrossReference;
|
||||
@@ -183,7 +182,7 @@
|
||||
return objectCount;
|
||||
}
|
||||
|
||||
private static PdfDictionary ParseTrailer(ISeekableTokenScanner scanner, bool isLenientParsing)
|
||||
private static DictionaryToken ParseTrailer(ISeekableTokenScanner scanner, bool isLenientParsing)
|
||||
{
|
||||
if (scanner.CurrentToken is OperatorToken trailerToken && trailerToken.Data == "trailer")
|
||||
{
|
||||
@@ -192,7 +191,7 @@
|
||||
throw new PdfDocumentFormatException($"Expected to find a dictionary in the trailer but instead found: {scanner.CurrentToken}.");
|
||||
}
|
||||
|
||||
return PdfDictionary.FromDictionaryToken(trailerDictionary);
|
||||
return trailerDictionary;
|
||||
}
|
||||
|
||||
if (isLenientParsing)
|
||||
@@ -210,7 +209,7 @@
|
||||
|
||||
if (foundTrailer && scanner.TryReadToken(out DictionaryToken trailerDictionary))
|
||||
{
|
||||
return PdfDictionary.FromDictionaryToken(trailerDictionary);
|
||||
return trailerDictionary;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -146,7 +146,7 @@
|
||||
throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
|
||||
}
|
||||
|
||||
builder.Dictionary = trailer;
|
||||
//builder.Dictionary = trailer;
|
||||
builder.Previous = trailer.GetLongOrDefault(CosName.PREV);
|
||||
|
||||
return true;
|
||||
|
||||
@@ -20,9 +20,9 @@
|
||||
private readonly IResourceStore resourceStore;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly IPageContentParser pageContentParser;
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
public PageFactory(IPdfObjectScanner pdfScanner, IResourceStore resourceStore, IFilterProvider filterProvider,
|
||||
public PageFactory(IPdfTokenScanner pdfScanner, IResourceStore resourceStore, IFilterProvider filterProvider,
|
||||
IPageContentParser pageContentParser)
|
||||
{
|
||||
this.resourceStore = resourceStore;
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
namespace UglyToad.PdfPig.Parser.Parts.CrossReference
|
||||
{
|
||||
using System;
|
||||
using Exceptions;
|
||||
using Tokenization.Tokens;
|
||||
using Util;
|
||||
|
||||
/// <summary>
|
||||
/// The array representing the size of the fields in a cross reference stream.
|
||||
/// </summary>
|
||||
internal class CrossReferenceStreamFieldSize
{
    /// <summary>
    /// The size in bytes of the first field, which holds the type of the entry.
    /// </summary>
    public int Field1Size { get; }

    /// <summary>
    /// The size in bytes of the second field. For entry types 0 and 2 this field is the object number,
    /// for type 1 it is the byte offset from the beginning of the file.
    /// </summary>
    public int Field2Size { get; }

    /// <summary>
    /// The size in bytes of the third field. For entry types 0 and 1 this field is the generation number,
    /// for type 2 it is the stream index.
    /// </summary>
    public int Field3Size { get; }

    /// <summary>
    /// How many bytes are in a line: the sum of the three field sizes. Always greater than zero.
    /// </summary>
    public int LineLength { get; }

    /// <summary>
    /// Reads the field sizes from the W entry of a cross reference stream dictionary.
    /// </summary>
    /// <param name="dictionary">The stream dictionary containing the W array.</param>
    /// <exception cref="ArgumentNullException">The <paramref name="dictionary"/> is null.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// The W entry is missing, is not an array, has fewer than 3 entries, contains a negative size
    /// or describes a line of zero bytes.
    /// </exception>
    public CrossReferenceStreamFieldSize(DictionaryToken dictionary)
    {
        if (dictionary == null)
        {
            throw new ArgumentNullException(nameof(dictionary));
        }

        if (!dictionary.TryGet(NameToken.W, out var token) || !(token is ArrayToken wArray))
        {
            throw new PdfDocumentFormatException($"The W entry for the stream dictionary was not an array: {token}.");
        }

        if (wArray.Data.Count < 3)
        {
            throw new PdfDocumentFormatException($"There must be at least 3 entries in a W entry for a stream dictionary: {wArray}.");
        }

        Field1Size = wArray.GetNumeric(0).Int;
        Field2Size = wArray.GetNumeric(1).Int;
        Field3Size = wArray.GetNumeric(2).Int;

        // A zero size means the field is absent, a negative size is never meaningful.
        if (Field1Size < 0 || Field2Size < 0 || Field3Size < 0)
        {
            throw new PdfDocumentFormatException($"The W entry for a stream dictionary cannot contain negative field sizes: {wArray}.");
        }

        LineLength = Field1Size + Field2Size + Field3Size;

        // Consumers divide the decoded stream length by the line length, so reject an empty line here
        // with a format error rather than letting a DivideByZeroException surface later.
        if (LineLength <= 0)
        {
            throw new PdfDocumentFormatException($"The W entry for a stream dictionary must describe at least one byte per line: {wArray}.");
        }
    }
}
|
||||
}
|
||||
@@ -1,11 +1,11 @@
|
||||
namespace UglyToad.PdfPig.Parser.Parts.CrossReference
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using Filters;
|
||||
using Tokenization.Tokens;
|
||||
using Util;
|
||||
|
||||
internal class CrossReferenceStreamParser
|
||||
{
|
||||
@@ -19,93 +19,91 @@
|
||||
/// <summary>
|
||||
/// Parses through the unfiltered stream and populates the xrefTable HashMap.
|
||||
/// </summary>
|
||||
public CrossReferenceTablePart Parse(long streamOffset, PdfRawStream stream)
|
||||
public CrossReferenceTablePart Parse(long streamOffset, StreamToken stream)
|
||||
{
|
||||
var w = stream.Dictionary.GetDictionaryObject(CosName.W);
|
||||
if (!(w is COSArray format))
|
||||
byte[] decoded = stream.Decode(filterProvider);
|
||||
|
||||
var fieldSizes = new CrossReferenceStreamFieldSize(stream.StreamDictionary);
|
||||
|
||||
var lineCount = decoded.Length / fieldSizes.LineLength;
|
||||
|
||||
long previousOffset = -1;
|
||||
if (stream.StreamDictionary.TryGet(NameToken.Prev, out var prevToken) && prevToken is NumericToken prevNumeric)
|
||||
{
|
||||
throw new IOException("/W array is missing in Xref stream");
|
||||
previousOffset = prevNumeric.Long;
|
||||
}
|
||||
|
||||
var objNums = GetObjectNumbers(stream);
|
||||
|
||||
/*
|
||||
* Calculating the size of the line in bytes
|
||||
*/
|
||||
int w0 = format.getInt(0);
|
||||
int w1 = format.getInt(1);
|
||||
int w2 = format.getInt(2);
|
||||
int lineSize = w0 + w1 + w2;
|
||||
|
||||
var decoded = stream.Decode(filterProvider);
|
||||
|
||||
var lineCount = decoded.Length / lineSize;
|
||||
var lineNumber = 0;
|
||||
|
||||
var builder = new CrossReferenceTablePartBuilder
|
||||
{
|
||||
Offset = streamOffset,
|
||||
Previous = stream.Dictionary.GetLongOrDefault(CosName.PREV),
|
||||
Dictionary = stream.Dictionary,
|
||||
Previous = previousOffset,
|
||||
Dictionary = stream.StreamDictionary,
|
||||
XRefType = CrossReferenceType.Stream
|
||||
};
|
||||
|
||||
using (IEnumerator<long> objIter = objNums.GetEnumerator())
|
||||
{
|
||||
var currLine = new byte[lineSize];
|
||||
var objectNumbers = GetObjectNumbers(stream.StreamDictionary);
|
||||
|
||||
while (lineNumber < lineCount && objIter.MoveNext())
|
||||
var lineNumber = 0;
|
||||
var lineBuffer = new byte[fieldSizes.LineLength];
|
||||
foreach (var objectNumber in objectNumbers)
|
||||
{
|
||||
var byteOffset = lineNumber * lineSize;
|
||||
for (int i = 0; i < lineSize; i++)
|
||||
if (lineNumber >= lineCount)
|
||||
{
|
||||
currLine[i] = decoded[byteOffset + i];
|
||||
break;
|
||||
}
|
||||
|
||||
var byteOffset = lineNumber * fieldSizes.LineLength;
|
||||
|
||||
for (var i = 0; i < fieldSizes.LineLength; i++)
|
||||
{
|
||||
lineBuffer[i] = decoded[byteOffset + i];
|
||||
}
|
||||
|
||||
int type;
|
||||
if (w0 == 0)
|
||||
if (fieldSizes.Field1Size == 0)
|
||||
{
|
||||
// "If the first element is zero,
|
||||
// the type field shall not be present, and shall default to type 1"
|
||||
type = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
type = 0;
|
||||
/*
|
||||
* Grabs the number of bytes specified for the first column in
|
||||
* the W array and stores it.
|
||||
*/
|
||||
for (int i = 0; i < w0; i++)
|
||||
|
||||
for (var i = 0; i < fieldSizes.Field1Size; i++)
|
||||
{
|
||||
type += (currLine[i] & 0x00ff) << ((w0 - i - 1) * 8);
|
||||
type += (lineBuffer[i] & 0x00ff) << ((fieldSizes.Field1Size - i - 1) * 8);
|
||||
}
|
||||
}
|
||||
//Need to remember the current objID
|
||||
long objectId = objIter.Current;
|
||||
/*
|
||||
* 3 different types of entries.
|
||||
*/
|
||||
|
||||
ReadNextStreamObject(type, objectNumber, fieldSizes, builder, lineBuffer);
|
||||
|
||||
lineNumber++;
|
||||
}
|
||||
|
||||
return builder.Build();
|
||||
}
|
||||
|
||||
private static void ReadNextStreamObject(int type, long objectNumber, CrossReferenceStreamFieldSize fieldSizes,
|
||||
CrossReferenceTablePartBuilder builder, byte[] lineBuffer)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
case 0:
|
||||
/*
|
||||
* Skipping free objects
|
||||
*/
|
||||
// Ignore free objects.
|
||||
break;
|
||||
case 1:
|
||||
// Non object stream entries.
|
||||
int offset = 0;
|
||||
for (int i = 0; i < w1; i++)
|
||||
for (int i = 0; i < fieldSizes.Field2Size; i++)
|
||||
{
|
||||
offset += (currLine[i + w0] & 0x00ff) << ((w1 - i - 1) * 8);
|
||||
offset += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
|
||||
}
|
||||
int genNum = 0;
|
||||
for (int i = 0; i < w2; i++)
|
||||
for (int i = 0; i < fieldSizes.Field3Size; i++)
|
||||
{
|
||||
genNum += (currLine[i + w0 + w1] & 0x00ff) << ((w2 - i - 1) * 8);
|
||||
genNum += (lineBuffer[i + fieldSizes.Field1Size + fieldSizes.Field2Size] & 0x00ff) << ((fieldSizes.Field3Size - i - 1) * 8);
|
||||
}
|
||||
|
||||
builder.Add(objectId, genNum, offset);
|
||||
builder.Add(objectNumber, genNum, offset);
|
||||
|
||||
break;
|
||||
case 2:
|
||||
@@ -125,48 +123,43 @@
|
||||
* distinguish from file offsets
|
||||
*/
|
||||
int objstmObjNr = 0;
|
||||
for (int i = 0; i < w1; i++)
|
||||
for (int i = 0; i < fieldSizes.Field2Size; i++)
|
||||
{
|
||||
objstmObjNr += (currLine[i + w0] & 0x00ff) << ((w1 - i - 1) * 8);
|
||||
objstmObjNr += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
|
||||
}
|
||||
|
||||
builder.Add(objectId, 0, -objstmObjNr);
|
||||
builder.Add(objectNumber, 0, -objstmObjNr);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
lineNumber++;
|
||||
}
|
||||
}
|
||||
|
||||
return builder.Build();
|
||||
}
|
||||
|
||||
private static List<long> GetObjectNumbers(PdfRawStream stream)
|
||||
private static List<long> GetObjectNumbers(DictionaryToken dictionary)
|
||||
{
|
||||
var indexArray = (COSArray) stream.Dictionary.GetDictionaryObject(CosName.INDEX);
|
||||
|
||||
// If Index doesn't exist, we will use the default values.
|
||||
if (indexArray == null)
|
||||
if (!dictionary.TryGet(NameToken.Size, out var sizeToken) || !(sizeToken is NumericToken sizeNumeric))
|
||||
{
|
||||
indexArray = new COSArray();
|
||||
indexArray.add(CosInt.Zero);
|
||||
indexArray.add(stream.Dictionary.GetDictionaryObject(CosName.SIZE));
|
||||
throw new PdfDocumentFormatException($"The stream dictionary must contain a numeric size value: {dictionary}.");
|
||||
}
|
||||
|
||||
var indexArray = new[] { 0, sizeNumeric.Int };
|
||||
|
||||
if (dictionary.TryGet(NameToken.Index, out var indexToken) && indexToken is ArrayToken indexArrayToken)
|
||||
{
|
||||
indexArray = new[]
|
||||
{
|
||||
indexArrayToken.GetNumeric(0).Int,
|
||||
indexArrayToken.GetNumeric(1).Int
|
||||
};
|
||||
}
|
||||
|
||||
List<long> objNums = new List<long>();
|
||||
|
||||
// Populates objNums with all object numbers available
|
||||
var firstObjectNumber = indexArray[0];
|
||||
var size = indexArray[1];
|
||||
|
||||
for (int i = 0; i < indexArray.Count; i+=2)
|
||||
for (var i = 0; i < size; i++)
|
||||
{
|
||||
var longId = ((CosInt) indexArray.get(i)).AsLong();
|
||||
var size = ((CosInt)indexArray.get(i + 1)).AsInt();
|
||||
|
||||
for (int j = 0; j < size; j++)
|
||||
{
|
||||
objNums.Add(longId + j);
|
||||
}
|
||||
objNums.Add(firstObjectNumber + i);
|
||||
}
|
||||
|
||||
return objNums;
|
||||
|
||||
@@ -32,7 +32,7 @@
|
||||
throw new InvalidOperationException($"Could not find the object {baseObject.ToIndirectReference()} with type {typeof(T).Name}.");
|
||||
}
|
||||
|
||||
public static T Get<T>(IToken token, IPdfObjectScanner scanner) where T : IToken
|
||||
public static T Get<T>(IToken token, IPdfTokenScanner scanner) where T : IToken
|
||||
{
|
||||
if (token is T result)
|
||||
{
|
||||
|
||||
@@ -53,33 +53,33 @@
|
||||
private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
|
||||
{
|
||||
var log = container.Get<ILog>();
|
||||
var filterProvider = container.Get<IFilterProvider>();
|
||||
var bruteForceSearcher = new BruteForceSearcher(reader);
|
||||
var pool = new CosObjectPool();
|
||||
|
||||
CrossReferenceTable crossReferenceTable = null;
|
||||
|
||||
// We're ok with this since our intent is to lazily load the cross reference table.
|
||||
// ReSharper disable once AccessToModifiedClosure
|
||||
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, pool, bruteForceSearcher);
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider);
|
||||
|
||||
var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
|
||||
|
||||
var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
|
||||
|
||||
var pool = new CosObjectPool();
|
||||
|
||||
// TODO: make this use the scanner.
|
||||
var validator = new CrossReferenceOffsetValidator(new XrefOffsetValidator(log, reader, container.Get<CosDictionaryParser>(),
|
||||
container.Get<CosBaseParser>(), pool));
|
||||
|
||||
crossReferenceOffset = validator.Validate(crossReferenceOffset, isLenientParsing);
|
||||
|
||||
var crossReferenceTable = container.Get<CrossReferenceParser>()
|
||||
.Parse(reader, isLenientParsing, crossReferenceOffset, pool);
|
||||
|
||||
// container.Get<CrossReferenceParser>().ParseNew(crossReferenceOffset, scanner, isLenientParsing);
|
||||
|
||||
var filterProvider = container.Get<IFilterProvider>();
|
||||
var bruteForceSearcher = new BruteForceSearcher(reader);
|
||||
var pdfObjectParser = new PdfObjectParser(container.Get<ILog>(), container.Get<CosBaseParser>(),
|
||||
container.Get<CosStreamParser>(), crossReferenceTable, bruteForceSearcher, pool, container.Get<ObjectStreamParser>());
|
||||
crossReferenceTable = container.Get<CrossReferenceParser>()
|
||||
.Parse(reader, isLenientParsing, crossReferenceOffset, pool, pdfScanner, scanner);
|
||||
|
||||
var trueTypeFontParser = new TrueTypeFontParser();
|
||||
var fontDescriptorFactory = new FontDescriptorFactory();
|
||||
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, new ObjectLocationProvider(crossReferenceTable, pool, bruteForceSearcher));
|
||||
var cidFontFactory = new CidFontFactory(pdfScanner, fontDescriptorFactory, trueTypeFontParser, filterProvider);
|
||||
var encodingReader = new EncodingReader(pdfScanner);
|
||||
|
||||
@@ -92,57 +92,37 @@
|
||||
new Type1FontHandler(pdfScanner, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, new Type1FontParser()),
|
||||
new Type3FontHandler(pdfScanner, cMapCache, filterProvider, encodingReader));
|
||||
|
||||
var dynamicParser = container.Get<DynamicParser>();
|
||||
var resourceContainer = new ResourceContainer(pdfScanner, fontFactory);
|
||||
|
||||
var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
|
||||
var informationFactory = new DocumentInformationFactory();
|
||||
var catalogFactory = new CatalogFactory(pdfScanner);
|
||||
|
||||
var rootDictionary = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool,
|
||||
isLenientParsing, pdfScanner);
|
||||
var rootDictionary = ParseTrailer(crossReferenceTable, isLenientParsing, pdfScanner);
|
||||
|
||||
var information = informationFactory.Create(pdfObjectParser, crossReferenceTable.Dictionary, reader, isLenientParsing);
|
||||
var information = informationFactory.Create(pdfScanner, crossReferenceTable.Dictionary);
|
||||
|
||||
var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);
|
||||
|
||||
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
|
||||
|
||||
|
||||
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
|
||||
pdfScanner);
|
||||
}
|
||||
|
||||
private static DictionaryToken ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,
|
||||
DynamicParser dynamicParser, BruteForceSearcher bruteForceSearcher, CosObjectPool pool, bool isLenientParsing, IPdfObjectScanner pdfObjectScanner)
|
||||
private static DictionaryToken ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner)
|
||||
{
|
||||
if (crossReferenceTable.Dictionary.ContainsKey(CosName.ENCRYPT))
|
||||
if (crossReferenceTable.Dictionary.ContainsKey(NameToken.Encrypt))
|
||||
{
|
||||
throw new NotSupportedException("Cannot currently parse a document using encryption: " + crossReferenceTable.Dictionary);
|
||||
}
|
||||
|
||||
foreach (var keyValuePair in crossReferenceTable.Dictionary)
|
||||
if (!crossReferenceTable.Dictionary.TryGet(NameToken.Root, out var rootToken))
|
||||
{
|
||||
if (keyValuePair.Value is CosObject temporaryObject && !keyValuePair.Key.Equals(CosName.ROOT))
|
||||
{
|
||||
// Loads these objects into the object pool for access later.
|
||||
dynamicParser.Parse(reader, temporaryObject, pool, crossReferenceTable, bruteForceSearcher,
|
||||
isLenientParsing, false);
|
||||
}
|
||||
throw new PdfDocumentFormatException($"Missing root object specification in trailer: {crossReferenceTable.Dictionary}.");
|
||||
}
|
||||
|
||||
CosObject root = (CosObject)crossReferenceTable.Dictionary.GetItemOrDefault(CosName.ROOT);
|
||||
if (root == null)
|
||||
{
|
||||
throw new InvalidOperationException("Missing root object specification in trailer.");
|
||||
}
|
||||
|
||||
var obj = pdfObjectScanner.Get(root.ToIndirectReference());
|
||||
|
||||
if (!(obj.Data is DictionaryToken rootDictionary))
|
||||
{
|
||||
throw new PdfDocumentFormatException($"Could not find the root dictionary, instead found: {obj.Data}");
|
||||
}
|
||||
var rootDictionary = DirectObjectFinder.Get<DictionaryToken>(rootToken, pdfTokenScanner);
|
||||
|
||||
if (!rootDictionary.ContainsKey(NameToken.Type) && isLenientParsing)
|
||||
{
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
[NotNull]
|
||||
private readonly ParsingCachingProviders cachingProviders;
|
||||
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
[NotNull]
|
||||
internal Catalog Catalog { get; }
|
||||
@@ -56,7 +56,7 @@
|
||||
ParsingCachingProviders cachingProviders,
|
||||
IPageFactory pageFactory,
|
||||
Catalog catalog,
|
||||
DocumentInformation information, IPdfObjectScanner pdfScanner)
|
||||
DocumentInformation information, IPdfTokenScanner pdfScanner)
|
||||
{
|
||||
this.log = log;
|
||||
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||
|
||||
@@ -1,40 +1,65 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
|
||||
/// <summary>
/// Tracks where indirect objects are located and holds objects which have already been read.
/// </summary>
internal interface IObjectLocationProvider
|
||||
{
|
||||
/// <summary>
/// Attempts to get the offset recorded for the object with the given reference.
/// </summary>
/// <param name="reference">The reference of the object to locate.</param>
/// <param name="offset">The offset for the object if one was found.</param>
/// <returns><see langword="true"/> if an offset was found for the reference.</returns>
bool TryGetOffset(IndirectReference reference, out long offset);
|
||||
|
||||
/// <summary>
/// Records the offset to use for the object with the given reference.
/// </summary>
/// <param name="reference">The reference of the object.</param>
/// <param name="offset">The offset to record for the object.</param>
void UpdateOffset(IndirectReference reference, long offset);
|
||||
|
||||
/// <summary>
/// Attempts to get a previously cached object for the given reference.
/// </summary>
/// <param name="reference">The reference of the object to retrieve.</param>
/// <param name="objectToken">The cached object if one was found.</param>
/// <returns><see langword="true"/> if the object was present in the cache.</returns>
bool TryGetCached(IndirectReference reference, out ObjectToken objectToken);
|
||||
|
||||
/// <summary>
/// Stores an object so that it can later be retrieved using <see cref="TryGetCached"/>.
/// </summary>
/// <param name="objectToken">The object to cache.</param>
void Cache(ObjectToken objectToken);
|
||||
}
|
||||
|
||||
internal class ObjectLocationProvider : IObjectLocationProvider
|
||||
{
|
||||
private readonly CrossReferenceTable crossReferenceTable;
|
||||
private readonly Dictionary<IndirectReference, ObjectToken> cache = new Dictionary<IndirectReference, ObjectToken>();
|
||||
|
||||
/// <summary>
|
||||
/// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
|
||||
/// </summary>
|
||||
private readonly Func<CrossReferenceTable> crossReferenceTable;
|
||||
private readonly CosObjectPool pool;
|
||||
private readonly BruteForceSearcher searcher;
|
||||
|
||||
/// <summary>
|
||||
/// Indicates whether we now have a cross reference table.
|
||||
/// </summary>
|
||||
private bool loadedFromTable;
|
||||
|
||||
private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();
|
||||
|
||||
public ObjectLocationProvider(CrossReferenceTable crossReferenceTable, CosObjectPool pool, BruteForceSearcher searcher)
|
||||
public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, CosObjectPool pool, BruteForceSearcher searcher)
|
||||
{
|
||||
this.crossReferenceTable = crossReferenceTable;
|
||||
|
||||
foreach (var offset in crossReferenceTable.ObjectOffsets)
|
||||
{
|
||||
offsets[offset.Key] = offset.Value;
|
||||
}
|
||||
|
||||
this.pool = pool;
|
||||
this.searcher = searcher;
|
||||
}
|
||||
|
||||
public bool TryGetOffset(IndirectReference reference, out long offset)
|
||||
{
|
||||
if (!loadedFromTable)
|
||||
{
|
||||
var table = crossReferenceTable.Invoke();
|
||||
|
||||
if (table != null)
|
||||
{
|
||||
foreach (var objectOffset in table.ObjectOffsets)
|
||||
{
|
||||
offsets[objectOffset.Key] = objectOffset.Value;
|
||||
}
|
||||
|
||||
loadedFromTable = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (offsets.TryGetValue(reference, out offset))
|
||||
{
|
||||
return true;
|
||||
@@ -54,5 +79,20 @@
|
||||
{
|
||||
offsets[reference] = offset;
|
||||
}
|
||||
|
||||
/// <summary>
/// Looks up the object with the given reference in the cache of previously stored objects.
/// </summary>
/// <param name="reference">The reference of the object to retrieve.</param>
/// <param name="objectToken">The cached object if one was found.</param>
/// <returns><see langword="true"/> if the cache contained an object for the reference.</returns>
public bool TryGetCached(IndirectReference reference, out ObjectToken objectToken)
    => cache.TryGetValue(reference, out objectToken);
|
||||
|
||||
/// <summary>
/// Stores the object in the cache keyed by its <see cref="ObjectToken.Number"/>,
/// replacing any object previously cached for the same reference.
/// </summary>
/// <param name="objectToken">The object to cache.</param>
/// <exception cref="ArgumentNullException">The <paramref name="objectToken"/> is null.</exception>
public void Cache(ObjectToken objectToken)
{
    if (objectToken == null)
    {
        // Name the offending parameter so the failure is diagnosable from the exception alone.
        throw new ArgumentNullException(nameof(objectToken));
    }

    cache[objectToken.Number] = objectToken;
}
|
||||
}
|
||||
}
|
||||
@@ -6,19 +6,22 @@
|
||||
using System.IO;
|
||||
using ContentStream;
|
||||
using Exceptions;
|
||||
using Filters;
|
||||
using IO;
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
using Util;
|
||||
|
||||
internal interface IPdfObjectScanner : ISeekableTokenScanner
|
||||
internal interface IPdfTokenScanner : ISeekableTokenScanner
|
||||
{
|
||||
ObjectToken Get(IndirectReference reference);
|
||||
}
|
||||
|
||||
internal class PdfTokenScanner : IPdfObjectScanner
|
||||
internal class PdfTokenScanner : IPdfTokenScanner
|
||||
{
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly IObjectLocationProvider objectLocationProvider;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly CoreTokenScanner coreTokenScanner;
|
||||
|
||||
/// <summary>
|
||||
@@ -35,10 +38,11 @@
|
||||
|
||||
public long CurrentPosition => coreTokenScanner.CurrentPosition;
|
||||
|
||||
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider)
|
||||
public PdfTokenScanner(IInputBytes inputBytes, IObjectLocationProvider objectLocationProvider, IFilterProvider filterProvider)
|
||||
{
|
||||
this.inputBytes = inputBytes;
|
||||
this.objectLocationProvider = objectLocationProvider;
|
||||
this.filterProvider = filterProvider;
|
||||
coreTokenScanner = new CoreTokenScanner(inputBytes);
|
||||
}
|
||||
|
||||
@@ -46,7 +50,7 @@
|
||||
{
|
||||
// Read until we find object-number generation obj, e.g. "69 420 obj".
|
||||
int tokensRead = 0;
|
||||
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.StartObject)
|
||||
while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.StartObject))
|
||||
{
|
||||
if (coreTokenScanner.CurrentToken is CommentToken)
|
||||
{
|
||||
@@ -79,21 +83,21 @@
|
||||
}
|
||||
|
||||
// Read all tokens between obj and endobj.
|
||||
while (coreTokenScanner.MoveNext() && coreTokenScanner.CurrentToken != OperatorToken.EndObject)
|
||||
while (coreTokenScanner.MoveNext() && !Equals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
|
||||
{
|
||||
if (coreTokenScanner.CurrentToken is CommentToken)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (coreTokenScanner.CurrentToken == OperatorToken.StartObject)
|
||||
if (ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.StartObject))
|
||||
{
|
||||
// This should never happen.
|
||||
Debug.Assert(false, "Encountered a start object 'obj' operator before the end of the previous object.");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (coreTokenScanner.CurrentToken == OperatorToken.StartStream)
|
||||
if (ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.StartStream))
|
||||
{
|
||||
// Read stream: special case.
|
||||
if (TryReadStream(coreTokenScanner.CurrentTokenStart, out var stream))
|
||||
@@ -114,7 +118,7 @@
|
||||
previousTokenPositions[1] = coreTokenScanner.CurrentPosition;
|
||||
}
|
||||
|
||||
if (coreTokenScanner.CurrentToken != OperatorToken.EndObject)
|
||||
if (!ReferenceEquals(coreTokenScanner.CurrentToken, OperatorToken.EndObject))
|
||||
{
|
||||
readTokens.Clear();
|
||||
return false;
|
||||
@@ -125,7 +129,7 @@
|
||||
IToken token;
|
||||
if (readTokens.Count == 3 && readTokens[0] is NumericToken objNum
|
||||
&& readTokens[1] is NumericToken genNum
|
||||
&& readTokens[2] == OperatorToken.R)
|
||||
&& ReferenceEquals(readTokens[2], OperatorToken.R))
|
||||
{
|
||||
// I have no idea if this can ever happen.
|
||||
token = new IndirectReferenceToken(new IndirectReference(objNum.Long, genNum.Int));
|
||||
@@ -197,9 +201,6 @@
|
||||
// Track any 'endobj' or 'endstream' operators we see.
|
||||
var observedEndLocations = new List<PossibleStreamEndLocation>();
|
||||
|
||||
// Keep track of the previous byte.
|
||||
byte previousByte = 0;
|
||||
|
||||
// Begin reading the stream.
|
||||
using (var memoryStream = new MemoryStream())
|
||||
using (var binaryWrite = new BinaryWriter(memoryStream))
|
||||
@@ -288,7 +289,6 @@
|
||||
commonPartPosition = 0;
|
||||
}
|
||||
|
||||
previousByte = inputBytes.CurrentByte;
|
||||
binaryWrite.Write(inputBytes.CurrentByte);
|
||||
|
||||
read++;
|
||||
@@ -451,11 +451,24 @@
|
||||
|
||||
public ObjectToken Get(IndirectReference reference)
|
||||
{
|
||||
if (objectLocationProvider.TryGetCached(reference, out var objectToken))
|
||||
{
|
||||
return objectToken;
|
||||
}
|
||||
|
||||
if (!objectLocationProvider.TryGetOffset(reference, out var offset))
|
||||
{
|
||||
throw new InvalidOperationException($"Could not find the object with reference: {reference}.");
|
||||
}
|
||||
|
||||
// Negative offsets refer to a stream with that number.
|
||||
if (offset < 0)
|
||||
{
|
||||
var result = GetObjectFromStream(reference, offset);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
Seek(offset);
|
||||
|
||||
if (!MoveNext())
|
||||
@@ -465,5 +478,79 @@
|
||||
|
||||
return (ObjectToken)CurrentToken;
|
||||
}
|
||||
|
||||
/// <summary>
/// Retrieves an object which is stored inside an object stream rather than directly in the file.
/// </summary>
/// <param name="reference">The reference of the object being requested.</param>
/// <param name="offset">
/// The negative offset recorded for the object, the negation of which is the object number of the containing stream.
/// </param>
/// <returns>The requested object after every object in the containing stream has been cached.</returns>
/// <exception cref="PdfDocumentFormatException">
/// The containing object was not a stream or the stream did not contain the requested object.
/// </exception>
private ObjectToken GetObjectFromStream(IndirectReference reference, long offset)
{
    // Flip the sign to recover the object number of the stream which holds the object.
    var containingStreamNumber = -offset;

    var container = Get(new IndirectReference(containingStreamNumber, 0));

    if (container.Data is StreamToken objectStream)
    {
        // Cache everything in the stream so later requests for siblings avoid re-parsing it.
        foreach (var parsed in ParseObjectStream(objectStream, offset))
        {
            objectLocationProvider.Cache(parsed);
        }

        if (objectLocationProvider.TryGetCached(reference, out var located))
        {
            return located;
        }

        throw new PdfDocumentFormatException($"Could not find the object {reference} in the stream {containingStreamNumber}.");
    }

    throw new PdfDocumentFormatException("Requested a stream object by reference but the requested stream object " +
                                         $"was not a stream: {reference}, {container.Data}.");
}
|
||||
|
||||
/// <summary>
/// Reads the objects contained in an object stream.
/// </summary>
/// <remarks>
/// The decoded stream is expected to start with N pairs of numbers (object number then byte offset)
/// followed by the objects themselves. The objects are read in sequence immediately after the header,
/// the First entry and the per-object byte offsets are validated but are not used to seek.
/// </remarks>
/// <param name="stream">The object stream to parse.</param>
/// <param name="offset">The value passed as the first constructor argument of every <see cref="ObjectToken"/> created.</param>
/// <returns>
/// The objects read from the stream, each with generation 0. If the stream ends before all N objects
/// have been read only the objects which were read are returned.
/// </returns>
/// <exception cref="PdfDocumentFormatException">
/// The stream dictionary lacks a numeric N or First entry, or the header does not contain N pairs of numbers.
/// </exception>
private IReadOnlyList<ObjectToken> ParseObjectStream(StreamToken stream, long offset)
{
    if (!stream.StreamDictionary.TryGet(NameToken.N, out var numberToken)
        || !(numberToken is NumericToken numberOfObjects))
    {
        throw new PdfDocumentFormatException($"Object stream dictionary did not provide number of objects {stream.StreamDictionary}.");
    }

    if (!stream.StreamDictionary.TryGet(NameToken.First, out var firstToken)
        || !(firstToken is NumericToken))
    {
        throw new PdfDocumentFormatException($"Object stream dictionary did not provide first object offset {stream.StreamDictionary}.");
    }

    var bytes = new ByteArrayInputBytes(stream.Decode(filterProvider));

    var scanner = new CoreTokenScanner(bytes);

    // Read the N pairs of integers forming the header, keeping the object numbers.
    var objectNumbers = new List<long>();

    for (var i = 0; i < numberOfObjects.Int; i++)
    {
        // Check both the read and the token type so a truncated or corrupt header
        // surfaces as a format error rather than an invalid cast or null reference.
        if (!scanner.MoveNext() || !(scanner.CurrentToken is NumericToken headerObjectNumber))
        {
            throw new PdfDocumentFormatException($"Object stream header did not contain the object number for entry {i}: {stream.StreamDictionary}.");
        }

        if (!scanner.MoveNext() || !(scanner.CurrentToken is NumericToken))
        {
            throw new PdfDocumentFormatException($"Object stream header did not contain the byte offset for entry {i}: {stream.StreamDictionary}.");
        }

        objectNumbers.Add(headerObjectNumber.Long);
    }

    var results = new List<ObjectToken>(objectNumbers.Count);

    foreach (var number in objectNumbers)
    {
        // Stop at the end of the data instead of assigning a stale token to the remaining
        // object numbers; the caller reports any object which could not be found.
        if (!scanner.MoveNext())
        {
            break;
        }

        results.Add(new ObjectToken(offset, new IndirectReference(number, 0), scanner.CurrentToken));
    }

    return results;
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,10 +38,8 @@
|
||||
var filterProvider = new MemoryFilterProvider(new DecodeParameterResolver(logger), new PngPredictor(), logger);
|
||||
var crossReferenceParser = new CrossReferenceStreamParser(filterProvider);
|
||||
var objectStreamParser = new ObjectStreamParser(logger, filterProvider, baseParser);
|
||||
var dynamicParser = new DynamicParser(logger, baseParser, streamParser, objectStreamParser);
|
||||
|
||||
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, streamParser, crossReferenceParser, new CrossReferenceTableParser(),
|
||||
new OldCrossReferenceTableParser(logger, dictionaryParser, baseParser));
|
||||
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, crossReferenceParser, new CrossReferenceTableParser());
|
||||
|
||||
var cmapParser = new CMapParser();
|
||||
var afmParser = new AdobeFontMetricsParser();
|
||||
@@ -55,7 +53,6 @@
|
||||
container.Register(streamParser);
|
||||
container.Register(crossReferenceParser);
|
||||
container.Register(crossReferenceTableParser);
|
||||
container.Register(dynamicParser);
|
||||
container.Register(objectStreamParser);
|
||||
container.Register(filterProvider);
|
||||
container.Register(cmapParser);
|
||||
|
||||
Reference in New Issue
Block a user