From da7d83d86342947ca75af9b3a89398eb7ca35747 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sat, 20 Jan 2018 20:20:40 +0000 Subject: [PATCH] finish the migration --- src/UglyToad.PdfPig/Parser/CatalogFactory.cs | 1 - src/UglyToad.PdfPig/Parser/DynamicParser.cs | 236 ------------- .../FileStructure/CrossReferenceParser.cs | 2 +- .../OldCrossReferenceTableParser.cs | 215 ------------ .../Parser/IPdfObjectParser.cs | 223 ------------ .../Parser/ObjectStreamParser.cs | 82 ----- .../Parser/Parts/CosStreamParser.cs | 332 ------------------ .../Parser/Parts/DirectObjectFinder.cs | 26 -- src/UglyToad.PdfPig/Util/Bootstrapper.cs | 5 - 9 files changed, 1 insertion(+), 1121 deletions(-) delete mode 100644 src/UglyToad.PdfPig/Parser/DynamicParser.cs delete mode 100644 src/UglyToad.PdfPig/Parser/FileStructure/OldCrossReferenceTableParser.cs delete mode 100644 src/UglyToad.PdfPig/Parser/IPdfObjectParser.cs delete mode 100644 src/UglyToad.PdfPig/Parser/ObjectStreamParser.cs delete mode 100644 src/UglyToad.PdfPig/Parser/Parts/CosStreamParser.cs diff --git a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs index 9d6d3951..00263435 100644 --- a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs +++ b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs @@ -2,7 +2,6 @@ { using System; using Content; - using ContentStream; using Exceptions; using IO; using Parts; diff --git a/src/UglyToad.PdfPig/Parser/DynamicParser.cs b/src/UglyToad.PdfPig/Parser/DynamicParser.cs deleted file mode 100644 index ec4b0284..00000000 --- a/src/UglyToad.PdfPig/Parser/DynamicParser.cs +++ /dev/null @@ -1,236 +0,0 @@ -namespace UglyToad.PdfPig.Parser -{ - using System; - using System.Collections.Generic; - using System.Linq; - using ContentStream; - using Cos; - using IO; - using Logging; - using Parts; - using Util; - - internal class DynamicParser - { - private readonly ILog log; - private readonly CosBaseParser baseParser; - private readonly CosStreamParser streamParser; - private readonly ObjectStreamParser objectStreamParser; - - public DynamicParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, ObjectStreamParser objectStreamParser) - { - this.log = log; - this.baseParser = baseParser ?? throw new ArgumentNullException(nameof(baseParser)); - this.streamParser = streamParser ?? throw new ArgumentNullException(nameof(streamParser)); - this.objectStreamParser = objectStreamParser ?? throw new ArgumentNullException(nameof(objectStreamParser)); - } - - public CosBase Parse(ParsingArguments arguments, CosObject obj, bool requiresExistingObject) - { - return Parse(arguments.Reader, obj, arguments.CachingProviders.ObjectPool, - arguments.CrossReferenceTable, arguments.CachingProviders.BruteForceSearcher, - arguments.IsLenientParsing, - requiresExistingObject); - } - - public CosBase Parse(IRandomAccessRead reader, CosObject obj, CosObjectPool pool, - CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenient, bool requireExistingObject) - { - if (obj == null) - { - throw new ArgumentNullException(nameof(obj)); - } - - return Parse(reader, obj.GetObjectNumber(), obj.GetGenerationNumber(), pool, - crossReferenceTable, bruteForceSearcher, isLenient, requireExistingObject); - } - - public CosBase Parse(IRandomAccessRead reader, long objectNumber, int objectGeneration, - CosObjectPool pool, CrossReferenceTable crossReferenceTable, - BruteForceSearcher bruteForceSearcher, - bool isLenient, - bool requireExistingObject) - { - if (pool == null) - { - throw new ArgumentNullException(nameof(pool)); - } - - var key = new IndirectReference(objectNumber, objectGeneration); - - var pdfObject = pool.GetOrCreateDefault(key); - - if (pdfObject.GetObject() != null) - { - return pdfObject.GetObject(); - } - - if (crossReferenceTable == null) - { - throw new ArgumentNullException(nameof(crossReferenceTable)); - } - - var offsetOrStreamNumber = TryGet(key, crossReferenceTable.ObjectOffsets); - - if (requireExistingObject && (offsetOrStreamNumber == null || offsetOrStreamNumber <= 0)) - { - throw new InvalidOperationException("Object must be defined and not compressed: " + key); - } - - if (isLenient && offsetOrStreamNumber == null) - { - var locations = bruteForceSearcher.GetObjectLocations(); - - offsetOrStreamNumber = TryGet(key, locations); - - if (offsetOrStreamNumber != null) - { - crossReferenceTable.UpdateOffset(key, offsetOrStreamNumber.Value); - } - } - - if (offsetOrStreamNumber == null) - { - return CosNull.Null; - } - - var isCompressedStreamObject = offsetOrStreamNumber <= 0; - - if (!isCompressedStreamObject) - { - return ParseObjectFromFile(offsetOrStreamNumber.Value, reader, key, pool, isLenient); - } - - return ParseCompressedStreamObject(reader, -offsetOrStreamNumber.Value, objectNumber, pool, crossReferenceTable, bruteForceSearcher, isLenient); - } - - private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader, - IndirectReference key, - CosObjectPool pool, - bool isLenientParsing) - { - reader.Seek(offset); - - var objectNumber = ObjectHelper.ReadObjectNumber(reader); - var objectGeneration = ObjectHelper.ReadGenerationNumber(reader); - - ReadHelper.ReadExpectedString(reader, "obj", true); - - if (objectNumber != key.ObjectNumber || objectGeneration != key.Generation) - { - throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}"); - } - - ReadHelper.SkipSpaces(reader); - - var baseObject = baseParser.Parse(reader, pool); - - var endObjectKey = ReadHelper.ReadString(reader); - - var atStreamStart = string.Equals(endObjectKey, "stream"); - - if (atStreamStart) - { - var streamStartBytes = OtherEncodings.StringAsLatin1Bytes(endObjectKey); - - reader.Rewind(streamStartBytes.Length); - - baseObject = ReadNormalObjectStream(reader, baseObject, offset, isLenientParsing, out endObjectKey); - } - - var message = - $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{endObjectKey}\'"; - - if (isLenientParsing) - { - log.Warn(message); - } - else - { - throw new InvalidOperationException(message); - } - - return baseObject; - } - - private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset, - bool isLenientParsing, - out string endObjectKey) - { - if (currentBase is PdfDictionary dictionary) - { - PdfRawStream stream = streamParser.Parse(reader, dictionary, isLenientParsing, null); - - currentBase = stream; - } - else - { - // this is not legal - // the combination of a dict and the stream/endstream - // forms a complete stream object - throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset})."); - } - - ReadHelper.SkipSpaces(reader); - endObjectKey = ReadHelper.ReadLine(reader); - - // we have case with a second 'endstream' before endobj - if (!endObjectKey.StartsWith("endobj") && endObjectKey.StartsWith("endstream")) - { - endObjectKey = endObjectKey.Substring(9).Trim(); - if (endObjectKey.Length == 0) - { - // no other characters in extra endstream line - // read next line - endObjectKey = ReadHelper.ReadLine(reader); - } - } - - return currentBase; - } - - private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, CosObjectPool objectPool, CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenientParsing) - { - var baseStream = Parse(reader, streamObjectNumber, 0, objectPool, crossReferenceTable, bruteForceSearcher, - isLenientParsing, true); - - if (!(baseStream is PdfRawStream stream)) - { - log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}"); - - return CosNull.Null; - } - - var objects = objectStreamParser.Parse(stream, objectPool); - - // register all objects which are referenced to be contained in object stream - foreach (var next in objects) - { - var streamKey = new IndirectReference(next.GetObjectNumber(), next.GetGenerationNumber()); - var offset = TryGet(streamKey, crossReferenceTable.ObjectOffsets); - - if (offset != null && offset == -streamObjectNumber) - { - var streamObject = objectPool.Get(streamKey); - streamObject.SetObject(next.GetObject()); - } - } - - var matchingStreamObject = objects.FirstOrDefault(x => x.GetObjectNumber() == requestedNumber); - - if (matchingStreamObject != null) - { - return matchingStreamObject; - } - - log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull."); - - return CosNull.Null; - } - - private static T? TryGet(TKey key, IReadOnlyDictionary dictionary) where T : struct - { - return dictionary.TryGetValue(key, out var value) ? value : default(T?); - } - } -} diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs index 26ddd955..74736bab 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs @@ -99,7 +99,7 @@ { try { - streamPart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner); + streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner); } catch (InvalidOperationException ex) { diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/OldCrossReferenceTableParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/OldCrossReferenceTableParser.cs deleted file mode 100644 index 19cbd817..00000000 --- a/src/UglyToad.PdfPig/Parser/FileStructure/OldCrossReferenceTableParser.cs +++ /dev/null @@ -1,215 +0,0 @@ -namespace UglyToad.PdfPig.Parser.FileStructure -{ - using System; - using ContentStream; - using ContentStream.TypedAccessors; - using Cos; - using IO; - using Logging; - using Parts; - using Parts.CrossReference; - using Util; - - internal class OldCrossReferenceTableParser - { - private const string InUseEntry = "n"; - private const string FreeEntry = "f"; - - private readonly ILog log; - private readonly IDictionaryParser dictionaryParser; - private readonly IBaseParser baseParser; - - public OldCrossReferenceTableParser(ILog log, IDictionaryParser dictionaryParser, IBaseParser baseParser) - { - this.log = log; - this.dictionaryParser = dictionaryParser; - this.baseParser = baseParser; - } - - public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder) - { - builder = null; - - var tableStartOffset = source.GetPosition(); - - if (source.Peek() != 'x') - { - return false; - } - - var xref = ReadHelper.ReadString(source); - if (!xref.Trim().Equals("xref")) - { - return false; - } - - // check for trailer after xref - var str = ReadHelper.ReadString(source); - byte[] b = OtherEncodings.StringAsLatin1Bytes(str); - - source.Rewind(b.Length); - - if (str.StartsWith("trailer")) - { - log.Warn("skipping empty xref table"); - return false; - } - - builder = new CrossReferenceTablePartBuilder - { - Offset = offset, - XRefType = CrossReferenceType.Table - }; - - // Tables can have multiple sections. Each starts with a starting object id and a count. - while (true) - { - if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition)) - { - log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}"); - - if (isLenientParsing) - { - - break; - } - - return false; - } - - var currentObjectId = subsectionDefinition.FirstNumber; - - ReadHelper.SkipSpaces(source); - for (var i = 0; i < subsectionDefinition.Count; i++) - { - if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek())) - { - break; - } - - if (source.Peek() == 't') - { - break; - } - - //Ignore table contents - var currentLine = ReadHelper.ReadLine(source); - var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); - if (splitString.Length < 3) - { - log.Warn("invalid xref line: " + currentLine); - break; - } - - // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n) - if (splitString[splitString.Length - 1].Equals(InUseEntry)) - { - try - { - var objectOffset = long.Parse(splitString[0]); - - if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition()) - { - // PDFBOX-3923: offset points inside this table - that can't be good - throw new InvalidOperationException( - $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}"); - } - - var generation = int.Parse(splitString[1]); - builder.Add(currentObjectId, generation, objectOffset); - } - catch (FormatException e) - { - throw new InvalidOperationException("Bad", e); - } - } - else if (!splitString[2].Equals(FreeEntry)) - { - throw new InvalidOperationException( - $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}."); - } - - currentObjectId++; - - ReadHelper.SkipSpaces(source); - } - - ReadHelper.SkipSpaces(source); - if (!ReadHelper.IsDigit(source)) - { - break; - } - } - - if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer)) - { - throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}."); - } - - //builder.Dictionary = trailer; - builder.Previous = trailer.GetLongOrDefault(CosName.PREV); - - return true; - } - - private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer) - { - trailer = null; - // parse the last trailer. - var trailerOffset = source.GetPosition(); - // PDFBOX-1739 skip extra xref entries in RegisSTAR documents - if (isLenientParsing) - { - int nextCharacter = source.Peek(); - while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter)) - { - if (source.GetPosition() == trailerOffset) - { - // warn only the first time - //LOG.warn("Expected trailer object at position " + trailerOffset - // + ", keep trying"); - } - ReadHelper.ReadLine(source); - nextCharacter = source.Peek(); - } - } - if (source.Peek() != 't') - { - return false; - } - //read "trailer" - long currentOffset = source.GetPosition(); - string nextLine = ReadHelper.ReadLine(source); - if (!nextLine.Trim().Equals("trailer")) - { - // in some cases the EOL is missing and the trailer immediately - // continues with "<<" or with a blank character - // even if this does not comply with PDF reference we want to support as many PDFs as possible - // Acrobat reader can also deal with this. - if (nextLine.StartsWith("trailer")) - { - // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes - int len = "trailer".Length; - // jump back right after "trailer" - source.Seek(currentOffset + len); - } - else - { - return false; - } - } - - // in some cases the EOL is missing and the trailer continues with " <<" - // even if this does not comply with PDF reference we want to support as many PDFs as possible - // Acrobat reader can also deal with this. - ReadHelper.SkipSpaces(source); - - PdfDictionary parsedTrailer = dictionaryParser.Parse(source, baseParser, pool); - - trailer = parsedTrailer; - - ReadHelper.SkipSpaces(source); - return true; - } - } -} diff --git a/src/UglyToad.PdfPig/Parser/IPdfObjectParser.cs b/src/UglyToad.PdfPig/Parser/IPdfObjectParser.cs deleted file mode 100644 index 86e9f5b2..00000000 --- a/src/UglyToad.PdfPig/Parser/IPdfObjectParser.cs +++ /dev/null @@ -1,223 +0,0 @@ -namespace UglyToad.PdfPig.Parser -{ - using System; - using System.Collections.Generic; - using System.Linq; - using ContentStream; - using Cos; - using IO; - using Logging; - using Parts; - using Util; - - internal interface IPdfObjectParser - { - CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false); - } - - internal class PdfObjectParser : IPdfObjectParser - { - private readonly ILog log; - private readonly CosBaseParser baseParser; - private readonly CosStreamParser streamParser; - private readonly CrossReferenceTable crossReferenceTable; - private readonly BruteForceSearcher bruteForceSearcher; - private readonly CosObjectPool objectPool; - private readonly ObjectStreamParser objectStreamParser; - - public PdfObjectParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, CrossReferenceTable crossReferenceTable, - BruteForceSearcher bruteForceSearcher, - CosObjectPool objectPool, - ObjectStreamParser objectStreamParser) - { - this.log = log ?? new NoOpLog(); - this.baseParser = baseParser ?? throw new ArgumentNullException(nameof(baseParser)); - this.streamParser = streamParser ?? throw new ArgumentNullException(nameof(streamParser)); - this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable)); - this.bruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher)); - this.objectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool)); - this.objectStreamParser = objectStreamParser ?? throw new ArgumentNullException(nameof(objectStreamParser)); - } - - public CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false) - { - var key = new IndirectReference(indirectReference.ObjectNumber, indirectReference.Generation); - - var pdfObject = objectPool.GetOrCreateDefault(key); - - if (pdfObject.GetObject() != null) - { - return pdfObject.GetObject(); - } - - var offsetOrStreamNumber = TryGet(key, crossReferenceTable.ObjectOffsets); - - if (requireExistingObject && (offsetOrStreamNumber == null || offsetOrStreamNumber <= 0)) - { - throw new InvalidOperationException("Object must be defined and not compressed: " + key); - } - - if (isLenientParsing && offsetOrStreamNumber == null) - { - var locations = bruteForceSearcher.GetObjectLocations(); - - offsetOrStreamNumber = TryGet(key, locations); - - if (offsetOrStreamNumber != null) - { - crossReferenceTable.UpdateOffset(key, offsetOrStreamNumber.Value); - } - } - - if (offsetOrStreamNumber == null) - { - if (isLenientParsing) - { - return CosNull.Null; - } - - throw new InvalidOperationException($"Could not locate the object {key.ObjectNumber} which was not found in the cross reference table."); - } - - var isCompressedStreamObject = offsetOrStreamNumber <= 0; - - if (!isCompressedStreamObject) - { - return ParseObjectFromFile(offsetOrStreamNumber.Value, reader, key, objectPool, isLenientParsing); - } - - return ParseCompressedStreamObject(reader, -offsetOrStreamNumber.Value, indirectReference.ObjectNumber, isLenientParsing); - } - - private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader, - IndirectReference key, - CosObjectPool pool, - bool isLenientParsing) - { - reader.Seek(offset); - - var objectNumber = ObjectHelper.ReadObjectNumber(reader); - var objectGeneration = ObjectHelper.ReadGenerationNumber(reader); - - ReadHelper.ReadExpectedString(reader, "obj", true); - - if (objectNumber != key.ObjectNumber || objectGeneration != key.Generation) - { - throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}"); - } - - ReadHelper.SkipSpaces(reader); - - var baseObject = baseParser.Parse(reader, pool); - - var endObjectKey = ReadHelper.ReadString(reader); - - var atStreamStart = string.Equals(endObjectKey, "stream"); - - if (atStreamStart) - { - var streamStartBytes = OtherEncodings.StringAsLatin1Bytes(endObjectKey); - - reader.Rewind(streamStartBytes.Length); - - baseObject = ReadNormalObjectStream(reader, baseObject, offset, isLenientParsing, out endObjectKey); - } - - if (!string.Equals(endObjectKey, "endobj")) - { - var message = - $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{endObjectKey}\'"; - - if (isLenientParsing) - { - log.Warn(message); - } - else - { - throw new InvalidOperationException(message); - } - } - - return baseObject; - } - - private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset, - bool isLenientParsing, - out string endObjectKey) - { - if (currentBase is PdfDictionary dictionary) - { - PdfRawStream stream = streamParser.Parse(reader, dictionary, isLenientParsing, this); - - currentBase = stream; - } - else - { - // this is not legal - // the combination of a dict and the stream/endstream - // forms a complete stream object - throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset})."); - } - - ReadHelper.SkipSpaces(reader); - endObjectKey = ReadHelper.ReadLine(reader); - - // we have case with a second 'endstream' before endobj - if (!endObjectKey.StartsWith("endobj") && endObjectKey.StartsWith("endstream")) - { - endObjectKey = endObjectKey.Substring(9).Trim(); - if (endObjectKey.Length == 0) - { - // no other characters in extra endstream line - // read next line - endObjectKey = ReadHelper.ReadLine(reader); - } - } - - return currentBase; - } - - private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, bool isLenientParsing) - { - var baseStream = Parse(new IndirectReference(streamObjectNumber, 0), reader, isLenientParsing, true); - - if (!(baseStream is PdfRawStream stream)) - { - log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}"); - - return CosNull.Null; - } - - var objects = objectStreamParser.Parse(stream, objectPool); - - // register all objects which are referenced to be contained in object stream - foreach (var next in objects) - { - var streamKey = new IndirectReference(next.GetObjectNumber(), next.GetGenerationNumber()); - var offset = TryGet(streamKey, crossReferenceTable.ObjectOffsets); - - if (offset != null && offset == -streamObjectNumber) - { - var streamObject = objectPool.Get(streamKey); - streamObject.SetObject(next.GetObject()); - } - } - - var matchingStreamObject = objects.FirstOrDefault(x => x.GetObjectNumber() == requestedNumber); - - if (matchingStreamObject != null) - { - return matchingStreamObject; - } - - log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull."); - - return CosNull.Null; - } - - private static T? TryGet(TKey key, IReadOnlyDictionary dictionary) where T : struct - { - return dictionary.TryGetValue(key, out var value) ? value : default(T?); - } - } -} diff --git a/src/UglyToad.PdfPig/Parser/ObjectStreamParser.cs b/src/UglyToad.PdfPig/Parser/ObjectStreamParser.cs deleted file mode 100644 index 1ddbd5c4..00000000 --- a/src/UglyToad.PdfPig/Parser/ObjectStreamParser.cs +++ /dev/null @@ -1,82 +0,0 @@ -namespace UglyToad.PdfPig.Parser -{ - using System; - using System.Collections.Generic; - using ContentStream; - using ContentStream.TypedAccessors; - using Cos; - using Filters; - using IO; - using Logging; - using Parts; - - internal class ObjectStreamParser - { - private readonly ILog log; - private readonly IFilterProvider filterProvider; - private readonly CosBaseParser baseParser; - - public ObjectStreamParser(ILog log, IFilterProvider filterProvider, CosBaseParser baseParser) - { - this.log = log; - this.filterProvider = filterProvider; - this.baseParser = baseParser; - } - - public IReadOnlyList Parse(PdfRawStream stream, CosObjectPool pool) - { - if (stream == null) - { - throw new ArgumentNullException(nameof(stream)); - } - - //need to first parse the header. - var numberOfObjects = stream.Dictionary.GetIntOrDefault(CosName.N); - var objectNumbers = new List(numberOfObjects); - - var streamObjects = new List(numberOfObjects); - - var bytes = stream.Decode(filterProvider); - - var reader = new RandomAccessBuffer(bytes); - - for (int i = 0; i < numberOfObjects; i++) - { - long objectNumber = ObjectHelper.ReadObjectNumber(reader); - // skip offset - ReadHelper.ReadLong(reader); - objectNumbers.Add(objectNumber); - } - - CosObject obj; - CosBase cosObject; - int objectCounter = 0; - while ((cosObject = baseParser.Parse(reader, pool)) != null) - { - obj = new CosObject(cosObject); - obj.SetGenerationNumber(0); - - if (objectCounter >= objectNumbers.Count) - { - log.Error("/ObjStm (object stream) has more objects than /N " + numberOfObjects); - break; - } - - obj.SetObjectNumber(objectNumbers[objectCounter]); - streamObjects.Add(obj); - - // According to the spec objects within an object stream shall not be enclosed - // by obj/endobj tags, but there are some pdfs in the wild using those tags - // skip endobject marker if present - if (!reader.IsEof() && reader.Peek() == 'e') - { - ReadHelper.ReadLine(reader); - } - - objectCounter++; - } - - return streamObjects; - } - } -} diff --git a/src/UglyToad.PdfPig/Parser/Parts/CosStreamParser.cs b/src/UglyToad.PdfPig/Parser/Parts/CosStreamParser.cs deleted file mode 100644 index 3d86aed7..00000000 --- a/src/UglyToad.PdfPig/Parser/Parts/CosStreamParser.cs +++ /dev/null @@ -1,332 +0,0 @@ -namespace UglyToad.PdfPig.Parser.Parts -{ - using System; - using System.IO; - using ContentStream; - using Cos; - using IO; - using Logging; - using Util; - - internal class CosStreamParser - { - private static readonly int STREAMCOPYBUFLEN = 8192; - private static readonly int STRMBUFLEN = 2048; - private static readonly byte[] ENDOBJ = - { - (byte) 'E', (byte) 'N', (byte) 'D', - (byte) 'O', (byte) 'B', (byte) 'J' - }; - private static readonly byte[] ENDSTREAM = - { - (byte) 'E', (byte) 'N', (byte) 'D', - (byte) 'S', (byte) 'T', (byte) 'R', (byte) 'E', (byte) 'A', (byte) 'M' - }; - - private readonly ILog log; - private readonly byte[] streamCopyBuf = new byte[STREAMCOPYBUFLEN]; - private readonly byte[] strmBuf = new byte[STRMBUFLEN]; - - public CosStreamParser(ILog log) - { - this.log = log; - } - - public PdfRawStream Parse(IRandomAccessRead reader, PdfDictionary streamDictionary, bool isLenientParsing, IPdfObjectParser parser) - { - PdfRawStream result; - - // read 'stream'; this was already tested in parseObjectsDynamically() - ReadHelper.ReadExpectedString(reader, "stream"); - - skipWhiteSpaces(reader); - - // This needs to be streamDictionary.getItem because when we are parsing, the underlying object might still be null. - ICosNumber streamLength = GetLength(reader, streamDictionary.GetItemOrDefault(CosName.LENGTH), streamDictionary.GetName(CosName.TYPE), isLenientParsing, parser); - - ValidateStreamLength(reader, isLenientParsing, streamLength); - - // get output stream to copy data to - using (var stream = new MemoryStream()) - using (var writer = new BinaryWriter(stream)) - { - if (streamLength != null && validateStreamLength(reader, streamLength.AsLong(), reader.Length())) - { - ReadValidStream(reader, writer, streamLength); - } - else - { - ReadUntilEndStream(reader, writer); - } - - result = new PdfRawStream(stream.ToArray(), streamDictionary); - } - - String endStream = ReadHelper.ReadString(reader); - if (endStream.Equals("endobj") && isLenientParsing) - { - log.Warn($"stream ends with \'endobj\' instead of \'endstream\' at offset {reader.GetPosition()}"); - - // avoid follow-up warning about missing endobj - reader.Rewind("endobj".Length); - } - else if (endStream.Length > 9 && isLenientParsing && endStream.Substring(0, 9).Equals("endstream")) - { - log.Warn("stream ends with '" + endStream + "' instead of 'endstream' at offset " + reader.GetPosition()); - // unread the "extra" bytes - reader.Rewind(OtherEncodings.StringAsLatin1Bytes(endStream.Substring(9)).Length); - } - else if (!endStream.Equals("endstream")) - { - throw new InvalidOperationException("Error reading stream, expected='endstream' actual='" - + endStream + "' at offset " + reader.GetPosition()); - } - - return result; - } - - private void ValidateStreamLength(IRandomAccessRead reader, bool isLenientParsing, ICosNumber streamLength) - { - if (streamLength != null) - { - return; - } - - if (isLenientParsing) - { - log.Warn("The stream doesn't provide any stream length, using fallback readUntilEnd, at offset " + - reader.GetPosition()); - } - else - { - throw new InvalidOperationException("Missing length for stream."); - } - } - - private ICosNumber GetLength(IRandomAccessRead source, CosBase lengthBaseObj, CosName streamType, bool isLenientParsing, IPdfObjectParser parser) - { - if (lengthBaseObj == null) - { - return null; - } - - // Length is given directly in the stream dictionary - if (lengthBaseObj is ICosNumber number) - { - return number; - } - - // length in referenced object - if (lengthBaseObj is CosObject lengthObj) - { - var currentObject = lengthObj.GetObject(); - - if (currentObject == null) - { - if (parser == null) - { - throw new InvalidOperationException("This method required access to the PDF object parser but it was not created yet. Figure out how to fix this."); - } - - var currentOffset = source.GetPosition(); - - var obj = parser.Parse(lengthObj.ToIndirectReference(), source, isLenientParsing); - - source.Seek(currentOffset); - - if (obj is ICosNumber referenceNumber) - { - return referenceNumber; - } - - throw new InvalidOperationException("Length object content was not read."); - } - - if (currentObject is ICosNumber objectNumber) - { - return objectNumber; - } - - - throw new InvalidOperationException("Wrong type of referenced length object " + lengthObj - + ": " + lengthObj.GetObject().GetType().Name); - } - - throw new InvalidOperationException($"Wrong type of length object: {lengthBaseObj.GetType().Name}"); - } - - private void ReadValidStream(IRandomAccessRead reader, BinaryWriter output, ICosNumber streamLengthObj) - { - long remainBytes = streamLengthObj.AsLong(); - while (remainBytes > 0) - { - int chunk = (remainBytes > STREAMCOPYBUFLEN) ? STREAMCOPYBUFLEN : (int)remainBytes; - int readBytes = reader.Read(streamCopyBuf, 0, chunk); - if (readBytes <= 0) - { - // shouldn't happen, the stream length has already been validated - throw new InvalidOperationException( - $"read error at offset {reader.GetPosition()}: expected {chunk} bytes, but read() returns {readBytes}"); - } - output.Write(streamCopyBuf, 0, readBytes); - remainBytes -= readBytes; - } - } - - protected void skipWhiteSpaces(IRandomAccessRead reader) - { - //PDF Ref 3.2.7 A stream must be followed by either - //a CRLF or LF but nothing else. - - int whitespace = reader.Read(); - - //see brother_scan_cover.pdf, it adds whitespaces - //after the stream but before the start of the - //data, so just read those first - while (whitespace == ' ') - { - whitespace = reader.Read(); - } - - if (whitespace == ReadHelper.AsciiCarriageReturn) - { - whitespace = reader.Read(); - if (whitespace != ReadHelper.AsciiLineFeed) - { - reader.Unread(whitespace); - //The spec says this is invalid but it happens in the real - //world so we must support it. - } - } - else if (whitespace != ReadHelper.AsciiLineFeed) - { - //we are in an error. - //but again we will do a lenient parsing and just assume that everything - //is fine - reader.Unread(whitespace); - } - } - - private bool validateStreamLength(IRandomAccessRead source, long streamLength, long fileLength) - { - bool streamLengthIsValid = true; - long originOffset = source.GetPosition(); - long expectedEndOfStream = originOffset + streamLength; - if (expectedEndOfStream > fileLength) - { - streamLengthIsValid = false; - //LOG.warn("The end of the stream is out of range, using workaround to read the stream, " - // + "stream start position: " + originOffset + ", length: " + streamLength - // + ", expected end position: " + expectedEndOfStream); - } - else - { - source.Seek(expectedEndOfStream); - ReadHelper.SkipSpaces(source); - if (!ReadHelper.IsString(source, "endstream")) - { - streamLengthIsValid = false; - //LOG.warn("The end of the stream doesn't point to the correct offset, using workaround to read the stream, " - // + "stream start position: " + originOffset + ", length: " + streamLength - // + ", expected end position: " + expectedEndOfStream); - } - source.Seek(originOffset); - } - return streamLengthIsValid; - } - - private void ReadUntilEndStream(IRandomAccessRead source, BinaryWriter output) - { - int bufSize; - int charMatchCount = 0; - byte[] keyw = ENDSTREAM; - - // last character position of shortest keyword ('endobj') - int quickTestOffset = 5; - - // read next chunk into buffer; already matched chars are added to beginning of buffer - while ((bufSize = source.Read(strmBuf, charMatchCount, STRMBUFLEN - charMatchCount)) > 0) - { - bufSize += charMatchCount; - - int bIdx = charMatchCount; - int quickTestIdx; - - // iterate over buffer, trying to find keyword match - for (int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++) - { - // reduce compare operations by first test last character we would have to - // match if current one matches; if it is not a character from keywords - // we can move behind the test character; this shortcut is inspired by the - // Boyer-Moore string search algorithm and can reduce parsing time by approx. 20% - quickTestIdx = bIdx + quickTestOffset; - if (charMatchCount == 0 && quickTestIdx < maxQuicktestIdx) - { - byte ch = strmBuf[quickTestIdx]; - if ((ch > 't') || (ch < 'a')) - { - // last character we would have to match if current character would match - // is not a character from keywords -> jump behind and start over - bIdx = quickTestIdx; - continue; - } - } - - // could be negative - but we only compare to ASCII - byte ch1 = strmBuf[bIdx]; - - if (ch1 == keyw[charMatchCount]) - { - if (++charMatchCount == keyw.Length) - { - // match found - bIdx++; - break; - } - } - else - { - if ((charMatchCount == 3) && (ch1 == ENDOBJ[charMatchCount])) - { - // maybe ENDSTREAM is missing but we could have ENDOBJ - keyw = ENDOBJ; - charMatchCount++; - } - else - { - // no match; incrementing match start by 1 would be dumb since we already know - // matched chars depending on current char read we may already have beginning - // of a new match: 'e': first char matched; 'n': if we are at match position - // idx 7 we already read 'e' thus 2 chars matched for each other char we have - // to start matching first keyword char beginning with next read position - charMatchCount = (ch1 == 'e') ? 1 : ((ch1 == 'n') && (charMatchCount == 7)) ? 2 : 0; - // search again for 'endstream' - keyw = ENDSTREAM; - } - } - } - - int contentBytes = Math.Max(0, bIdx - charMatchCount); - - // write buffer content until first matched char to output stream - if (contentBytes > 0) - { - output.Write(strmBuf, 0, contentBytes); - } - if (charMatchCount == keyw.Length) - { - // keyword matched; unread matched keyword (endstream/endobj) and following buffered content - source.Rewind(bufSize - contentBytes); - break; - } - - // copy matched chars at start of buffer - Array.Copy(keyw, 0, strmBuf, 0, charMatchCount); - } - // this writes a lonely CR or drops trailing CR LF and LF - // output.flush(); - } - - - } -} diff --git a/src/UglyToad.PdfPig/Parser/Parts/DirectObjectFinder.cs b/src/UglyToad.PdfPig/Parser/Parts/DirectObjectFinder.cs index 457834a0..378203d6 100644 --- a/src/UglyToad.PdfPig/Parser/Parts/DirectObjectFinder.cs +++ b/src/UglyToad.PdfPig/Parser/Parts/DirectObjectFinder.cs @@ -1,37 +1,11 @@ namespace UglyToad.PdfPig.Parser.Parts { - using System; - using Cos; using Exceptions; - using IO; using Tokenization.Scanner; using Tokenization.Tokens; internal static class DirectObjectFinder { - public static T Find(CosObject baseObject, IPdfObjectParser parser, IRandomAccessRead reader, - bool isLenientParsing) where T : CosBase - { - var result = parser.Parse(baseObject.ToIndirectReference(), reader, isLenientParsing); - - if (result is T resultT) - { - return resultT; - } - - if (result is CosObject obj) - { - return Find(obj, parser, reader, isLenientParsing); - } - - if (result is COSArray arr && arr.Count == 1 && arr.get(0) is CosObject arrayObject) - { - return Find(arrayObject, parser, reader, isLenientParsing); - } - - throw new InvalidOperationException($"Could not find the object {baseObject.ToIndirectReference()} with type {typeof(T).Name}."); - } - public static T Get(IToken token, IPdfTokenScanner scanner) where T : IToken { if (token is T result) diff --git a/src/UglyToad.PdfPig/Util/Bootstrapper.cs b/src/UglyToad.PdfPig/Util/Bootstrapper.cs index 965e41c4..ea59e70a 100644 --- a/src/UglyToad.PdfPig/Util/Bootstrapper.cs +++ b/src/UglyToad.PdfPig/Util/Bootstrapper.cs @@ -3,7 +3,6 @@ using Filters; using Fonts.Parser; using Logging; - using Parser; using Parser.FileStructure; using Parser.Parts; using Parser.Parts.CrossReference; @@ -34,10 +33,8 @@ var nameParser = new CosNameParser(); var dictionaryParser = new CosDictionaryParser(nameParser, logger); var baseParser = new CosBaseParser(nameParser, new CosStringParser(), dictionaryParser, new CosArrayParser()); - var streamParser = new CosStreamParser(logger); var filterProvider = new MemoryFilterProvider(new DecodeParameterResolver(logger), new PngPredictor(), logger); var crossReferenceParser = new CrossReferenceStreamParser(filterProvider); - var objectStreamParser = new ObjectStreamParser(logger, filterProvider, baseParser); var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, crossReferenceParser, new CrossReferenceTableParser()); @@ -50,10 +47,8 @@ container.Register(nameParser); container.Register(dictionaryParser); container.Register(baseParser); - container.Register(streamParser); container.Register(crossReferenceParser); container.Register(crossReferenceTableParser); - container.Register(objectStreamParser); container.Register(filterProvider); container.Register(cmapParser); container.Register(afmParser);