mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-14 19:05:01 +08:00
finish the migration
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
{
|
||||
using System;
|
||||
using Content;
|
||||
using ContentStream;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Parts;
|
||||
|
@@ -1,236 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Parser
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
using Util;
|
||||
|
||||
/// <summary>
/// Resolves indirect objects on demand. An object is first looked up in the object pool, then located
/// through the cross-reference table (optionally falling back to a brute-force search) and finally
/// parsed either directly from the file or out of the object stream which contains it.
/// </summary>
internal class DynamicParser
{
    private readonly ILog log;
    private readonly CosBaseParser baseParser;
    private readonly CosStreamParser streamParser;
    private readonly ObjectStreamParser objectStreamParser;

    /// <summary>
    /// Creates a new <see cref="DynamicParser"/>.
    /// </summary>
    /// <param name="log">Receives warnings and errors; stored as given without a null check.</param>
    /// <param name="baseParser">Parses the object which follows an "obj" header.</param>
    /// <param name="streamParser">Parses the stream data which follows a stream dictionary.</param>
    /// <param name="objectStreamParser">Parses the objects contained in an object stream.</param>
    /// <exception cref="ArgumentNullException">Any of the three parsers is null.</exception>
    public DynamicParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, ObjectStreamParser objectStreamParser)
    {
        this.log = log;
        this.baseParser = baseParser ?? throw new ArgumentNullException(nameof(baseParser));
        this.streamParser = streamParser ?? throw new ArgumentNullException(nameof(streamParser));
        this.objectStreamParser = objectStreamParser ?? throw new ArgumentNullException(nameof(objectStreamParser));
    }

    /// <summary>
    /// Convenience overload which unpacks the reader, pool, cross-reference table, brute-force searcher
    /// and leniency flag from <paramref name="arguments"/> and forwards to the main overload.
    /// </summary>
    public CosBase Parse(ParsingArguments arguments, CosObject obj, bool requiresExistingObject)
    {
        return Parse(arguments.Reader, obj, arguments.CachingProviders.ObjectPool,
            arguments.CrossReferenceTable, arguments.CachingProviders.BruteForceSearcher,
            arguments.IsLenientParsing,
            requiresExistingObject);
    }

    /// <summary>
    /// Resolves the object identified by the object and generation number of <paramref name="obj"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="obj"/> is null.</exception>
    public CosBase Parse(IRandomAccessRead reader, CosObject obj, CosObjectPool pool,
        CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenient, bool requireExistingObject)
    {
        if (obj == null)
        {
            throw new ArgumentNullException(nameof(obj));
        }

        return Parse(reader, obj.GetObjectNumber(), obj.GetGenerationNumber(), pool,
            crossReferenceTable, bruteForceSearcher, isLenient, requireExistingObject);
    }

    /// <summary>
    /// Resolves the object with the given number and generation.
    /// </summary>
    /// <param name="reader">The source to read the object from when it is not already in the pool.</param>
    /// <param name="objectNumber">The object number to resolve.</param>
    /// <param name="objectGeneration">The generation number to resolve.</param>
    /// <param name="pool">The pool consulted first; an entry is created for the key when none exists.</param>
    /// <param name="crossReferenceTable">Maps the key to a file offset, or to a non-positive value for compressed objects.</param>
    /// <param name="bruteForceSearcher">Used in lenient mode only, when the table has no entry for the key.</param>
    /// <param name="isLenient">Whether recoverable problems are logged instead of thrown.</param>
    /// <param name="requireExistingObject">Whether the object must have a positive offset in the table.</param>
    /// <returns>
    /// The object held by the pool entry when it already has one, the parsed object otherwise,
    /// or <c>CosNull.Null</c> when no location could be found for the key.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="pool"/> is null, or <paramref name="crossReferenceTable"/> is null and the pool entry was empty.
    /// </exception>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="requireExistingObject"/> is set and the table entry is missing or not positive.
    /// </exception>
    public CosBase Parse(IRandomAccessRead reader, long objectNumber, int objectGeneration,
        CosObjectPool pool, CrossReferenceTable crossReferenceTable,
        BruteForceSearcher bruteForceSearcher,
        bool isLenient,
        bool requireExistingObject)
    {
        if (pool == null)
        {
            throw new ArgumentNullException(nameof(pool));
        }

        var key = new IndirectReference(objectNumber, objectGeneration);

        var pdfObject = pool.GetOrCreateDefault(key);

        if (pdfObject.GetObject() != null)
        {
            return pdfObject.GetObject();
        }

        if (crossReferenceTable == null)
        {
            throw new ArgumentNullException(nameof(crossReferenceTable));
        }

        var offsetOrStreamNumber = TryGet(key, crossReferenceTable.ObjectOffsets);

        if (requireExistingObject && (offsetOrStreamNumber == null || offsetOrStreamNumber <= 0))
        {
            throw new InvalidOperationException("Object must be defined and not compressed: " + key);
        }

        if (isLenient && offsetOrStreamNumber == null)
        {
            var locations = bruteForceSearcher.GetObjectLocations();

            offsetOrStreamNumber = TryGet(key, locations);

            if (offsetOrStreamNumber != null)
            {
                // Remember the recovered location so later lookups find it in the table.
                crossReferenceTable.UpdateOffset(key, offsetOrStreamNumber.Value);
            }
        }

        if (offsetOrStreamNumber == null)
        {
            return CosNull.Null;
        }

        // A non-positive entry is treated as the negated number of the object stream holding the object.
        var isCompressedStreamObject = offsetOrStreamNumber <= 0;

        if (!isCompressedStreamObject)
        {
            return ParseObjectFromFile(offsetOrStreamNumber.Value, reader, key, pool, isLenient);
        }

        return ParseCompressedStreamObject(reader, -offsetOrStreamNumber.Value, objectNumber, pool, crossReferenceTable, bruteForceSearcher, isLenient);
    }

    /// <summary>
    /// Seeks to <paramref name="offset"/>, checks that the object header found there matches
    /// <paramref name="key"/> and parses the object (including its stream data, if any).
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The header at the offset identifies a different object, or the object is not terminated by
    /// "endobj" and <paramref name="isLenientParsing"/> is false.
    /// </exception>
    private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
        IndirectReference key,
        CosObjectPool pool,
        bool isLenientParsing)
    {
        reader.Seek(offset);

        var objectNumber = ObjectHelper.ReadObjectNumber(reader);
        var objectGeneration = ObjectHelper.ReadGenerationNumber(reader);

        ReadHelper.ReadExpectedString(reader, "obj", true);

        if (objectNumber != key.ObjectNumber || objectGeneration != key.Generation)
        {
            throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}");
        }

        ReadHelper.SkipSpaces(reader);

        var baseObject = baseParser.Parse(reader, pool);

        var endObjectKey = ReadHelper.ReadString(reader);

        var atStreamStart = string.Equals(endObjectKey, "stream");

        if (atStreamStart)
        {
            var streamStartBytes = OtherEncodings.StringAsLatin1Bytes(endObjectKey);

            // Put the "stream" keyword back so the stream parser can consume it itself.
            reader.Rewind(streamStartBytes.Length);

            baseObject = ReadNormalObjectStream(reader, baseObject, offset, isLenientParsing, out endObjectKey);
        }

        // Only complain when the terminator really is missing. Without this guard every object was
        // reported as unterminated, which made strict parsing throw for well-formed objects.
        if (!string.Equals(endObjectKey, "endobj"))
        {
            var message =
                $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{endObjectKey}\'";

            if (isLenientParsing)
            {
                log.Warn(message);
            }
            else
            {
                throw new InvalidOperationException(message);
            }
        }

        return baseObject;
    }

    /// <summary>
    /// Parses the stream data following the dictionary <paramref name="currentBase"/> and reads the
    /// keyword which follows the stream into <paramref name="endObjectKey"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException"><paramref name="currentBase"/> is not a dictionary.</exception>
    private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset,
        bool isLenientParsing,
        out string endObjectKey)
    {
        if (currentBase is PdfDictionary dictionary)
        {
            // No object parser is passed on (null), so the stream parser cannot resolve an indirect
            // length which has not been loaded yet.
            PdfRawStream stream = streamParser.Parse(reader, dictionary, isLenientParsing, null);

            currentBase = stream;
        }
        else
        {
            // this is not legal
            // the combination of a dict and the stream/endstream
            // forms a complete stream object
            throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
        }

        ReadHelper.SkipSpaces(reader);
        endObjectKey = ReadHelper.ReadLine(reader);

        // we have case with a second 'endstream' before endobj
        if (!endObjectKey.StartsWith("endobj") && endObjectKey.StartsWith("endstream"))
        {
            endObjectKey = endObjectKey.Substring(9).Trim();
            if (endObjectKey.Length == 0)
            {
                // no other characters in extra endstream line
                // read next line
                endObjectKey = ReadHelper.ReadLine(reader);
            }
        }

        return currentBase;
    }

    /// <summary>
    /// Loads the object stream <paramref name="streamObjectNumber"/> (generation 0), stores each contained
    /// object which the cross-reference table assigns to that stream in the pool and returns the
    /// contained object numbered <paramref name="requestedNumber"/>.
    /// </summary>
    /// <returns>The matching object, or <c>CosNull.Null</c> when the stream or the object is not found.</returns>
    private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, CosObjectPool objectPool, CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenientParsing)
    {
        var baseStream = Parse(reader, streamObjectNumber, 0, objectPool, crossReferenceTable, bruteForceSearcher,
            isLenientParsing, true);

        if (!(baseStream is PdfRawStream stream))
        {
            log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}");

            return CosNull.Null;
        }

        var objects = objectStreamParser.Parse(stream, objectPool);

        // register all objects which are referenced to be contained in object stream
        foreach (var next in objects)
        {
            var streamKey = new IndirectReference(next.GetObjectNumber(), next.GetGenerationNumber());
            var offset = TryGet(streamKey, crossReferenceTable.ObjectOffsets);

            if (offset != null && offset == -streamObjectNumber)
            {
                var streamObject = objectPool.Get(streamKey);
                streamObject.SetObject(next.GetObject());
            }
        }

        var matchingStreamObject = objects.FirstOrDefault(x => x.GetObjectNumber() == requestedNumber);

        if (matchingStreamObject != null)
        {
            return matchingStreamObject;
        }

        log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull.");

        return CosNull.Null;
    }

    /// <summary>
    /// Returns the value stored for <paramref name="key"/>, or null when the dictionary has no such key.
    /// </summary>
    private static T? TryGet<T, TKey>(TKey key, IReadOnlyDictionary<TKey, T> dictionary) where T : struct
    {
        return dictionary.TryGetValue(key, out var value) ? value : default(T?);
    }
}
|
||||
}
|
@@ -99,7 +99,7 @@
|
||||
{
|
||||
try
|
||||
{
|
||||
streamPart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
|
||||
streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
|
@@ -1,215 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Parser.FileStructure
|
||||
{
|
||||
using System;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
using Parts.CrossReference;
|
||||
using Util;
|
||||
|
||||
/// <summary>
/// Parses a classic (non-stream) cross-reference table: the "xref" keyword, one or more subsections
/// of entries and the trailer dictionary which follows them.
/// </summary>
internal class OldCrossReferenceTableParser
{
    private const string InUseEntry = "n";
    private const string FreeEntry = "f";

    private readonly ILog log;
    private readonly IDictionaryParser dictionaryParser;
    private readonly IBaseParser baseParser;

    /// <summary>
    /// Creates a new <see cref="OldCrossReferenceTableParser"/>. The arguments are stored as given.
    /// </summary>
    public OldCrossReferenceTableParser(ILog log, IDictionaryParser dictionaryParser, IBaseParser baseParser)
    {
        this.log = log;
        this.dictionaryParser = dictionaryParser;
        this.baseParser = baseParser;
    }

    /// <summary>
    /// Tries to read a cross-reference table starting at the current position of <paramref name="source"/>.
    /// </summary>
    /// <param name="source">The input, positioned where the table is expected to start.</param>
    /// <param name="offset">The offset recorded on the builder and used in diagnostics.</param>
    /// <param name="isLenientParsing">Whether an unreadable subsection header ends the table instead of failing.</param>
    /// <param name="pool">Passed on to the dictionary parser when the trailer is read.</param>
    /// <param name="builder">Receives the in-use entries and the trailer's CosName.PREV value.</param>
    /// <returns>
    /// False when the input does not start with "xref", when "xref" is directly followed by the trailer,
    /// or when a subsection header cannot be read in strict mode; true otherwise.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// An entry is corrupt (bad number, offset inside the table itself, unknown indicator) or no trailer
    /// could be read after the entries.
    /// </exception>
    public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
    {
        builder = null;

        var tableStartOffset = source.GetPosition();

        if (source.Peek() != 'x')
        {
            return false;
        }

        var xref = ReadHelper.ReadString(source);
        if (!xref.Trim().Equals("xref"))
        {
            return false;
        }

        // check for trailer after xref
        var str = ReadHelper.ReadString(source);
        byte[] b = OtherEncodings.StringAsLatin1Bytes(str);

        // Only peeking: put the token back for whichever reader handles it next.
        source.Rewind(b.Length);

        if (str.StartsWith("trailer", StringComparison.Ordinal))
        {
            log.Warn("skipping empty xref table");
            return false;
        }

        builder = new CrossReferenceTablePartBuilder
        {
            Offset = offset,
            XRefType = CrossReferenceType.Table
        };

        // Tables can have multiple sections. Each starts with a starting object id and a count.
        while (true)
        {
            if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
            {
                log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

                if (isLenientParsing)
                {
                    break;
                }

                return false;
            }

            var currentObjectId = subsectionDefinition.FirstNumber;

            ReadHelper.SkipSpaces(source);
            for (var i = 0; i < subsectionDefinition.Count; i++)
            {
                if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
                {
                    break;
                }

                // A 't' here is taken to be the start of the trailer: the subsection is shorter than declared.
                if (source.Peek() == 't')
                {
                    break;
                }

                var currentLine = ReadHelper.ReadLine(source);
                var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                if (splitString.Length < 3)
                {
                    log.Warn("invalid xref line: " + currentLine);
                    break;
                }

                // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
                if (splitString[splitString.Length - 1].Equals(InUseEntry))
                {
                    // The numbers are file syntax, not user text: parse them culture-independently and
                    // report every malformed or out-of-range value as the same exception type.
                    if (!long.TryParse(splitString[0], System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture, out var objectOffset))
                    {
                        throw new InvalidOperationException(
                            $"Invalid offset '{splitString[0]}' in the cross-reference table entry for object {currentObjectId}: '{currentLine}'");
                    }

                    if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                    {
                        // PDFBOX-3923: offset points inside this table - that can't be good
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                    }

                    if (!int.TryParse(splitString[1], System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture, out var generation))
                    {
                        throw new InvalidOperationException(
                            $"Invalid generation '{splitString[1]}' in the cross-reference table entry for object {currentObjectId}: '{currentLine}'");
                    }

                    builder.Add(currentObjectId, generation, objectOffset);
                }
                else if (!splitString[2].Equals(FreeEntry))
                {
                    throw new InvalidOperationException(
                        $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
                }

                // Free entries are not recorded but still consume an object id.
                currentObjectId++;

                ReadHelper.SkipSpaces(source);
            }

            ReadHelper.SkipSpaces(source);
            if (!ReadHelper.IsDigit(source))
            {
                break;
            }
        }

        if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
        {
            throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
        }

        // The trailer dictionary itself is not stored on the builder, only its CosName.PREV value.
        builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

        return true;
    }

    /// <summary>
    /// Tries to read the "trailer" keyword and the dictionary which follows it.
    /// </summary>
    /// <returns>False when the input is not positioned at a "trailer" keyword; true once the dictionary was parsed.</returns>
    private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
    {
        trailer = null;
        // parse the last trailer.
        var trailerOffset = source.GetPosition();
        // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
        if (isLenientParsing)
        {
            int nextCharacter = source.Peek();
            while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
            {
                if (source.GetPosition() == trailerOffset)
                {
                    // warn only the first time
                    log.Warn($"Expected trailer object at position {trailerOffset}, keep trying");
                }
                ReadHelper.ReadLine(source);
                nextCharacter = source.Peek();
            }
        }
        if (source.Peek() != 't')
        {
            return false;
        }
        //read "trailer"
        long currentOffset = source.GetPosition();
        string nextLine = ReadHelper.ReadLine(source);
        if (!nextLine.Trim().Equals("trailer"))
        {
            // in some cases the EOL is missing and the trailer immediately
            // continues with "<<" or with a blank character
            // even if this does not comply with PDF reference we want to support as many PDFs as possible
            // Acrobat reader can also deal with this.
            // Ordinal comparison: the seek below assumes the keyword occupies exactly its own length.
            if (nextLine.StartsWith("trailer", StringComparison.Ordinal))
            {
                // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes
                int len = "trailer".Length;
                // jump back right after "trailer"
                source.Seek(currentOffset + len);
            }
            else
            {
                return false;
            }
        }

        // in some cases the EOL is missing and the trailer continues with " <<"
        // even if this does not comply with PDF reference we want to support as many PDFs as possible
        // Acrobat reader can also deal with this.
        ReadHelper.SkipSpaces(source);

        PdfDictionary parsedTrailer = dictionaryParser.Parse(source, baseParser, pool);

        trailer = parsedTrailer;

        ReadHelper.SkipSpaces(source);
        return true;
    }
}
|
||||
}
|
@@ -1,223 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Parser
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
using Util;
|
||||
|
||||
/// <summary>
/// Resolves an indirect reference to the object it identifies.
/// </summary>
internal interface IPdfObjectParser
{
    /// <summary>
    /// Returns the object identified by <paramref name="indirectReference"/>.
    /// </summary>
    CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false);
}

/// <summary>
/// <see cref="IPdfObjectParser"/> which looks objects up in the object pool first, then locates them via
/// the cross-reference table (or, in lenient mode, the brute-force searcher) and parses them from the
/// file or from the object stream which contains them.
/// </summary>
internal class PdfObjectParser : IPdfObjectParser
{
    private readonly ILog log;
    private readonly CosBaseParser baseParser;
    private readonly CosStreamParser streamParser;
    private readonly CrossReferenceTable crossReferenceTable;
    private readonly BruteForceSearcher bruteForceSearcher;
    private readonly CosObjectPool objectPool;
    private readonly ObjectStreamParser objectStreamParser;

    /// <summary>
    /// Creates a new <see cref="PdfObjectParser"/>. A null <paramref name="log"/> is replaced by a
    /// <see cref="NoOpLog"/>; every other argument is required.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="log"/> is null.</exception>
    public PdfObjectParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, CrossReferenceTable crossReferenceTable,
        BruteForceSearcher bruteForceSearcher,
        CosObjectPool objectPool,
        ObjectStreamParser objectStreamParser)
    {
        this.log = log ?? new NoOpLog();
        this.baseParser = baseParser ?? throw new ArgumentNullException(nameof(baseParser));
        this.streamParser = streamParser ?? throw new ArgumentNullException(nameof(streamParser));
        this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
        this.bruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher));
        this.objectPool = objectPool ?? throw new ArgumentNullException(nameof(objectPool));
        this.objectStreamParser = objectStreamParser ?? throw new ArgumentNullException(nameof(objectStreamParser));
    }

    /// <summary>
    /// Returns the object for <paramref name="indirectReference"/>: the one already held by the pool
    /// entry when present, otherwise the result of parsing it from <paramref name="reader"/>.
    /// </summary>
    /// <returns>
    /// The resolved object, or <c>CosNull.Null</c> when it cannot be located and parsing is lenient.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="requireExistingObject"/> is set and the table entry is missing or not positive,
    /// or the object cannot be located and parsing is strict.
    /// </exception>
    public CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false)
    {
        var reference = new IndirectReference(indirectReference.ObjectNumber, indirectReference.Generation);

        var pooled = objectPool.GetOrCreateDefault(reference);
        if (pooled.GetObject() != null)
        {
            return pooled.GetObject();
        }

        var location = TryGet(reference, crossReferenceTable.ObjectOffsets);

        if (requireExistingObject)
        {
            var isUsable = location != null && location > 0;
            if (!isUsable)
            {
                throw new InvalidOperationException("Object must be defined and not compressed: " + reference);
            }
        }

        if (location == null && isLenientParsing)
        {
            // Not in the table: fall back to whatever the brute-force search found and remember it.
            location = TryGet(reference, bruteForceSearcher.GetObjectLocations());

            if (location.HasValue)
            {
                crossReferenceTable.UpdateOffset(reference, location.Value);
            }
        }

        if (location == null)
        {
            if (!isLenientParsing)
            {
                throw new InvalidOperationException($"Could not locate the object {reference.ObjectNumber} which was not found in the cross reference table.");
            }

            return CosNull.Null;
        }

        // Positive values are file offsets; anything else is the negated number of the containing object stream.
        return location > 0
            ? ParseObjectFromFile(location.Value, reader, reference, objectPool, isLenientParsing)
            : ParseCompressedStreamObject(reader, -location.Value, indirectReference.ObjectNumber, isLenientParsing);
    }

    /// <summary>
    /// Reads the object whose header starts at <paramref name="offset"/> after verifying that the
    /// header matches <paramref name="key"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The header identifies another object, or the object does not end with "endobj" in strict mode.
    /// </exception>
    private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
        IndirectReference key,
        CosObjectPool pool,
        bool isLenientParsing)
    {
        reader.Seek(offset);

        var number = ObjectHelper.ReadObjectNumber(reader);
        var generation = ObjectHelper.ReadGenerationNumber(reader);

        ReadHelper.ReadExpectedString(reader, "obj", true);

        var headerMatchesKey = number == key.ObjectNumber && generation == key.Generation;
        if (!headerMatchesKey)
        {
            throw new InvalidOperationException($"Xref for {key} points to object {number} {generation} at {offset}");
        }

        ReadHelper.SkipSpaces(reader);

        var parsed = baseParser.Parse(reader, pool);

        var terminator = ReadHelper.ReadString(reader);

        if (string.Equals(terminator, "stream"))
        {
            // Hand the keyword back: the stream parser expects to read it itself.
            reader.Rewind(OtherEncodings.StringAsLatin1Bytes(terminator).Length);

            parsed = ReadNormalObjectStream(reader, parsed, offset, isLenientParsing, out terminator);
        }

        if (string.Equals(terminator, "endobj"))
        {
            return parsed;
        }

        var problem =
            $"Object ({number}:{generation}) at offset {offset} does not end with \'endobj\' but with \'{terminator}\'";

        if (!isLenientParsing)
        {
            throw new InvalidOperationException(problem);
        }

        log.Warn(problem);

        return parsed;
    }

    /// <summary>
    /// Turns the dictionary <paramref name="currentBase"/> plus the stream data which follows it into a
    /// stream object, and reports the keyword found after the stream through <paramref name="endObjectKey"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException"><paramref name="currentBase"/> is not a dictionary.</exception>
    private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset,
        bool isLenientParsing,
        out string endObjectKey)
    {
        if (!(currentBase is PdfDictionary dictionary))
        {
            // Stream data is only legal directly after a dictionary; together they form the stream object.
            throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
        }

        PdfRawStream rawStream = streamParser.Parse(reader, dictionary, isLenientParsing, this);

        ReadHelper.SkipSpaces(reader);
        var line = ReadHelper.ReadLine(reader);

        // Some files carry a second 'endstream' before 'endobj'.
        if (!line.StartsWith("endobj") && line.StartsWith("endstream"))
        {
            line = line.Substring("endstream".Length).Trim();

            if (line.Length == 0)
            {
                // Nothing else on the extra 'endstream' line, so the keyword is on the next one.
                line = ReadHelper.ReadLine(reader);
            }
        }

        endObjectKey = line;

        return rawStream;
    }

    /// <summary>
    /// Loads object stream <paramref name="streamObjectNumber"/> (generation 0), stores the contained
    /// objects which the cross-reference table assigns to it in the pool and returns the one numbered
    /// <paramref name="requestedNumber"/>.
    /// </summary>
    /// <returns>The matching object, or <c>CosNull.Null</c> when the stream or the object is not found.</returns>
    private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, bool isLenientParsing)
    {
        var container = Parse(new IndirectReference(streamObjectNumber, 0), reader, isLenientParsing, true);

        if (!(container is PdfRawStream objectStream))
        {
            log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}");

            return CosNull.Null;
        }

        var contained = objectStreamParser.Parse(objectStream, objectPool);

        // Register every object the table says lives in this particular stream.
        foreach (var entry in contained)
        {
            var entryKey = new IndirectReference(entry.GetObjectNumber(), entry.GetGenerationNumber());
            var recorded = TryGet(entryKey, crossReferenceTable.ObjectOffsets);

            if (recorded.HasValue && recorded.Value == -streamObjectNumber)
            {
                objectPool.Get(entryKey).SetObject(entry.GetObject());
            }
        }

        var requested = contained.FirstOrDefault(candidate => candidate.GetObjectNumber() == requestedNumber);

        if (requested == null)
        {
            log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull.");

            return CosNull.Null;
        }

        return requested;
    }

    /// <summary>
    /// Looks <paramref name="key"/> up in <paramref name="dictionary"/>, yielding null when it is absent.
    /// </summary>
    private static T? TryGet<T, TKey>(TKey key, IReadOnlyDictionary<TKey, T> dictionary) where T : struct
    {
        if (dictionary.TryGetValue(key, out var found))
        {
            return found;
        }

        return null;
    }
}
|
||||
}
|
@@ -1,82 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Parser
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using ContentStream.TypedAccessors;
|
||||
using Cos;
|
||||
using Filters;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
|
||||
/// <summary>
/// Extracts the objects stored inside an object stream.
/// </summary>
internal class ObjectStreamParser
{
    private readonly ILog log;
    private readonly IFilterProvider filterProvider;
    private readonly CosBaseParser baseParser;

    /// <summary>
    /// Creates a new <see cref="ObjectStreamParser"/>. The arguments are stored as given.
    /// </summary>
    public ObjectStreamParser(ILog log, IFilterProvider filterProvider, CosBaseParser baseParser)
    {
        this.log = log;
        this.filterProvider = filterProvider;
        this.baseParser = baseParser;
    }

    /// <summary>
    /// Decodes <paramref name="stream"/> and parses the objects it contains.
    /// </summary>
    /// <param name="stream">The object stream; its CosName.N entry gives the number of header pairs to read.</param>
    /// <param name="pool">Passed on to the base parser for every contained object.</param>
    /// <returns>
    /// The parsed objects in stream order, each with generation 0 and the object number read from the
    /// corresponding header pair. Objects beyond the number of header pairs are not returned.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public IReadOnlyList<CosObject> Parse(PdfRawStream stream, CosObjectPool pool)
    {
        if (stream == null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        //need to first parse the header.
        var numberOfObjects = stream.Dictionary.GetIntOrDefault(CosName.N);

        // The count comes from the document and cannot be trusted as a capacity: a negative value
        // makes the List constructor throw and a huge one requests a huge allocation up front.
        // Let the lists grow with what is actually read instead.
        var objectNumbers = new List<long>();
        var streamObjects = new List<CosObject>();

        var bytes = stream.Decode(filterProvider);

        var reader = new RandomAccessBuffer(bytes);

        // The header is a sequence of (object number, offset) pairs; only the numbers are kept.
        for (int i = 0; i < numberOfObjects; i++)
        {
            long objectNumber = ObjectHelper.ReadObjectNumber(reader);
            // skip offset
            ReadHelper.ReadLong(reader);
            objectNumbers.Add(objectNumber);
        }

        CosBase cosObject;
        int objectCounter = 0;
        while ((cosObject = baseParser.Parse(reader, pool)) != null)
        {
            if (objectCounter >= objectNumbers.Count)
            {
                log.Error("/ObjStm (object stream) has more objects than /N " + numberOfObjects);
                break;
            }

            var obj = new CosObject(cosObject);
            obj.SetGenerationNumber(0);
            obj.SetObjectNumber(objectNumbers[objectCounter]);
            streamObjects.Add(obj);

            // According to the spec objects within an object stream shall not be enclosed
            // by obj/endobj tags, but there are some pdfs in the wild using those tags
            // skip endobject marker if present
            if (!reader.IsEof() && reader.Peek() == 'e')
            {
                ReadHelper.ReadLine(reader);
            }

            objectCounter++;
        }

        return streamObjects;
    }
}
|
||||
}
|
@@ -1,332 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using System.IO;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Util;
|
||||
|
||||
internal class CosStreamParser
|
||||
{
|
||||
private static readonly int STREAMCOPYBUFLEN = 8192;
|
||||
private static readonly int STRMBUFLEN = 2048;
|
||||
private static readonly byte[] ENDOBJ =
|
||||
{
|
||||
(byte) 'E', (byte) 'N', (byte) 'D',
|
||||
(byte) 'O', (byte) 'B', (byte) 'J'
|
||||
};
|
||||
private static readonly byte[] ENDSTREAM =
|
||||
{
|
||||
(byte) 'E', (byte) 'N', (byte) 'D',
|
||||
(byte) 'S', (byte) 'T', (byte) 'R', (byte) 'E', (byte) 'A', (byte) 'M'
|
||||
};
|
||||
|
||||
private readonly ILog log;
|
||||
private readonly byte[] streamCopyBuf = new byte[STREAMCOPYBUFLEN];
|
||||
private readonly byte[] strmBuf = new byte[STRMBUFLEN];
|
||||
|
||||
/// <summary>
/// Creates a stream parser which writes its diagnostics to <paramref name="log"/>.
/// The log is stored as given, without a null check.
/// </summary>
public CosStreamParser(ILog log) => this.log = log;
|
||||
|
||||
/// <summary>
/// Reads a stream body: the "stream" keyword, the data and the closing keyword.
/// </summary>
/// <param name="reader">The input, positioned at the "stream" keyword.</param>
/// <param name="streamDictionary">The dictionary preceding the stream; supplies the length entry.</param>
/// <param name="isLenientParsing">Whether a missing length or a malformed closing keyword is tolerated.</param>
/// <param name="parser">Used to resolve a length given as a reference to an object not loaded yet.</param>
/// <returns>The raw stream bytes together with <paramref name="streamDictionary"/>.</returns>
/// <exception cref="InvalidOperationException">
/// The length is missing in strict mode, or the data is not followed by an acceptable closing keyword.
/// </exception>
public PdfRawStream Parse(IRandomAccessRead reader, PdfDictionary streamDictionary, bool isLenientParsing, IPdfObjectParser parser)
{
    // The caller has already seen the 'stream' keyword; consume it and the end-of-line after it.
    ReadHelper.ReadExpectedString(reader, "stream");
    skipWhiteSpaces(reader);

    // Ask the dictionary for the raw item: while parsing, a referenced length may not be loaded yet.
    var lengthItem = streamDictionary.GetItemOrDefault(CosName.LENGTH);
    var typeName = streamDictionary.GetName(CosName.TYPE);
    ICosNumber declaredLength = GetLength(reader, lengthItem, typeName, isLenientParsing, parser);

    ValidateStreamLength(reader, isLenientParsing, declaredLength);

    PdfRawStream parsedStream;

    // Collect the stream data in memory.
    using (var buffer = new MemoryStream())
    using (var output = new BinaryWriter(buffer))
    {
        var lengthIsUsable = declaredLength != null
                             && validateStreamLength(reader, declaredLength.AsLong(), reader.Length());

        if (lengthIsUsable)
        {
            ReadValidStream(reader, output, declaredLength);
        }
        else
        {
            ReadUntilEndStream(reader, output);
        }

        parsedStream = new PdfRawStream(buffer.ToArray(), streamDictionary);
    }

    const string endObjectKeyword = "endobj";
    const string endStreamKeyword = "endstream";

    string closingKeyword = ReadHelper.ReadString(reader);

    if (closingKeyword.Equals(endObjectKeyword) && isLenientParsing)
    {
        log.Warn($"stream ends with \'endobj\' instead of \'endstream\' at offset {reader.GetPosition()}");

        // Leave 'endobj' in the input so the caller does not report it as missing.
        reader.Rewind(endObjectKeyword.Length);
    }
    else if (closingKeyword.Length > endStreamKeyword.Length
             && isLenientParsing
             && closingKeyword.StartsWith(endStreamKeyword, StringComparison.Ordinal))
    {
        log.Warn($"stream ends with '{closingKeyword}' instead of 'endstream' at offset {reader.GetPosition()}");

        // Give back whatever was glued onto the keyword.
        var surplus = closingKeyword.Substring(endStreamKeyword.Length);
        reader.Rewind(OtherEncodings.StringAsLatin1Bytes(surplus).Length);
    }
    else if (!closingKeyword.Equals(endStreamKeyword))
    {
        throw new InvalidOperationException(
            $"Error reading stream, expected='endstream' actual='{closingKeyword}' at offset {reader.GetPosition()}");
    }

    return parsedStream;
}
|
||||
|
||||
/// <summary>
/// Reacts to a stream without a length: logs a warning in lenient mode, throws otherwise.
/// Does nothing when <paramref name="streamLength"/> is present.
/// </summary>
/// <exception cref="InvalidOperationException">The length is missing and parsing is strict.</exception>
private void ValidateStreamLength(IRandomAccessRead reader, bool isLenientParsing, ICosNumber streamLength)
{
    var lengthIsMissing = streamLength == null;

    if (!lengthIsMissing)
    {
        return;
    }

    if (!isLenientParsing)
    {
        throw new InvalidOperationException("Missing length for stream.");
    }

    log.Warn("The stream doesn't provide any stream length, using fallback readUntilEnd, at offset " +
             reader.GetPosition());
}
|
||||
|
||||
/// <summary>
/// Works out the stream length from the dictionary's length entry, which may be a number or a
/// reference to a number.
/// </summary>
/// <param name="source">The input; its position is restored after a referenced length has been parsed.</param>
/// <param name="lengthBaseObj">The length entry, or null when the dictionary has none.</param>
/// <param name="streamType">Not used by this method.</param>
/// <param name="isLenientParsing">Passed on when the referenced length object has to be parsed.</param>
/// <param name="parser">Needed only when the referenced length object has not been loaded yet.</param>
/// <returns>The length, or null when <paramref name="lengthBaseObj"/> is null.</returns>
/// <exception cref="InvalidOperationException">
/// The entry (or the object it refers to) is not a number, or the referenced object must be parsed
/// and <paramref name="parser"/> is null.
/// </exception>
private ICosNumber GetLength(IRandomAccessRead source, CosBase lengthBaseObj, CosName streamType, bool isLenientParsing, IPdfObjectParser parser)
{
    if (lengthBaseObj == null)
    {
        return null;
    }

    // The length is written directly into the stream dictionary.
    if (lengthBaseObj is ICosNumber directLength)
    {
        return directLength;
    }

    if (!(lengthBaseObj is CosObject lengthObj))
    {
        throw new InvalidOperationException($"Wrong type of length object: {lengthBaseObj.GetType().Name}");
    }

    // The length lives in another object.
    var referenced = lengthObj.GetObject();

    if (referenced is ICosNumber loadedLength)
    {
        return loadedLength;
    }

    if (referenced != null)
    {
        throw new InvalidOperationException("Wrong type of referenced length object " + lengthObj
                                            + ": " + lengthObj.GetObject().GetType().Name);
    }

    // The referenced object has not been loaded yet, so it has to be parsed now.
    if (parser == null)
    {
        throw new InvalidOperationException("This method required access to the PDF object parser but it was not created yet. Figure out how to fix this.");
    }

    // Parsing moves the reader; remember where the stream data starts and come back afterwards.
    var resumeAt = source.GetPosition();

    var parsedLength = parser.Parse(lengthObj.ToIndirectReference(), source, isLenientParsing);

    source.Seek(resumeAt);

    return parsedLength is ICosNumber resolvedLength
        ? resolvedLength
        : throw new InvalidOperationException("Length object content was not read.");
}
|
||||
|
||||
/// <summary>
/// Copies exactly the number of bytes given by <paramref name="streamLengthObj"/> from
/// <paramref name="reader"/> to <paramref name="output"/>, one buffer-sized chunk at a time.
/// </summary>
/// <exception cref="InvalidOperationException">The reader returns no data before the length is reached.</exception>
private void ReadValidStream(IRandomAccessRead reader, BinaryWriter output, ICosNumber streamLengthObj)
{
    long outstanding = streamLengthObj.AsLong();

    while (outstanding > 0)
    {
        int wanted = (int)Math.Min(outstanding, STREAMCOPYBUFLEN);
        int received = reader.Read(streamCopyBuf, 0, wanted);

        if (received <= 0)
        {
            // Not expected: the caller validates the stream length before copying.
            throw new InvalidOperationException(
                $"read error at offset {reader.GetPosition()}: expected {wanted} bytes, but read() returns {received}");
        }

        output.Write(streamCopyBuf, 0, received);
        outstanding -= received;
    }
}
|
||||
|
||||
/// <summary>
/// Consumes the end-of-line marker that follows the 'stream' keyword, tolerating leading spaces.
/// </summary>
/// <remarks>
/// PDF Ref 3.2.7: the keyword must be followed by CRLF or LF and nothing else. Real files break this rule
/// (brother_scan_cover.pdf inserts spaces before the data; others use a lone CR), so any spaces are skipped
/// and then an optional CR LF, lone CR or lone LF is consumed. Any other byte is pushed back to the reader.
/// </remarks>
/// <param name="reader">The input positioned directly after the 'stream' keyword.</param>
protected void skipWhiteSpaces(IRandomAccessRead reader)
{
    // Skip every space and stop on the first byte that is something else.
    int next;
    do
    {
        next = reader.Read();
    } while (next == ' ');

    if (next == ReadHelper.AsciiLineFeed)
    {
        return;
    }

    if (next == ReadHelper.AsciiCarriageReturn)
    {
        next = reader.Read();

        if (next == ReadHelper.AsciiLineFeed)
        {
            return;
        }

        // A lone CR is invalid per the spec but occurs in the wild; the byte after it belongs to the data.
    }

    // Either the byte after a lone CR, or a byte that is no line ending at all:
    // parse leniently and hand it back so it is treated as stream data.
    reader.Unread(next);
}
|
||||
|
||||
/// <summary>
/// Checks whether the declared stream length is plausible: the data must end inside the file and,
/// after any spaces, be followed by the 'endstream' keyword.
/// </summary>
/// <param name="source">The input positioned at the first byte of stream data; the position is unchanged on return.</param>
/// <param name="streamLength">The length declared for the stream.</param>
/// <param name="fileLength">The total length of the input.</param>
/// <returns>True when 'endstream' is found at the expected position, otherwise false.</returns>
private bool validateStreamLength(IRandomAccessRead source, long streamLength, long fileLength)
{
    long originOffset = source.GetPosition();

    // The length comes from the file and cannot be trusted. A negative value would seek backwards, and a huge
    // value would overflow 'originOffset + streamLength' and wrap around to a small or negative offset.
    // Comparing against the remaining bytes avoids the addition altogether.
    if (streamLength < 0 || streamLength > fileLength - originOffset)
    {
        return false;
    }

    source.Seek(originOffset + streamLength);
    ReadHelper.SkipSpaces(source);

    bool endsAtKeyword = ReadHelper.IsString(source, "endstream");

    // Only probing: put the reader back at the start of the stream data.
    source.Seek(originOffset);

    return endsAtKeyword;
}
|
||||
|
||||
/// <summary>
/// Fallback used when the stream length is unusable: copies bytes from the source to the output until the
/// 'endstream' keyword (or 'endobj' when 'endstream' is missing) is found.
/// </summary>
/// <remarks>
/// The keyword and everything buffered after it are handed back to the source via Rewind. Bytes directly
/// before the keyword, including any line ending, are written as content. The output is not flushed here.
/// </remarks>
/// <param name="source">The input positioned at the first byte of stream data.</param>
/// <param name="output">Receives the bytes found before the terminating keyword.</param>
private void ReadUntilEndStream(IRandomAccessRead source, BinaryWriter output)
{
    // Index of the last character of the shortest keyword ('endobj').
    const int lookAhead = 5;

    byte[] keyword = ENDSTREAM;
    int matched = 0;
    int filled;

    // Each pass reads the next chunk behind the keyword prefix carried over from the previous pass.
    while ((filled = source.Read(strmBuf, matched, STRMBUFLEN - matched)) > 0)
    {
        filled += matched;

        int position = matched;
        int lookAheadLimit = filled - lookAhead;

        while (position < filled)
        {
            // Shortcut inspired by Boyer-Moore: while no match is in progress, look at the byte the shortest
            // keyword would end on. If it cannot belong to either keyword, no keyword can cover it, so
            // scanning resumes right behind it.
            int probe = position + lookAhead;
            if (matched == 0 && probe < lookAheadLimit)
            {
                byte probed = strmBuf[probe];
                if (probed < 'a' || probed > 't')
                {
                    position = probe + 1;
                    continue;
                }
            }

            byte current = strmBuf[position];

            if (current == keyword[matched])
            {
                matched++;
                if (matched == keyword.Length)
                {
                    // Complete keyword found; step past its last character and stop scanning.
                    position++;
                    break;
                }
            }
            else if (matched == 3 && current == ENDOBJ[matched])
            {
                // 'end' was matched and 'o' follows: 'endstream' may be missing, so try 'endobj'.
                keyword = ENDOBJ;
                matched++;
            }
            else
            {
                // Mismatch. The current byte may already start a new match: 'e' is the first keyword
                // character, and 'n' after seven matched characters follows the 'e' of 'endstre',
                // giving 'en'. Anything else starts from scratch.
                if (current == 'e')
                {
                    matched = 1;
                }
                else if (current == 'n' && matched == 7)
                {
                    matched = 2;
                }
                else
                {
                    matched = 0;
                }

                // Always search for 'endstream' again.
                keyword = ENDSTREAM;
            }

            position++;
        }

        // Everything in front of the (partially) matched keyword is stream content.
        int contentLength = Math.Max(0, position - matched);
        if (contentLength > 0)
        {
            output.Write(strmBuf, 0, contentLength);
        }

        if (matched == keyword.Length)
        {
            // Give the keyword and all buffered bytes behind it back to the source.
            source.Rewind(filled - contentLength);
            break;
        }

        // Keep the partial match at the front of the buffer for the next pass.
        Array.Copy(keyword, 0, strmBuf, 0, matched);
    }
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,37 +1,11 @@
|
||||
namespace UglyToad.PdfPig.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal static class DirectObjectFinder
|
||||
{
|
||||
/// <summary>
/// Parses the object referenced by <paramref name="baseObject"/> and follows further references until a
/// value of type <typeparamref name="T"/> is found.
/// </summary>
/// <remarks>
/// A parsed result that is itself a <see cref="CosObject"/>, or an array whose single element is a
/// <see cref="CosObject"/>, is resolved recursively.
/// </remarks>
/// <typeparam name="T">The type of object expected at the end of the reference chain.</typeparam>
/// <param name="baseObject">The object whose indirect reference is resolved.</param>
/// <param name="parser">Parses the referenced object.</param>
/// <param name="reader">The input passed to the parser.</param>
/// <param name="isLenientParsing">Passed through to the parser.</param>
/// <returns>The resolved object.</returns>
/// <exception cref="InvalidOperationException">The chain ends in something that is not a <typeparamref name="T"/>.</exception>
public static T Find<T>(CosObject baseObject, IPdfObjectParser parser, IRandomAccessRead reader,
    bool isLenientParsing) where T : CosBase
{
    // Cases are tried top to bottom, so a direct hit on T wins over following another reference.
    switch (parser.Parse(baseObject.ToIndirectReference(), reader, isLenientParsing))
    {
        case T match:
            return match;

        case CosObject nested:
            return Find<T>(nested, parser, reader, isLenientParsing);

        case COSArray wrapper when wrapper.Count == 1 && wrapper.get(0) is CosObject onlyEntry:
            return Find<T>(onlyEntry, parser, reader, isLenientParsing);

        default:
            throw new InvalidOperationException($"Could not find the object {baseObject.ToIndirectReference()} with type {typeof(T).Name}.");
    }
}
|
||||
|
||||
public static T Get<T>(IToken token, IPdfTokenScanner scanner) where T : IToken
|
||||
{
|
||||
if (token is T result)
|
||||
|
@@ -3,7 +3,6 @@
|
||||
using Filters;
|
||||
using Fonts.Parser;
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Parser.FileStructure;
|
||||
using Parser.Parts;
|
||||
using Parser.Parts.CrossReference;
|
||||
@@ -34,10 +33,8 @@
|
||||
var nameParser = new CosNameParser();
|
||||
var dictionaryParser = new CosDictionaryParser(nameParser, logger);
|
||||
var baseParser = new CosBaseParser(nameParser, new CosStringParser(), dictionaryParser, new CosArrayParser());
|
||||
var streamParser = new CosStreamParser(logger);
|
||||
var filterProvider = new MemoryFilterProvider(new DecodeParameterResolver(logger), new PngPredictor(), logger);
|
||||
var crossReferenceParser = new CrossReferenceStreamParser(filterProvider);
|
||||
var objectStreamParser = new ObjectStreamParser(logger, filterProvider, baseParser);
|
||||
|
||||
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, crossReferenceParser, new CrossReferenceTableParser());
|
||||
|
||||
@@ -50,10 +47,8 @@
|
||||
container.Register(nameParser);
|
||||
container.Register(dictionaryParser);
|
||||
container.Register(baseParser);
|
||||
container.Register(streamParser);
|
||||
container.Register(crossReferenceParser);
|
||||
container.Register(crossReferenceTableParser);
|
||||
container.Register(objectStreamParser);
|
||||
container.Register(filterProvider);
|
||||
container.Register(cmapParser);
|
||||
container.Register(afmParser);
|
||||
|
Reference in New Issue
Block a user