finish the migration

This commit is contained in:
Eliot Jones
2018-01-20 20:20:40 +00:00
parent 7d90f4858a
commit da7d83d863
9 changed files with 1 additions and 1121 deletions

View File

@@ -2,7 +2,6 @@
{
using System;
using Content;
using ContentStream;
using Exceptions;
using IO;
using Parts;

View File

@@ -1,236 +0,0 @@
namespace UglyToad.PdfPig.Parser
{
using System;
using System.Collections.Generic;
using System.Linq;
using ContentStream;
using Cos;
using IO;
using Logging;
using Parts;
using Util;
internal class DynamicParser
{
private readonly ILog log;
private readonly CosBaseParser baseParser;
private readonly CosStreamParser streamParser;
private readonly ObjectStreamParser objectStreamParser;
/// <summary>
/// Creates a parser which resolves indirect objects on demand.
/// </summary>
/// <param name="log">Receives warnings and errors raised while parsing. Stored as given, without a null check.</param>
/// <param name="baseParser">Parses the object that follows an object header.</param>
/// <param name="streamParser">Parses stream content that follows a stream dictionary.</param>
/// <param name="objectStreamParser">Extracts the objects contained in an object stream.</param>
/// <exception cref="ArgumentNullException">One of the three parsers is null.</exception>
public DynamicParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, ObjectStreamParser objectStreamParser)
{
    if (baseParser == null)
    {
        throw new ArgumentNullException(nameof(baseParser));
    }

    if (streamParser == null)
    {
        throw new ArgumentNullException(nameof(streamParser));
    }

    if (objectStreamParser == null)
    {
        throw new ArgumentNullException(nameof(objectStreamParser));
    }

    this.log = log;
    this.baseParser = baseParser;
    this.streamParser = streamParser;
    this.objectStreamParser = objectStreamParser;
}
/// <summary>
/// Resolves <paramref name="obj"/> using the reader, object pool, cross-reference table, brute-force searcher
/// and leniency flag carried by <paramref name="arguments"/>.
/// </summary>
/// <param name="arguments">Bundle of parsing state; must not be null.</param>
/// <param name="obj">The indirect object to resolve.</param>
/// <param name="requiresExistingObject">Forwarded unchanged to the full overload.</param>
/// <returns>Whatever the full overload returns for the object.</returns>
public CosBase Parse(ParsingArguments arguments, CosObject obj, bool requiresExistingObject)
{
    var reader = arguments.Reader;
    var pool = arguments.CachingProviders.ObjectPool;
    var table = arguments.CrossReferenceTable;
    var searcher = arguments.CachingProviders.BruteForceSearcher;
    var lenient = arguments.IsLenientParsing;

    return Parse(reader, obj, pool, table, searcher, lenient, requiresExistingObject);
}
/// <summary>
/// Resolves <paramref name="obj"/> by its object and generation numbers.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="obj"/> is null.</exception>
public CosBase Parse(IRandomAccessRead reader, CosObject obj, CosObjectPool pool,
    CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenient, bool requireExistingObject)
{
    if (obj == null)
    {
        throw new ArgumentNullException(nameof(obj));
    }

    var number = obj.GetObjectNumber();
    var generation = obj.GetGenerationNumber();

    return Parse(reader, number, generation, pool, crossReferenceTable, bruteForceSearcher, isLenient, requireExistingObject);
}
/// <summary>
/// Resolves the object with the given number and generation.
/// </summary>
/// <remarks>
/// An object already held by <paramref name="pool"/> is returned as is. Otherwise its location is looked up in
/// <paramref name="crossReferenceTable"/>; in lenient mode a missing entry is searched for with
/// <paramref name="bruteForceSearcher"/> and, if found, written back to the table. A positive location is read
/// directly from the file; a non-positive one is negated and treated as the number of the containing object stream.
/// </remarks>
/// <returns>The resolved object, or <c>CosNull.Null</c> when no location could be found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pool"/> is null, or <paramref name="crossReferenceTable"/> is null and the object is not cached.</exception>
/// <exception cref="InvalidOperationException"><paramref name="requireExistingObject"/> is true and the table has no positive offset for the object.</exception>
public CosBase Parse(IRandomAccessRead reader, long objectNumber, int objectGeneration,
    CosObjectPool pool, CrossReferenceTable crossReferenceTable,
    BruteForceSearcher bruteForceSearcher,
    bool isLenient,
    bool requireExistingObject)
{
    if (pool == null)
    {
        throw new ArgumentNullException(nameof(pool));
    }

    var key = new IndirectReference(objectNumber, objectGeneration);

    var cached = pool.GetOrCreateDefault(key).GetObject();
    if (cached != null)
    {
        return cached;
    }

    if (crossReferenceTable == null)
    {
        throw new ArgumentNullException(nameof(crossReferenceTable));
    }

    var location = TryGet(key, crossReferenceTable.ObjectOffsets);

    if (requireExistingObject && (!location.HasValue || location.Value <= 0))
    {
        throw new InvalidOperationException("Object must be defined and not compressed: " + key);
    }

    if (isLenient && !location.HasValue)
    {
        location = TryGet(key, bruteForceSearcher.GetObjectLocations());

        if (location.HasValue)
        {
            crossReferenceTable.UpdateOffset(key, location.Value);
        }
    }

    if (!location.HasValue)
    {
        return CosNull.Null;
    }

    var position = location.Value;

    if (position > 0)
    {
        return ParseObjectFromFile(position, reader, key, pool, isLenient);
    }

    return ParseCompressedStreamObject(reader, -position, objectNumber, pool, crossReferenceTable, bruteForceSearcher, isLenient);
}
/// <summary>
/// Seeks to <paramref name="offset"/>, checks that the object header found there matches <paramref name="key"/>
/// and parses the object, including its stream when the "stream" keyword follows.
/// </summary>
/// <param name="offset">Position of the object header in <paramref name="reader"/>.</param>
/// <param name="reader">Source to read from; its position is changed.</param>
/// <param name="key">The reference the header must match.</param>
/// <param name="pool">Passed to the base parser.</param>
/// <param name="isLenientParsing">When true a missing "endobj" is logged instead of throwing.</param>
/// <returns>The parsed object.</returns>
/// <exception cref="InvalidOperationException">
/// The header does not match <paramref name="key"/>, or the object is not terminated by "endobj" and
/// <paramref name="isLenientParsing"/> is false.
/// </exception>
private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
    IndirectReference key,
    CosObjectPool pool,
    bool isLenientParsing)
{
    reader.Seek(offset);

    var objectNumber = ObjectHelper.ReadObjectNumber(reader);
    var objectGeneration = ObjectHelper.ReadGenerationNumber(reader);

    ReadHelper.ReadExpectedString(reader, "obj", true);

    if (objectNumber != key.ObjectNumber || objectGeneration != key.Generation)
    {
        throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}");
    }

    ReadHelper.SkipSpaces(reader);

    var baseObject = baseParser.Parse(reader, pool);

    var endObjectKey = ReadHelper.ReadString(reader);

    var atStreamStart = string.Equals(endObjectKey, "stream");

    if (atStreamStart)
    {
        // Step back over the keyword just read before handing over to the stream reader.
        var streamStartBytes = OtherEncodings.StringAsLatin1Bytes(endObjectKey);
        reader.Rewind(streamStartBytes.Length);

        baseObject = ReadNormalObjectStream(reader, baseObject, offset, isLenientParsing, out endObjectKey);
    }

    // Only complain when the terminator really is wrong; without this guard every
    // well-formed object produced a warning (lenient) or an exception (strict).
    if (!string.Equals(endObjectKey, "endobj"))
    {
        var message =
            $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{endObjectKey}\'";

        if (isLenientParsing)
        {
            log.Warn(message);
        }
        else
        {
            throw new InvalidOperationException(message);
        }
    }

    return baseObject;
}
/// <summary>
/// Parses the stream that follows the dictionary <paramref name="currentBase"/> and reads the token which
/// terminates the object.
/// </summary>
/// <param name="offset">Only used in the error message.</param>
/// <param name="endObjectKey">
/// The line read after the stream. If that line starts with a surplus "endstream", the text after it is used,
/// or the following line when nothing else is on that line.
/// </param>
/// <returns>The parsed stream.</returns>
/// <exception cref="InvalidOperationException"><paramref name="currentBase"/> is not a dictionary.</exception>
private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset,
    bool isLenientParsing,
    out string endObjectKey)
{
    // A stream object is a dictionary followed by stream/endstream; anything else is illegal.
    if (!(currentBase is PdfDictionary dictionary))
    {
        throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
    }

    PdfRawStream parsedStream = streamParser.Parse(reader, dictionary, isLenientParsing, null);

    ReadHelper.SkipSpaces(reader);

    string line = ReadHelper.ReadLine(reader);

    // Some files carry a second 'endstream' before 'endobj'.
    var hasSurplusEndStream = !line.StartsWith("endobj") && line.StartsWith("endstream");
    if (hasSurplusEndStream)
    {
        var remainder = line.Substring(9).Trim();

        // Nothing else on the surplus line: the terminator is on the next one.
        line = remainder.Length == 0 ? ReadHelper.ReadLine(reader) : remainder;
    }

    endObjectKey = line;

    return parsedStream;
}
/// <summary>
/// Loads the object stream numbered <paramref name="streamObjectNumber"/>, stores its contents in
/// <paramref name="objectPool"/> and returns the entry numbered <paramref name="requestedNumber"/>.
/// </summary>
/// <returns>
/// The matching entry, or <c>CosNull.Null</c> when the container is not a stream or does not hold the requested object.
/// </returns>
private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, CosObjectPool objectPool, CrossReferenceTable crossReferenceTable, BruteForceSearcher bruteForceSearcher, bool isLenientParsing)
{
    var container = Parse(reader, streamObjectNumber, 0, objectPool, crossReferenceTable, bruteForceSearcher,
        isLenientParsing, true);

    var stream = container as PdfRawStream;
    if (stream == null)
    {
        log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}");
        return CosNull.Null;
    }

    var objects = objectStreamParser.Parse(stream, objectPool);

    // Only register entries which the cross-reference table says live in this object stream.
    foreach (var entry in objects)
    {
        var entryKey = new IndirectReference(entry.GetObjectNumber(), entry.GetGenerationNumber());
        var recorded = TryGet(entryKey, crossReferenceTable.ObjectOffsets);

        if (recorded.HasValue && recorded.Value == -streamObjectNumber)
        {
            objectPool.Get(entryKey).SetObject(entry.GetObject());
        }
    }

    foreach (var candidate in objects)
    {
        if (candidate.GetObjectNumber() == requestedNumber)
        {
            return candidate;
        }
    }

    log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull.");

    return CosNull.Null;
}
/// <summary>
/// Looks up <paramref name="key"/> and returns its value, or null when the key is not present.
/// </summary>
private static T? TryGet<T, TKey>(TKey key, IReadOnlyDictionary<TKey, T> dictionary) where T : struct
{
    T found;
    if (dictionary.TryGetValue(key, out found))
    {
        return found;
    }

    return null;
}
}
}

View File

@@ -99,7 +99,7 @@
{
try
{
streamPart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
}
catch (InvalidOperationException ex)
{

View File

@@ -1,215 +0,0 @@
namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using ContentStream;
using ContentStream.TypedAccessors;
using Cos;
using IO;
using Logging;
using Parts;
using Parts.CrossReference;
using Util;
internal class OldCrossReferenceTableParser
{
private const string InUseEntry = "n";
private const string FreeEntry = "f";
private readonly ILog log;
private readonly IDictionaryParser dictionaryParser;
private readonly IBaseParser baseParser;
/// <summary>
/// Creates a parser for cross-reference tables introduced by the "xref" keyword.
/// </summary>
/// <param name="log">Receives warnings about malformed tables.</param>
/// <param name="dictionaryParser">Parses the trailer dictionary.</param>
/// <param name="baseParser">Handed to <paramref name="dictionaryParser"/> when the trailer is parsed.</param>
public OldCrossReferenceTableParser(ILog log, IDictionaryParser dictionaryParser, IBaseParser baseParser)
{
    this.baseParser = baseParser;
    this.dictionaryParser = dictionaryParser;
    this.log = log;
}
/// <summary>
/// Attempts to read a cross-reference table ("xref" keyword, one or more subsections, trailer dictionary)
/// starting at the current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">Reader positioned where the table is expected to start.</param>
/// <param name="offset">Stored on the resulting builder and used in diagnostics.</param>
/// <param name="isLenientParsing">
/// When true an unreadable subsection header ends the table instead of failing the parse, and the trailer
/// search tolerates extra numeric lines.
/// </param>
/// <param name="pool">Passed through to the trailer dictionary parser.</param>
/// <param name="builder">
/// The populated builder on success. Null when the input does not start with a non-empty table; may be
/// non-null but incomplete when false is returned because of an unreadable subsection header.
/// </param>
/// <returns>True when a table and its trailer were read; false when no usable table was found.</returns>
/// <exception cref="InvalidOperationException">
/// An entry has an unparsable or out-of-range offset/generation, an offset pointing inside the table itself,
/// an unknown entry type, or the trailer could not be read.
/// </exception>
public bool TryParse(IRandomAccessRead source, long offset, bool isLenientParsing, CosObjectPool pool, out CrossReferenceTablePartBuilder builder)
{
    builder = null;

    var tableStartOffset = source.GetPosition();

    if (source.Peek() != 'x')
    {
        return false;
    }

    var xref = ReadHelper.ReadString(source);
    if (!xref.Trim().Equals("xref"))
    {
        return false;
    }

    // Look at the token after "xref", then rewind so the subsection reader sees it again.
    var str = ReadHelper.ReadString(source);
    byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
    source.Rewind(b.Length);

    // Keyword comparison must not depend on the current culture.
    if (str.StartsWith("trailer", StringComparison.Ordinal))
    {
        log.Warn("skipping empty xref table");
        return false;
    }

    builder = new CrossReferenceTablePartBuilder
    {
        Offset = offset,
        XRefType = CrossReferenceType.Table
    };

    // Tables can have multiple sections. Each starts with a starting object id and a count.
    while (true)
    {
        if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
        {
            log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");

            if (isLenientParsing)
            {
                break;
            }

            return false;
        }

        var currentObjectId = subsectionDefinition.FirstNumber;

        ReadHelper.SkipSpaces(source);

        for (var i = 0; i < subsectionDefinition.Count; i++)
        {
            if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
            {
                break;
            }

            // A 't' here means the trailer started before the declared entry count was reached.
            if (source.Peek() == 't')
            {
                break;
            }

            var currentLine = ReadHelper.ReadLine(source);
            var splitString = currentLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (splitString.Length < 3)
            {
                log.Warn("invalid xref line: " + currentLine);
                break;
            }

            // This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
            if (splitString[splitString.Length - 1].Equals(InUseEntry))
            {
                try
                {
                    var objectOffset = long.Parse(splitString[0], System.Globalization.CultureInfo.InvariantCulture);

                    if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
                    {
                        // PDFBOX-3923: offset points inside this table - that can't be good
                        throw new InvalidOperationException(
                            $"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
                    }

                    var generation = int.Parse(splitString[1], System.Globalization.CultureInfo.InvariantCulture);
                    builder.Add(currentObjectId, generation, objectOffset);
                }
                catch (Exception e) when (e is FormatException || e is OverflowException)
                {
                    // Oversized numbers raise OverflowException rather than FormatException; report both
                    // the same way so callers only have to handle InvalidOperationException.
                    throw new InvalidOperationException(
                        $"Could not parse the offset or generation of the cross-reference table entry '{currentLine}' for object {currentObjectId}.", e);
                }
            }
            else if (!splitString[2].Equals(FreeEntry))
            {
                throw new InvalidOperationException(
                    $"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
            }

            currentObjectId++;

            ReadHelper.SkipSpaces(source);
        }

        ReadHelper.SkipSpaces(source);

        // Another subsection starts with a digit; anything else ends the table.
        if (!ReadHelper.IsDigit(source))
        {
            break;
        }
    }

    if (!TryParseTrailer(source, isLenientParsing, pool, out var trailer))
    {
        throw new InvalidOperationException($"Something went wrong trying to read the XREF table at {offset}.");
    }

    builder.Previous = trailer.GetLongOrDefault(CosName.PREV);

    return true;
}
/// <summary>
/// Reads the "trailer" keyword and the dictionary that follows it from the current position of <paramref name="source"/>.
/// </summary>
/// <param name="source">Reader positioned after the last cross-reference entry.</param>
/// <param name="isLenientParsing">When true, lines starting with a digit before the keyword are skipped (with one warning).</param>
/// <param name="pool">Passed to the dictionary parser.</param>
/// <param name="trailer">The parsed trailer dictionary, or null when false is returned.</param>
/// <returns>False when the "trailer" keyword is not found at the expected position.</returns>
private bool TryParseTrailer(IRandomAccessRead source, bool isLenientParsing, CosObjectPool pool, out PdfDictionary trailer)
{
    trailer = null;

    // parse the last trailer.
    var trailerOffset = source.GetPosition();

    // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
    if (isLenientParsing)
    {
        int nextCharacter = source.Peek();
        while (nextCharacter != 't' && ReadHelper.IsDigit(nextCharacter))
        {
            if (source.GetPosition() == trailerOffset)
            {
                // warn only the first time
                log.Warn($"Expected trailer object at position {trailerOffset}, keep trying");
            }

            ReadHelper.ReadLine(source);
            nextCharacter = source.Peek();
        }
    }

    if (source.Peek() != 't')
    {
        return false;
    }

    // read "trailer"
    long currentOffset = source.GetPosition();
    string nextLine = ReadHelper.ReadLine(source);

    if (!nextLine.Trim().Equals("trailer"))
    {
        // in some cases the EOL is missing and the trailer immediately
        // continues with "<<" or with a blank character
        // even if this does not comply with PDF reference we want to support as many PDFs as possible
        // Acrobat reader can also deal with this.
        if (nextLine.StartsWith("trailer", StringComparison.Ordinal))
        {
            // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes
            int len = "trailer".Length;
            // jump back right after "trailer"
            source.Seek(currentOffset + len);
        }
        else
        {
            return false;
        }
    }

    // in some cases the EOL is missing and the trailer continues with " <<"
    // even if this does not comply with PDF reference we want to support as many PDFs as possible
    // Acrobat reader can also deal with this.
    ReadHelper.SkipSpaces(source);

    trailer = dictionaryParser.Parse(source, baseParser, pool);

    ReadHelper.SkipSpaces(source);

    return true;
}
}
}

View File

@@ -1,223 +0,0 @@
namespace UglyToad.PdfPig.Parser
{
using System;
using System.Collections.Generic;
using System.Linq;
using ContentStream;
using Cos;
using IO;
using Logging;
using Parts;
using Util;
/// <summary>
/// Resolves an indirect reference to the object it identifies.
/// </summary>
internal interface IPdfObjectParser
{
/// <summary>
/// Parses the object identified by <paramref name="indirectReference"/> from <paramref name="reader"/>.
/// </summary>
/// <param name="indirectReference">Object number and generation of the object to resolve.</param>
/// <param name="reader">Source the object is read from.</param>
/// <param name="isLenientParsing">Whether malformed input should be tolerated. Defaults to true.</param>
/// <param name="requireExistingObject">
/// Defaults to false. NOTE(review): in PdfObjectParser, true means the object must have a positive offset in
/// the cross-reference table, otherwise an InvalidOperationException is thrown — confirm for other implementations.
/// </param>
/// <returns>The resolved object.</returns>
CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false);
}
internal class PdfObjectParser : IPdfObjectParser
{
private readonly ILog log;
private readonly CosBaseParser baseParser;
private readonly CosStreamParser streamParser;
private readonly CrossReferenceTable crossReferenceTable;
private readonly BruteForceSearcher bruteForceSearcher;
private readonly CosObjectPool objectPool;
private readonly ObjectStreamParser objectStreamParser;
/// <summary>
/// Creates an object parser bound to one document's cross-reference table and object pool.
/// </summary>
/// <param name="log">Receives warnings and errors; a <c>NoOpLog</c> is used when null.</param>
/// <exception cref="ArgumentNullException">Any argument other than <paramref name="log"/> is null.</exception>
public PdfObjectParser(ILog log, CosBaseParser baseParser, CosStreamParser streamParser, CrossReferenceTable crossReferenceTable,
    BruteForceSearcher bruteForceSearcher,
    CosObjectPool objectPool,
    ObjectStreamParser objectStreamParser)
{
    if (log == null)
    {
        log = new NoOpLog();
    }

    this.log = log;

    if (baseParser == null)
    {
        throw new ArgumentNullException(nameof(baseParser));
    }

    if (streamParser == null)
    {
        throw new ArgumentNullException(nameof(streamParser));
    }

    if (crossReferenceTable == null)
    {
        throw new ArgumentNullException(nameof(crossReferenceTable));
    }

    if (bruteForceSearcher == null)
    {
        throw new ArgumentNullException(nameof(bruteForceSearcher));
    }

    if (objectPool == null)
    {
        throw new ArgumentNullException(nameof(objectPool));
    }

    if (objectStreamParser == null)
    {
        throw new ArgumentNullException(nameof(objectStreamParser));
    }

    this.baseParser = baseParser;
    this.streamParser = streamParser;
    this.crossReferenceTable = crossReferenceTable;
    this.bruteForceSearcher = bruteForceSearcher;
    this.objectPool = objectPool;
    this.objectStreamParser = objectStreamParser;
}
/// <summary>
/// Resolves <paramref name="indirectReference"/> to its object.
/// </summary>
/// <remarks>
/// An object already in the pool is returned as is. Otherwise the location comes from the cross-reference
/// table, falling back to the brute-force searcher in lenient mode (a hit is written back to the table).
/// A positive location is parsed from the file; a non-positive one is negated and treated as the number of
/// the containing object stream.
/// </remarks>
/// <returns>The resolved object, or <c>CosNull.Null</c> when it cannot be located in lenient mode.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="requireExistingObject"/> is true and the table has no positive offset for the object, or the
/// object cannot be located and <paramref name="isLenientParsing"/> is false.
/// </exception>
public CosBase Parse(IndirectReference indirectReference, IRandomAccessRead reader, bool isLenientParsing = true, bool requireExistingObject = false)
{
    var key = new IndirectReference(indirectReference.ObjectNumber, indirectReference.Generation);

    var cached = objectPool.GetOrCreateDefault(key).GetObject();
    if (cached != null)
    {
        return cached;
    }

    var location = TryGet(key, crossReferenceTable.ObjectOffsets);

    if (requireExistingObject && (!location.HasValue || location.Value <= 0))
    {
        throw new InvalidOperationException("Object must be defined and not compressed: " + key);
    }

    if (isLenientParsing && !location.HasValue)
    {
        location = TryGet(key, bruteForceSearcher.GetObjectLocations());

        if (location.HasValue)
        {
            crossReferenceTable.UpdateOffset(key, location.Value);
        }
    }

    if (!location.HasValue)
    {
        if (!isLenientParsing)
        {
            throw new InvalidOperationException($"Could not locate the object {key.ObjectNumber} which was not found in the cross reference table.");
        }

        return CosNull.Null;
    }

    var position = location.Value;

    if (position <= 0)
    {
        return ParseCompressedStreamObject(reader, -position, indirectReference.ObjectNumber, isLenientParsing);
    }

    return ParseObjectFromFile(position, reader, key, objectPool, isLenientParsing);
}
/// <summary>
/// Seeks to <paramref name="offset"/>, verifies the object header against <paramref name="key"/> and parses the
/// object, including its stream when the "stream" keyword follows.
/// </summary>
/// <returns>The parsed object.</returns>
/// <exception cref="InvalidOperationException">
/// The header does not match <paramref name="key"/>, or the object does not end with "endobj" and
/// <paramref name="isLenientParsing"/> is false.
/// </exception>
private CosBase ParseObjectFromFile(long offset, IRandomAccessRead reader,
    IndirectReference key,
    CosObjectPool pool,
    bool isLenientParsing)
{
    reader.Seek(offset);

    var objectNumber = ObjectHelper.ReadObjectNumber(reader);
    var objectGeneration = ObjectHelper.ReadGenerationNumber(reader);

    ReadHelper.ReadExpectedString(reader, "obj", true);

    var headerMatchesKey = objectNumber == key.ObjectNumber && objectGeneration == key.Generation;
    if (!headerMatchesKey)
    {
        throw new InvalidOperationException($"Xref for {key} points to object {objectNumber} {objectGeneration} at {offset}");
    }

    ReadHelper.SkipSpaces(reader);

    var baseObject = baseParser.Parse(reader, pool);

    string endObjectKey = ReadHelper.ReadString(reader);

    if (string.Equals(endObjectKey, "stream"))
    {
        // Step back over the keyword just read before handing over to the stream reader.
        reader.Rewind(OtherEncodings.StringAsLatin1Bytes(endObjectKey).Length);

        baseObject = ReadNormalObjectStream(reader, baseObject, offset, isLenientParsing, out endObjectKey);
    }

    if (string.Equals(endObjectKey, "endobj"))
    {
        return baseObject;
    }

    var message =
        $"Object ({objectNumber}:{objectGeneration}) at offset {offset} does not end with \'endobj\' but with \'{endObjectKey}\'";

    if (!isLenientParsing)
    {
        throw new InvalidOperationException(message);
    }

    log.Warn(message);

    return baseObject;
}
/// <summary>
/// Parses the stream that follows the dictionary <paramref name="currentBase"/> and reads the token which
/// terminates the object.
/// </summary>
/// <param name="offset">Only used in the error message.</param>
/// <param name="endObjectKey">
/// The line read after the stream. If that line starts with a surplus "endstream", the text after it is used,
/// or the following line when nothing else is on that line.
/// </param>
/// <returns>The parsed stream.</returns>
/// <exception cref="InvalidOperationException"><paramref name="currentBase"/> is not a dictionary.</exception>
private CosBase ReadNormalObjectStream(IRandomAccessRead reader, CosBase currentBase, long offset,
    bool isLenientParsing,
    out string endObjectKey)
{
    var dictionary = currentBase as PdfDictionary;
    if (dictionary == null)
    {
        // Not legal: only a dictionary combined with stream/endstream forms a stream object.
        throw new InvalidOperationException($"Stream not preceded by dictionary (offset: {offset}).");
    }

    PdfRawStream parsedStream = streamParser.Parse(reader, dictionary, isLenientParsing, this);

    ReadHelper.SkipSpaces(reader);

    string terminator = ReadHelper.ReadLine(reader);

    // Some files carry a second 'endstream' before 'endobj'.
    if (!terminator.StartsWith("endobj") && terminator.StartsWith("endstream"))
    {
        terminator = terminator.Substring(9).Trim();

        if (terminator.Length == 0)
        {
            // Nothing else on the surplus line: the terminator is on the next one.
            terminator = ReadHelper.ReadLine(reader);
        }
    }

    endObjectKey = terminator;

    return parsedStream;
}
/// <summary>
/// Loads the object stream numbered <paramref name="streamObjectNumber"/>, stores its contents in the object
/// pool and returns the entry numbered <paramref name="requestedNumber"/>.
/// </summary>
/// <returns>
/// The matching entry, or <c>CosNull.Null</c> when the container is not a stream or does not hold the requested object.
/// </returns>
private CosBase ParseCompressedStreamObject(IRandomAccessRead reader, long streamObjectNumber, long requestedNumber, bool isLenientParsing)
{
    var container = Parse(new IndirectReference(streamObjectNumber, 0), reader, isLenientParsing, true);

    var stream = container as PdfRawStream;
    if (stream == null)
    {
        log.Warn($"Could not find a stream for the object number, defaults to returning CosNull: {streamObjectNumber}");
        return CosNull.Null;
    }

    var objects = objectStreamParser.Parse(stream, objectPool);

    // Only register entries which the cross-reference table says live in this object stream.
    foreach (var entry in objects)
    {
        var entryKey = new IndirectReference(entry.GetObjectNumber(), entry.GetGenerationNumber());
        var recorded = TryGet(entryKey, crossReferenceTable.ObjectOffsets);

        if (!recorded.HasValue || recorded.Value != -streamObjectNumber)
        {
            continue;
        }

        objectPool.Get(entryKey).SetObject(entry.GetObject());
    }

    foreach (var candidate in objects)
    {
        if (candidate.GetObjectNumber() == requestedNumber)
        {
            return candidate;
        }
    }

    log.Error($"Could not find the object {requestedNumber} in the stream for object {streamObjectNumber}. Returning CosNull.");

    return CosNull.Null;
}
/// <summary>
/// Returns the value stored under <paramref name="key"/>, or null when the dictionary has no such key.
/// </summary>
private static T? TryGet<T, TKey>(TKey key, IReadOnlyDictionary<TKey, T> dictionary) where T : struct
{
    if (!dictionary.TryGetValue(key, out var value))
    {
        return null;
    }

    return value;
}
}
}

View File

@@ -1,82 +0,0 @@
namespace UglyToad.PdfPig.Parser
{
using System;
using System.Collections.Generic;
using ContentStream;
using ContentStream.TypedAccessors;
using Cos;
using Filters;
using IO;
using Logging;
using Parts;
internal class ObjectStreamParser
{
private readonly ILog log;
private readonly IFilterProvider filterProvider;
private readonly CosBaseParser baseParser;
/// <summary>
/// Creates a parser for object streams.
/// </summary>
/// <param name="log">Receives an error when a stream holds more objects than it declares.</param>
/// <param name="filterProvider">Used to decode the stream bytes.</param>
/// <param name="baseParser">Parses each object contained in the decoded stream.</param>
public ObjectStreamParser(ILog log, IFilterProvider filterProvider, CosBaseParser baseParser)
{
    this.baseParser = baseParser;
    this.filterProvider = filterProvider;
    this.log = log;
}
/// <summary>
/// Decodes an object stream and returns the objects it contains, numbered according to the stream header.
/// </summary>
/// <param name="stream">The object stream; its dictionary entry N gives the number of contained objects.</param>
/// <param name="pool">Passed to the base parser for each contained object.</param>
/// <returns>
/// One <c>CosObject</c> per parsed object with generation 0 and the object number read from the header.
/// Parsing stops with a logged error if more objects are present than the header lists.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public IReadOnlyList<CosObject> Parse(PdfRawStream stream, CosObjectPool pool)
{
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }

    var declaredCount = stream.Dictionary.GetIntOrDefault(CosName.N);

    // A corrupt file may declare a negative count; a negative list capacity would throw
    // ArgumentOutOfRangeException, so treat it as "no header entries".
    var numberOfObjects = Math.Max(0, declaredCount);

    var objectNumbers = new List<long>(numberOfObjects);
    var streamObjects = new List<CosObject>(numberOfObjects);

    var bytes = stream.Decode(filterProvider);
    var reader = new RandomAccessBuffer(bytes);

    // The header comes first: one "object-number offset" pair per contained object.
    for (var i = 0; i < numberOfObjects; i++)
    {
        long objectNumber = ObjectHelper.ReadObjectNumber(reader);

        // skip offset
        ReadHelper.ReadLong(reader);

        objectNumbers.Add(objectNumber);
    }

    CosBase cosObject;
    var objectCounter = 0;
    while ((cosObject = baseParser.Parse(reader, pool)) != null)
    {
        if (objectCounter >= objectNumbers.Count)
        {
            log.Error("/ObjStm (object stream) has more objects than /N " + declaredCount);
            break;
        }

        var obj = new CosObject(cosObject);
        obj.SetGenerationNumber(0);
        obj.SetObjectNumber(objectNumbers[objectCounter]);

        streamObjects.Add(obj);

        // According to the spec objects within an object stream shall not be enclosed
        // by obj/endobj tags, but there are some pdfs in the wild using those tags
        // skip endobject marker if present
        if (!reader.IsEof() && reader.Peek() == 'e')
        {
            ReadHelper.ReadLine(reader);
        }

        objectCounter++;
    }

    return streamObjects;
}
}
}

View File

@@ -1,332 +0,0 @@
namespace UglyToad.PdfPig.Parser.Parts
{
using System;
using System.IO;
using ContentStream;
using Cos;
using IO;
using Logging;
using Util;
internal class CosStreamParser
{
private static readonly int STREAMCOPYBUFLEN = 8192;
private static readonly int STRMBUFLEN = 2048;
// Keyword bytes searched for by ReadUntilEndStream. They must be lowercase: the scan skips bytes outside
// 'a'..'t' and restarts on 'e'/'n', and the rest of this class matches the lowercase strings
// "endstream" and "endobj". Uppercase tables can never match.
private static readonly byte[] ENDOBJ =
{
    (byte) 'e', (byte) 'n', (byte) 'd',
    (byte) 'o', (byte) 'b', (byte) 'j'
};
private static readonly byte[] ENDSTREAM =
{
    (byte) 'e', (byte) 'n', (byte) 'd',
    (byte) 's', (byte) 't', (byte) 'r', (byte) 'e', (byte) 'a', (byte) 'm'
};
private readonly ILog log;
private readonly byte[] streamCopyBuf = new byte[STREAMCOPYBUFLEN];
private readonly byte[] strmBuf = new byte[STRMBUFLEN];
/// <summary>
/// Creates a stream parser which reports recoverable problems to <paramref name="log"/>.
/// </summary>
public CosStreamParser(ILog log) => this.log = log;
/// <summary>
/// Reads a stream body, starting at the "stream" keyword, and pairs the raw bytes with <paramref name="streamDictionary"/>.
/// </summary>
/// <param name="reader">Positioned at the "stream" keyword.</param>
/// <param name="streamDictionary">The dictionary preceding the stream; supplies the length entry.</param>
/// <param name="isLenientParsing">When true a missing length and some malformed end markers are logged instead of throwing.</param>
/// <param name="parser">Used to resolve a length given as an unresolved indirect object.</param>
/// <returns>The raw, still encoded stream.</returns>
/// <exception cref="InvalidOperationException">The stream is not terminated by an accepted end marker.</exception>
public PdfRawStream Parse(IRandomAccessRead reader, PdfDictionary streamDictionary, bool isLenientParsing, IPdfObjectParser parser)
{
    // Consume the 'stream' keyword (the caller has already seen it) and the line break after it.
    ReadHelper.ReadExpectedString(reader, "stream");
    skipWhiteSpaces(reader);

    // The length entry is fetched as stored, because while parsing the underlying object might still be null.
    var lengthEntry = streamDictionary.GetItemOrDefault(CosName.LENGTH);
    var streamType = streamDictionary.GetName(CosName.TYPE);
    var streamLength = GetLength(reader, lengthEntry, streamType, isLenientParsing, parser);

    ValidateStreamLength(reader, isLenientParsing, streamLength);

    PdfRawStream result;

    using (var stream = new MemoryStream())
    using (var writer = new BinaryWriter(stream))
    {
        var canTrustLength = streamLength != null && validateStreamLength(reader, streamLength.AsLong(), reader.Length());

        if (!canTrustLength)
        {
            ReadUntilEndStream(reader, writer);
        }
        else
        {
            ReadValidStream(reader, writer, streamLength);
        }

        result = new PdfRawStream(stream.ToArray(), streamDictionary);
    }

    string endStream = ReadHelper.ReadString(reader);

    if (endStream.Equals("endstream"))
    {
        return result;
    }

    if (endStream.Equals("endobj") && isLenientParsing)
    {
        log.Warn($"stream ends with \'endobj\' instead of \'endstream\' at offset {reader.GetPosition()}");

        // avoid follow-up warning about missing endobj
        reader.Rewind("endobj".Length);

        return result;
    }

    if (endStream.Length > 9 && isLenientParsing && endStream.StartsWith("endstream", StringComparison.Ordinal))
    {
        log.Warn("stream ends with '" + endStream + "' instead of 'endstream' at offset " + reader.GetPosition());

        // unread the "extra" bytes
        reader.Rewind(OtherEncodings.StringAsLatin1Bytes(endStream.Substring(9)).Length);

        return result;
    }

    throw new InvalidOperationException("Error reading stream, expected='endstream' actual='"
                                        + endStream + "' at offset " + reader.GetPosition());
}
/// <summary>
/// Reacts to a missing stream length: logs a warning in lenient mode, throws otherwise.
/// Does nothing when <paramref name="streamLength"/> is present.
/// </summary>
/// <exception cref="InvalidOperationException">The length is missing and <paramref name="isLenientParsing"/> is false.</exception>
private void ValidateStreamLength(IRandomAccessRead reader, bool isLenientParsing, ICosNumber streamLength)
{
    var hasLength = streamLength != null;
    if (hasLength)
    {
        return;
    }

    if (!isLenientParsing)
    {
        throw new InvalidOperationException("Missing length for stream.");
    }

    log.Warn("The stream doesn't provide any stream length, using fallback readUntilEnd, at offset " +
             reader.GetPosition());
}
/// <summary>
/// Determines the stream length from the dictionary's length entry, resolving it through
/// <paramref name="parser"/> when it is an indirect object which has not been loaded yet.
/// </summary>
/// <param name="source">Reader; its position is restored after resolving an indirect length.</param>
/// <param name="lengthBaseObj">The length entry as stored in the dictionary; may be null.</param>
/// <param name="streamType">Not used by this method.</param>
/// <returns>The length, or null when <paramref name="lengthBaseObj"/> is null.</returns>
/// <exception cref="InvalidOperationException">
/// The entry, or the object it refers to, is not a number, or an unresolved reference is met while
/// <paramref name="parser"/> is null.
/// </exception>
private ICosNumber GetLength(IRandomAccessRead source, CosBase lengthBaseObj, CosName streamType, bool isLenientParsing, IPdfObjectParser parser)
{
    if (lengthBaseObj == null)
    {
        return null;
    }

    // Length is given directly in the stream dictionary
    if (lengthBaseObj is ICosNumber number)
    {
        return number;
    }

    if (!(lengthBaseObj is CosObject lengthObj))
    {
        throw new InvalidOperationException($"Wrong type of length object: {lengthBaseObj.GetType().Name}");
    }

    // Length is held by a referenced object
    var currentObject = lengthObj.GetObject();

    if (currentObject != null)
    {
        if (currentObject is ICosNumber objectNumber)
        {
            return objectNumber;
        }

        throw new InvalidOperationException("Wrong type of referenced length object " + lengthObj
                                            + ": " + lengthObj.GetObject().GetType().Name);
    }

    if (parser == null)
    {
        throw new InvalidOperationException("This method required access to the PDF object parser but it was not created yet. Figure out how to fix this.");
    }

    // Resolving the reference moves the reader, so remember where the stream data starts.
    var currentOffset = source.GetPosition();

    var obj = parser.Parse(lengthObj.ToIndirectReference(), source, isLenientParsing);

    source.Seek(currentOffset);

    if (obj is ICosNumber referenceNumber)
    {
        return referenceNumber;
    }

    throw new InvalidOperationException("Length object content was not read.");
}
/// <summary>
/// Copies exactly the number of bytes given by <paramref name="streamLengthObj"/> from
/// <paramref name="reader"/> to <paramref name="output"/>, in chunks of at most STREAMCOPYBUFLEN bytes.
/// </summary>
/// <exception cref="InvalidOperationException">The reader returns no data before the length is reached.</exception>
private void ReadValidStream(IRandomAccessRead reader, BinaryWriter output, ICosNumber streamLengthObj)
{
    for (long remainBytes = streamLengthObj.AsLong(); remainBytes > 0;)
    {
        int chunk = (int)Math.Min(remainBytes, STREAMCOPYBUFLEN);

        int readBytes = reader.Read(streamCopyBuf, 0, chunk);
        if (readBytes <= 0)
        {
            // shouldn't happen, the stream length has already been validated
            throw new InvalidOperationException(
                $"read error at offset {reader.GetPosition()}: expected {chunk} bytes, but read() returns {readBytes}");
        }

        output.Write(streamCopyBuf, 0, readBytes);
        remainBytes -= readBytes;
    }
}
/// <summary>
/// Consumes the whitespace between the "stream" keyword and the stream data: any run of spaces followed by
/// LF, CR LF or a lone CR. A character which is not part of such a sequence is pushed back.
/// </summary>
protected void skipWhiteSpaces(IRandomAccessRead reader)
{
    // PDF Ref 3.2.7: the keyword must be followed by CRLF or LF, but some producers
    // (see brother_scan_cover.pdf) put spaces in front of the line break, so swallow those first.
    int whitespace;
    do
    {
        whitespace = reader.Read();
    } while (whitespace == ' ');

    if (whitespace == ReadHelper.AsciiLineFeed)
    {
        return;
    }

    if (whitespace == ReadHelper.AsciiCarriageReturn)
    {
        whitespace = reader.Read();

        if (whitespace == ReadHelper.AsciiLineFeed)
        {
            return;
        }
    }

    // Either a CR without LF (invalid per the spec but seen in real files) or no line break at all;
    // be lenient and hand the character back as the first byte of data.
    reader.Unread(whitespace);
}
/// <summary>
/// Checks whether <paramref name="streamLength"/> bytes from the current position end inside the file and are
/// followed (after optional spaces) by "endstream". The reader position is restored when the check seeks.
/// </summary>
/// <returns>True when the declared length leads to an "endstream" keyword.</returns>
private bool validateStreamLength(IRandomAccessRead source, long streamLength, long fileLength)
{
    long originOffset = source.GetPosition();
    long expectedEndOfStream = originOffset + streamLength;

    if (expectedEndOfStream > fileLength)
    {
        // The declared end lies beyond the file; the caller falls back to scanning for the keyword.
        return false;
    }

    source.Seek(expectedEndOfStream);
    ReadHelper.SkipSpaces(source);

    var endsWithKeyword = ReadHelper.IsString(source, "endstream");

    source.Seek(originOffset);

    return endsWithKeyword;
}
/// <summary>
/// Copies bytes from <paramref name="source"/> to <paramref name="output"/> until the ENDSTREAM keyword bytes
/// (or, once three bytes have matched, the ENDOBJ keyword bytes) are found or the source returns no more data.
/// On a match the reader is rewound so the keyword and everything buffered after it can be read again.
/// </summary>
/// <remarks>
/// NOTE(review): the skip-ahead test (bytes outside 'a'..'t') and the restart rule (comparisons against 'e'
/// and 'n') assume the keyword tables hold lowercase ASCII — confirm ENDSTREAM and ENDOBJ agree with that.
/// </remarks>
/// <param name="source">Reader positioned at the first byte of stream data.</param>
/// <param name="output">Receives the bytes which precede the keyword.</param>
private void ReadUntilEndStream(IRandomAccessRead source, BinaryWriter output)
{
int bufSize;
// number of keyword bytes matched so far; carried over between buffer refills
int charMatchCount = 0;
byte[] keyw = ENDSTREAM;
// last character position of shortest keyword ('endobj')
int quickTestOffset = 5;
// read next chunk into buffer; already matched chars are added to beginning of buffer
while ((bufSize = source.Read(strmBuf, charMatchCount, STRMBUFLEN - charMatchCount)) > 0)
{
bufSize += charMatchCount;
int bIdx = charMatchCount;
int quickTestIdx;
// iterate over buffer, trying to find keyword match
for (int maxQuicktestIdx = bufSize - quickTestOffset; bIdx < bufSize; bIdx++)
{
// reduce compare operations by first test last character we would have to
// match if current one matches; if it is not a character from keywords
// we can move behind the test character; this shortcut is inspired by the
// Boyer-Moore string search algorithm and can reduce parsing time by approx. 20%
quickTestIdx = bIdx + quickTestOffset;
if (charMatchCount == 0 && quickTestIdx < maxQuicktestIdx)
{
byte ch = strmBuf[quickTestIdx];
if ((ch > 't') || (ch < 'a'))
{
// last character we would have to match if current character would match
// is not a character from keywords -> jump behind and start over
bIdx = quickTestIdx;
continue;
}
}
// byte is unsigned here; only ASCII values are compared
byte ch1 = strmBuf[bIdx];
if (ch1 == keyw[charMatchCount])
{
if (++charMatchCount == keyw.Length)
{
// match found
bIdx++;
break;
}
}
else
{
if ((charMatchCount == 3) && (ch1 == ENDOBJ[charMatchCount]))
{
// maybe ENDSTREAM is missing but we could have ENDOBJ
keyw = ENDOBJ;
charMatchCount++;
}
else
{
// no match; incrementing match start by 1 would be dumb since we already know
// matched chars depending on current char read we may already have beginning
// of a new match: 'e': first char matched; 'n': if we are at match position
// idx 7 we already read 'e' thus 2 chars matched for each other char we have
// to start matching first keyword char beginning with next read position
charMatchCount = (ch1 == 'e') ? 1 : ((ch1 == 'n') && (charMatchCount == 7)) ? 2 : 0;
// search again for 'endstream'
keyw = ENDSTREAM;
}
}
}
// everything before the (partially) matched keyword is stream content
int contentBytes = Math.Max(0, bIdx - charMatchCount);
// write buffer content until first matched char to output stream
if (contentBytes > 0)
{
output.Write(strmBuf, 0, contentBytes);
}
if (charMatchCount == keyw.Length)
{
// keyword matched; unread matched keyword (endstream/endobj) and following buffered content
source.Rewind(bufSize - contentBytes);
break;
}
// copy matched chars at start of buffer
Array.Copy(keyw, 0, strmBuf, 0, charMatchCount);
}
// this writes a lonely CR or drops trailing CR LF and LF
// output.flush();
}
}
}

View File

@@ -1,37 +1,11 @@
namespace UglyToad.PdfPig.Parser.Parts
{
using System;
using Cos;
using Exceptions;
using IO;
using Tokenization.Scanner;
using Tokenization.Tokens;
internal static class DirectObjectFinder
{
/// <summary>
/// Resolves <paramref name="baseObject"/> and follows further indirect objects, or single-element arrays
/// holding one, until an object of type <typeparamref name="T"/> is reached.
/// </summary>
/// <returns>The first resolved object of type <typeparamref name="T"/>.</returns>
/// <exception cref="InvalidOperationException">The chain ends in an object which is neither <typeparamref name="T"/> nor another reference.</exception>
public static T Find<T>(CosObject baseObject, IPdfObjectParser parser, IRandomAccessRead reader,
    bool isLenientParsing) where T : CosBase
{
    var result = parser.Parse(baseObject.ToIndirectReference(), reader, isLenientParsing);

    if (result is T resultT)
    {
        return resultT;
    }

    // Work out which object, if any, to follow next.
    var next = result as CosObject;

    if (next == null && result is COSArray arr && arr.Count == 1)
    {
        next = arr.get(0) as CosObject;
    }

    if (next != null)
    {
        return Find<T>(next, parser, reader, isLenientParsing);
    }

    throw new InvalidOperationException($"Could not find the object {baseObject.ToIndirectReference()} with type {typeof(T).Name}.");
}
public static T Get<T>(IToken token, IPdfTokenScanner scanner) where T : IToken
{
if (token is T result)

View File

@@ -3,7 +3,6 @@
using Filters;
using Fonts.Parser;
using Logging;
using Parser;
using Parser.FileStructure;
using Parser.Parts;
using Parser.Parts.CrossReference;
@@ -34,10 +33,8 @@
var nameParser = new CosNameParser();
var dictionaryParser = new CosDictionaryParser(nameParser, logger);
var baseParser = new CosBaseParser(nameParser, new CosStringParser(), dictionaryParser, new CosArrayParser());
var streamParser = new CosStreamParser(logger);
var filterProvider = new MemoryFilterProvider(new DecodeParameterResolver(logger), new PngPredictor(), logger);
var crossReferenceParser = new CrossReferenceStreamParser(filterProvider);
var objectStreamParser = new ObjectStreamParser(logger, filterProvider, baseParser);
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, crossReferenceParser, new CrossReferenceTableParser());
@@ -50,10 +47,8 @@
container.Register(nameParser);
container.Register(dictionaryParser);
container.Register(baseParser);
container.Register(streamParser);
container.Register(crossReferenceParser);
container.Register(crossReferenceTableParser);
container.Register(objectStreamParser);
container.Register(filterProvider);
container.Register(cmapParser);
container.Register(afmParser);