unify raw byte access method

This commit is contained in:
Eliot Jones
2018-01-21 18:08:00 +00:00
parent 3172596b7c
commit 07161cef28
27 changed files with 521 additions and 910 deletions

View File

@@ -9,18 +9,9 @@ namespace UglyToad.PdfPig.Tests.Parser.Parts
public class BruteForceSearcherTests
{
[Fact]
public void ReaderNull_Throws()
{
// ReSharper disable once ConvertToLocalFunction
Action action = () => new BruteForceSearcher(null);
Assert.Throws<ArgumentNullException>(action);
}
private const string TestData = @"%PDF-1.5
%¿÷¢þ
2 0 obj
2 17 obj
<< /Linearized 1 /L 26082 /H [ 722 130 ] /O 6 /E 25807 /N 1 /T 25806 >>
endobj
@@ -44,14 +35,21 @@ startxref
216
%%EOF";
[Fact]
public void ReaderNull_Throws()
{
Action action = () => new BruteForceSearcher(null);
Assert.Throws<ArgumentNullException>(action);
}
[Fact]
public void SearcherFindsCorrectObjects()
{
var bytes = OtherEncodings.StringAsLatin1Bytes(TestData);
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
var reader = new RandomAccessBuffer(bytes);
var searcher = new BruteForceSearcher(reader);
var searcher = new BruteForceSearcher(input);
var locations = searcher.GetObjectLocations();
@@ -59,28 +57,24 @@ startxref
Assert.Equal(locations.Values, new long[]
{
TestData.IndexOf("2 0 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase) + 1,
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase) + 1,
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase) + 1,
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase) + 1
});
}
[Fact]
public void ReaderOnlyCallsOnce()
{
var bytes = OtherEncodings.StringAsLatin1Bytes(TestData);
var reader = StringBytesTestConverter.Convert(TestData, false);
var reader = new ThrowingReader(new RandomAccessBuffer(bytes));
var searcher = new BruteForceSearcher(reader);
var searcher = new BruteForceSearcher(reader.Bytes);
var locations = searcher.GetObjectLocations();
Assert.Equal(4, locations.Count);
reader.Throw = true;
var newLocations = searcher.GetObjectLocations();
Assert.Equal(4, locations.Count);

View File

@@ -1,9 +1,7 @@
namespace UglyToad.PdfPig.Tests.Parser.Parts.CrossReference
{
using System;
using IO;
using PdfPig.Parser.Parts.CrossReference;
using PdfPig.Util;
using Xunit;
public class TableSubsectionDefinitionTests
@@ -39,11 +37,9 @@
[Fact]
public void TryReadIncorrectFormatSinglePartFalse()
{
var bytes = OtherEncodings.StringAsLatin1Bytes(@"76362");
var input = StringBytesTestConverter.Convert("76362", false);
var input = new RandomAccessBuffer(bytes);
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
Assert.False(result);
}
@@ -51,11 +47,9 @@
[Fact]
public void TryReadIncorrectFormatMultiplePartsFalse()
{
var bytes = OtherEncodings.StringAsLatin1Bytes(@"76362 100 1000");
var input = StringBytesTestConverter.Convert("76362 100 1000", false);
var input = new RandomAccessBuffer(bytes);
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
Assert.False(result);
}
@@ -63,11 +57,9 @@
[Fact]
public void FirstPartInvalidFormatFalse()
{
var bytes = OtherEncodings.StringAsLatin1Bytes("00adb85 97");
var input = StringBytesTestConverter.Convert("00adb85 97", false);
var input = new RandomAccessBuffer(bytes);
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
Assert.False(result);
}
@@ -75,11 +67,9 @@
[Fact]
public void SecondPartInvalidFormatFalse()
{
var bytes = OtherEncodings.StringAsLatin1Bytes("85 9t");
var input = new RandomAccessBuffer(bytes);
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
var input = StringBytesTestConverter.Convert("85 9t", false);
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
Assert.False(result);
}
@@ -87,11 +77,9 @@
[Fact]
public void ValidTrue()
{
var bytes = OtherEncodings.StringAsLatin1Bytes("12 32");
var input = StringBytesTestConverter.Convert("12 32", false);
var input = new RandomAccessBuffer(bytes);
var result = TableSubsectionDefinition.TryRead(log, input, out var definition);
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var definition);
Assert.True(result);
@@ -102,11 +90,9 @@
[Fact]
public void ValidWithLongTrue()
{
var bytes = OtherEncodings.StringAsLatin1Bytes("214748364700 6");
var input = StringBytesTestConverter.Convert("214748364700 6", false);
var input = new RandomAccessBuffer(bytes);
var result = TableSubsectionDefinition.TryRead(log, input, out var definition);
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var definition);
Assert.True(result);

View File

@@ -1,120 +0,0 @@
namespace UglyToad.PdfPig.Tests
{
using System;
using IO;
/// <summary>
/// Test double wrapping another <see cref="IRandomAccessRead"/>. Every member forwards to the
/// wrapped reader, unless <see cref="Throw"/> is set, in which case every member (including
/// <see cref="Dispose"/>) throws an <see cref="InvalidOperationException"/> before forwarding.
/// </summary>
internal class ThrowingReader : IRandomAccessRead
{
    private readonly IRandomAccessRead reader;

    /// <summary>
    /// Wraps the supplied reader; the reader is stored as-is without validation.
    /// </summary>
    public ThrowingReader(IRandomAccessRead reader)
    {
        this.reader = reader;
    }

    /// <summary>
    /// When true, any subsequent call to a member of this reader throws.
    /// </summary>
    public bool Throw { get; set; }

    public void Dispose()
    {
        FailIfRequested();
        reader.Dispose();
    }

    public int Read()
    {
        FailIfRequested();
        return reader.Read();
    }

    public int Read(byte[] b)
    {
        FailIfRequested();
        return reader.Read(b);
    }

    public int Read(byte[] b, int offset, int length)
    {
        FailIfRequested();
        return reader.Read(b, offset, length);
    }

    public long GetPosition()
    {
        FailIfRequested();
        return reader.GetPosition();
    }

    public void Seek(long position)
    {
        FailIfRequested();
        reader.Seek(position);
    }

    public long Length()
    {
        FailIfRequested();
        return reader.Length();
    }

    public bool IsClosed()
    {
        FailIfRequested();
        return reader.IsClosed();
    }

    public int Peek()
    {
        FailIfRequested();
        return reader.Peek();
    }

    public void Rewind(int bytes)
    {
        FailIfRequested();
        reader.Rewind(bytes);
    }

    public byte[] ReadFully(int length)
    {
        FailIfRequested();
        return reader.ReadFully(length);
    }

    public bool IsEof()
    {
        FailIfRequested();
        return reader.IsEof();
    }

    public int Available()
    {
        FailIfRequested();
        return reader.Available();
    }

    public void ReturnToBeginning()
    {
        FailIfRequested();
        reader.ReturnToBeginning();
    }

    public void Unread(int b)
    {
        FailIfRequested();
        reader.Unread(b);
    }

    public void Unread(byte[] bytes)
    {
        FailIfRequested();
        reader.Unread(bytes);
    }

    public void Unread(byte[] bytes, int start, int length)
    {
        FailIfRequested();
        reader.Unread(bytes, start, length);
    }

    // Single place for the guard shared by every forwarded member.
    private void FailIfRequested()
    {
        if (Throw)
        {
            throw new InvalidOperationException();
        }
    }
}
}

View File

@@ -1,11 +1,10 @@
namespace UglyToad.PdfPig.Content
{
using IO;
using Tokenization.Tokens;
internal interface IPageFactory
{
Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader,
Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
bool isLenientParsing);
void LoadResources(DictionaryToken dictionary, bool isLenientParsing);

View File

@@ -2,7 +2,6 @@
{
using System;
using System.Collections.Generic;
using IO;
using Logging;
using Parser.Parts;
using Tokenization.Scanner;
@@ -14,7 +13,6 @@
private readonly ILog log;
private readonly Catalog catalog;
private readonly IPageFactory pageFactory;
private readonly IRandomAccessRead reader;
private readonly bool isLenientParsing;
private readonly IPdfTokenScanner pdfScanner;
private readonly DictionaryToken rootPageDictionary;
@@ -22,8 +20,7 @@
public int Count { get; }
internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory,
IRandomAccessRead reader, bool isLenientParsing, IPdfTokenScanner pdfScanner)
internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory, bool isLenientParsing, IPdfTokenScanner pdfScanner)
{
if (catalog == null)
{
@@ -37,7 +34,6 @@
this.log = log;
this.catalog = catalog;
this.pageFactory = pageFactory;
this.reader = reader;
this.isLenientParsing = isLenientParsing;
this.pdfScanner = pdfScanner;
}
@@ -47,7 +43,7 @@
if (locatedPages.TryGetValue(pageNumber, out DictionaryToken targetPageDictionary))
{
// TODO: cache the page
return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader,
return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(),
isLenientParsing);
}
@@ -61,7 +57,7 @@
throw new ArgumentOutOfRangeException("Could not find the page with number: " + pageNumber);
}
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), isLenientParsing);
locatedPages[pageNumber] = targetPageDictionary;

View File

@@ -44,7 +44,7 @@
var code = CMap.ReadCode(bytes);
codeLength = bytes.CurrentOffset - current;
codeLength = (int)(bytes.CurrentOffset - current);
return code;
}

View File

@@ -1,10 +1,12 @@
namespace UglyToad.PdfPig.Fonts.Parser.Handlers
{
using System;
using Cmap;
using Encodings;
using Exceptions;
using Filters;
using IO;
using Logging;
using Parts;
using PdfPig.Parser.Parts;
using Simple;
@@ -15,6 +17,7 @@
internal class TrueTypeFontHandler : IFontHandler
{
private readonly ILog log;
private readonly IFilterProvider filterProvider;
private readonly CMapCache cMapCache;
private readonly FontDescriptorFactory fontDescriptorFactory;
@@ -22,12 +25,13 @@
private readonly IEncodingReader encodingReader;
private readonly IPdfTokenScanner pdfScanner;
public TrueTypeFontHandler(IPdfTokenScanner pdfScanner, IFilterProvider filterProvider,
public TrueTypeFontHandler(ILog log, IPdfTokenScanner pdfScanner, IFilterProvider filterProvider,
CMapCache cMapCache,
FontDescriptorFactory fontDescriptorFactory,
TrueTypeFontParser trueTypeFontParser,
IEncodingReader encodingReader)
{
this.log = log;
this.filterProvider = filterProvider;
this.cMapCache = cMapCache;
this.fontDescriptorFactory = fontDescriptorFactory;
@@ -47,7 +51,7 @@
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfScanner, fontDescriptorFactory, dictionary, isLenientParsing);
// TODO: use the parsed font fully.
//var font = ParseTrueTypeFont(descriptor, reader, isLenientParsing);
var font = ParseTrueTypeFont(descriptor);
var name = FontDictionaryAccessHelper.GetName(pdfScanner, dictionary, descriptor, isLenientParsing);
@@ -69,8 +73,7 @@
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
}
private TrueTypeFont ParseTrueTypeFont(FontDescriptor descriptor, IRandomAccessRead reader,
bool isLenientParsing)
private TrueTypeFont ParseTrueTypeFont(FontDescriptor descriptor)
{
if (descriptor?.FontFile == null)
{
@@ -83,18 +86,23 @@
$"Expected a TrueType font in the TrueType font descriptor, instead it was {descriptor.FontFile.FileType}.");
}
//var fontFileStream = pdfObjectParser.Parse(descriptor.FontFile.ObjectKey, reader, isLenientParsing) as PdfRawStream;
//if (fontFileStream == null)
try
{
var fontFileStream = DirectObjectFinder.Get<StreamToken>(descriptor.FontFile.ObjectKey, pdfScanner);
var fontFile = fontFileStream.Decode(filterProvider);
var font = trueTypeFontParser.Parse(new TrueTypeDataBytes(new ByteArrayInputBytes(fontFile)));
return font;
}
catch (Exception ex)
{
log.Error("Could not parse the TrueType font.", ex);
return null;
}
//var fontFile = fontFileStream.Decode(filterProvider);
//var font = trueTypeFontParser.Parse(new TrueTypeDataBytes(new ByteArrayInputBytes(fontFile)));
//return font;
}
}
}

View File

@@ -12,8 +12,8 @@
currentOffset = -1;
}
private int currentOffset;
public int CurrentOffset => currentOffset + 1;
private long currentOffset;
public long CurrentOffset => currentOffset + 1;
public bool MoveNext()
{
@@ -23,7 +23,7 @@
}
currentOffset++;
CurrentByte = bytes[currentOffset];
CurrentByte = bytes[(int)currentOffset];
return true;
}
@@ -38,7 +38,7 @@
return null;
}
return bytes[currentOffset + 1];
return bytes[(int)currentOffset + 1];
}
public bool IsAtEnd()
@@ -49,7 +49,7 @@
public void Seek(long position)
{
currentOffset = (int)position - 1;
CurrentByte = currentOffset < 0 ? (byte)0 : bytes[currentOffset];
CurrentByte = currentOffset < 0 ? (byte)0 : bytes[(int)currentOffset];
}
}
}

View File

@@ -2,7 +2,7 @@
{
internal interface IInputBytes
{
int CurrentOffset { get; }
long CurrentOffset { get; }
bool MoveNext();

View File

@@ -1,7 +0,0 @@
namespace UglyToad.PdfPig.IO
{
/// <summary>
/// Marker interface combining <see cref="IRandomAccessRead"/> and <see cref="RandomAccessWrite"/>;
/// it declares no members of its own.
/// </summary>
internal interface RandomAccess : IRandomAccessRead, RandomAccessWrite
{
// super interface for both read and write
}
}

View File

@@ -4,7 +4,7 @@
using System.Collections.Generic;
using System.IO;
internal class RandomAccessBuffer : RandomAccess
internal class RandomAccessBuffer : IDisposable
{
// default chunk size is 1kb
private static readonly int DefaultChunkSize = 1024;

View File

@@ -1,123 +0,0 @@
namespace UglyToad.PdfPig.IO
{
using System;
/// <summary>
/// Read-side contract for a seekable, disposable source of bytes.
/// </summary>
/// <remarks>
/// NOTE(review): the member documentation was ported from Java-style comments which listed
/// IOException for failures; verify the exceptions actually thrown against the implementations.
/// </remarks>
internal interface IRandomAccessRead : IDisposable
{
/// <summary>
/// Read a single byte of data.
/// </summary>
/// <returns>The byte of data that is being read.</returns>
int Read();
/// <summary>
/// Read a buffer of data.
/// </summary>
/// <param name="b">The buffer to write the data to.</param>
/// <returns>The number of bytes that were actually read.</returns>
int Read(byte[]
b);
/// <summary>
/// Read a buffer of data.
/// </summary>
/// <param name="b">The buffer to write the data to.</param>
/// <param name="offset">Offset into the buffer to start writing.</param>
/// <param name="length">The amount of data to attempt to read.</param>
/// <returns>The number of bytes that were actually read.</returns>
int Read(byte[]
b, int offset, int length);
/// <summary>
/// Returns the offset of the next byte to be returned by a read method.
/// </summary>
/// <returns>
/// Offset of the byte which will be returned by the next <see cref="Read()"/>
/// (if no more bytes are left it returns a value &gt;= length of source).
/// </returns>
long GetPosition();
/// <summary>
/// Seek to a position in the data.
/// </summary>
/// <param name="position">The position to seek to.</param>
void Seek(long position);
/// <summary>
/// The total number of bytes that are available.
/// </summary>
/// <returns>The number of bytes available.</returns>
long Length();
/// <summary>
/// Returns true if this stream has been closed.
/// </summary>
bool IsClosed();
/// <summary>
/// This will peek at the next byte.
/// </summary>
/// <returns>The next byte on the stream, leaving it as available to read.</returns>
int Peek();
/// <summary>
/// Seek backwards the given number of bytes.
/// </summary>
/// <param name="bytes">The number of bytes to be seeked backwards.</param>
void Rewind(int bytes);
/// <summary>
/// Reads a given number of bytes.
/// </summary>
/// <param name="length">The number of bytes to be read.</param>
/// <returns>A byte array containing the bytes just read.</returns>
byte[]
ReadFully(int length);
/// <summary>
/// A simple test to see if we are at the end of the data.
/// </summary>
/// <returns>True if we are at the end of the data.</returns>
bool IsEof();
/// <summary>
/// Returns an estimate of the number of bytes that can be read.
/// </summary>
/// <returns>The number of bytes that can be read.</returns>
int Available();
// NOTE(review): undocumented in the original; presumably repositions to the start of the data - confirm against implementations.
void ReturnToBeginning();
// NOTE(review): undocumented in the original; presumably pushes a single byte back so it is read again - confirm against implementations.
void Unread(int b);
// NOTE(review): undocumented in the original; presumably pushes back the whole array - confirm against implementations.
void Unread(byte[] bytes);
// NOTE(review): undocumented in the original; presumably pushes back 'length' bytes of the array beginning at 'start' - confirm against implementations.
void Unread(byte[] bytes, int start, int length);
}
}

View File

@@ -1,40 +0,0 @@
namespace UglyToad.PdfPig.IO
{
using System;
/// <summary>
/// Write-side contract for a disposable byte buffer/stream.
/// </summary>
/// <remarks>
/// NOTE(review): member names are lower-case because the declarations were ported from Java;
/// the original comments listed IOException for failures - verify against implementations.
/// </remarks>
internal interface RandomAccessWrite : IDisposable
{
/// <summary>
/// Write a byte to the stream.
/// </summary>
/// <param name="b">The byte to write.</param>
void write(int b);
/// <summary>
/// Write a buffer of data to the stream.
/// </summary>
/// <param name="b">The buffer to get the data from.</param>
void write(byte[]
b);
/// <summary>
/// Write a buffer of data to the stream.
/// </summary>
/// <param name="b">The buffer to get the data from.</param>
/// <param name="offset">An offset into the buffer to get the data from.</param>
/// <param name="length">The length of data to write.</param>
void write(byte[]
b, int offset, int length);
/// <summary>
/// Clears all data of the buffer.
/// </summary>
void clear();
}
}

View File

@@ -3,21 +3,13 @@
using System;
using Content;
using Exceptions;
using IO;
using Parts;
using Tokenization.Scanner;
using Tokenization.Tokens;
internal class CatalogFactory
{
private readonly IPdfTokenScanner scanner;
public CatalogFactory(IPdfTokenScanner scanner)
{
this.scanner = scanner;
}
public Catalog Create(DictionaryToken dictionary, IRandomAccessRead reader, bool isLenientParsing)
public Catalog Create(IPdfTokenScanner scanner, DictionaryToken dictionary)
{
if (dictionary == null)
{

View File

@@ -12,9 +12,9 @@
this.offsetValidator = offsetValidator;
}
public long Validate(long crossReferenceOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
public long Validate(long crossReferenceOffset, ISeekableTokenScanner scanner, IInputBytes bytes, bool isLenientParsing)
{
long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, scanner, reader, isLenientParsing);
long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, scanner, bytes, isLenientParsing);
if (fixedOffset > -1)
{
crossReferenceOffset = fixedOffset;

View File

@@ -19,6 +19,7 @@
private readonly XrefCosOffsetChecker xrefCosChecker;
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
XrefCosOffsetChecker xrefCosChecker,
CrossReferenceStreamParser crossReferenceStreamParser,
CrossReferenceTableParser crossReferenceTableParser)
{
@@ -26,13 +27,12 @@
this.offsetValidator = offsetValidator;
this.crossReferenceStreamParser = crossReferenceStreamParser;
this.crossReferenceTableParser = crossReferenceTableParser;
xrefCosChecker = new XrefCosOffsetChecker();
this.xrefCosChecker = xrefCosChecker;
}
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long xrefLocation, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
{
long fixedOffset = offsetValidator.CheckXRefOffset(xrefLocation, tokenScanner, reader, isLenientParsing);
long fixedOffset = offsetValidator.CheckXRefOffset(xrefLocation, tokenScanner, bytes, isLenientParsing);
if (fixedOffset > -1)
{
xrefLocation = fixedOffset;
@@ -76,7 +76,7 @@
int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;
// check the xref stream reference
fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, reader, isLenientParsing);
fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != streamOffset)
{
log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");
@@ -143,7 +143,7 @@
if (previousCrossReferenceLocation > 0)
{
// check the xref table reference
fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, reader, isLenientParsing);
fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, bytes, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
{
previousCrossReferenceLocation = fixedOffset;
@@ -170,7 +170,7 @@
var resolved = table.Build(xrefLocation, log);
// check the offsets of all referenced objects
xrefCosChecker.checkXrefOffsets(reader, resolved, isLenientParsing);
xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing);
return resolved;
}

View File

@@ -6,174 +6,122 @@
using ContentStream;
using Cos;
using IO;
using Logging;
using Parts;
using Util;
internal class XrefCosOffsetChecker
{
private static readonly long MINIMUM_SEARCH_OFFSET = 6;
private Dictionary<IndirectReference, long> bfSearchCOSObjectKeyOffsets;
private bool validateXrefOffsets(IRandomAccessRead reader, Dictionary<IndirectReference, long> xrefOffset)
private readonly ILog log;
private readonly BruteForceSearcher bruteForceSearcher;
private IReadOnlyDictionary<IndirectReference, long> objectKeyOffsets;
public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher)
{
this.log = log;
this.bruteForceSearcher = bruteForceSearcher;
}
private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary<IndirectReference, long> xrefOffset)
{
if (xrefOffset == null)
{
return true;
}
foreach (var objectEntry in xrefOffset)
{
IndirectReference objectKey = objectEntry.Key;
long objectOffset = objectEntry.Value;
// a negative offset number represents a object number itself
// see type 2 entry in xref stream
if (objectOffset >= 0
&& !checkObjectKeys(reader, objectKey, objectOffset))
if (objectOffset >= 0 && !CheckObjectKeys(bytes, objectKey, objectOffset))
{
//LOG.debug("Stop checking xref offsets as at least one (" + objectKey
// + ") couldn't be dereferenced");
log.Debug($"Stop checking xref offsets as at least one ({objectKey}) couldn't be dereferenced");
return false;
}
}
return true;
}
private bool checkObjectKeys(IRandomAccessRead source, IndirectReference objectKey, long offset)
private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
{
// there can't be any object at the very beginning of a pdf
if (offset < MINIMUM_SEARCH_OFFSET)
{
return false;
}
long objectNr = objectKey.ObjectNumber;
long objectGen = objectKey.Generation;
long originOffset = source.GetPosition();
string objectString = ObjectHelper.createObjectString(objectNr, objectGen);
long originOffset = bytes.CurrentOffset;
string objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);
try
{
source.Seek(offset);
if (ReadHelper.IsString(source, OtherEncodings.StringAsLatin1Bytes(objectString)))
bytes.Seek(offset);
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
bytes.MoveNext();
}
if (ReadHelper.IsString(bytes, objectString))
{
// everything is ok, return origin object key
source.Seek(originOffset);
bytes.Seek(originOffset);
return true;
}
}
catch (InvalidOperationException)
catch (Exception)
{
// Swallow the exception, obviously there isn't any valid object number
}
finally
{
source.Seek(originOffset);
bytes.Seek(originOffset);
}
// no valid object number found
return false;
}
private Dictionary<IndirectReference, long> getBFCosObjectOffsets(IRandomAccessRead reader)
private IReadOnlyDictionary<IndirectReference, long> getBFCosObjectOffsets()
{
if (bfSearchCOSObjectKeyOffsets == null)
if (objectKeyOffsets == null)
{
bfSearchForObjects(reader);
var offsets = bruteForceSearcher.GetObjectLocations();
objectKeyOffsets = offsets;
}
return bfSearchCOSObjectKeyOffsets;
return objectKeyOffsets;
}
/// <summary>
/// Scans the source byte by byte for object headers and records, for each header found,
/// the offset at which it starts in <c>bfSearchCOSObjectKeyOffsets</c> (replacing any previous
/// contents). The reader position is restored to where it was on entry before returning.
/// </summary>
/// <param name="source">The reader to scan; it is seeked repeatedly during the search.</param>
private void bfSearchForObjects(IRandomAccessRead source)
{
// run the end-of-file marker search first; lastEOFMarker bounds the scan loop below
bfSearchForLastEOFMarker(source);
bfSearchCOSObjectKeyOffsets = new Dictionary<IndirectReference, long>();
long originOffset = source.GetPosition();
long currentOffset = MINIMUM_SEARCH_OFFSET;
// long.MinValue / int.MinValue act as "nothing found yet" sentinels
long lastObjectId = long.MinValue;
int lastGenID = int.MinValue;
long lastObjOffset = long.MinValue;
// only the lengths of these two arrays are used, to skip past a matched keyword
char[] objString = " obj".ToCharArray();
char[] endobjString = "endobj".ToCharArray();
bool endobjFound = false;
do
{
source.Seek(currentOffset);
// NOTE(review): the match uses "obj" while objString is " obj"; whether the byte at
// currentOffset - 1 is the generation digit depends on ReadHelper.IsString semantics - confirm.
if (ReadHelper.IsString(source, "obj"))
{
long tempOffset = currentOffset - 1;
source.Seek(tempOffset);
int genID = source.Peek();
// is the next char a digit?
if (ReadHelper.IsDigit(genID))
{
// convert the ASCII digit to its numeric value (48 is '0'); only a single digit is read here
genID -= 48;
tempOffset--;
source.Seek(tempOffset);
if (ReadHelper.IsSpace(source))
{
// walk backwards over the spaces separating the generation from the object number
while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsSpace(source))
{
source.Seek(--tempOffset);
}
bool objectIDFound = false;
// walk backwards over the digits of the object number
while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsDigit(source))
{
source.Seek(--tempOffset);
objectIDFound = true;
}
if (objectIDFound)
{
// step over the byte in front of the digits, then parse the object number going forwards
source.Read();
long objectId = ObjectHelper.ReadObjectNumber(source);
if (lastObjOffset > 0)
{
// add the former object ID only if there was a subsequent object ID
bfSearchCOSObjectKeyOffsets[new IndirectReference(lastObjectId, lastGenID)] = lastObjOffset;
}
lastObjectId = objectId;
lastGenID = genID;
// tempOffset stopped one byte before the first digit of the object number
lastObjOffset = tempOffset + 1;
currentOffset += objString.Length - 1;
endobjFound = false;
}
}
}
}
else if (ReadHelper.IsString(source, "endobj"))
{
endobjFound = true;
currentOffset += endobjString.Length - 1;
}
currentOffset++;
} while (currentOffset < lastEOFMarker && !source.IsEof());
if ((lastEOFMarker < long.MaxValue || endobjFound) && lastObjOffset > 0)
{
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
// the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
bfSearchCOSObjectKeyOffsets[new IndirectReference(lastObjectId, lastGenID)] = lastObjOffset;
}
// reestablish origin position
source.Seek(originOffset);
}
/**
* Check the XRef table by dereferencing all objects and fixing the offset if necessary.
*
* @throws InvalidOperationException if something went wrong.
*/
public void checkXrefOffsets(IRandomAccessRead reader, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
/// <summary>
/// Check that the offsets in the cross reference are correct.
/// </summary>
public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
{
// repair mode isn't available in non-lenient mode
if (!isLenientParsing)
{
return;
}
Dictionary<IndirectReference, long> xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value);
if (validateXrefOffsets(reader, xrefOffset))
if (ValidateXrefOffsets(bytes, xrefOffset))
{
return;
}
Dictionary<IndirectReference, long> bfCOSObjectKeyOffsets = getBFCosObjectOffsets(reader);
IReadOnlyDictionary<IndirectReference, long> bfCOSObjectKeyOffsets = getBFCosObjectOffsets();
if (bfCOSObjectKeyOffsets.Count > 0)
{
List<IndirectReference> objStreams = new List<IndirectReference>();
@@ -225,49 +173,64 @@
foreach (var item in bfCOSObjectKeyOffsets)
{
xrefOffset.Add(item.Key, item.Value);
xrefOffset[item.Key] = item.Value;
}
}
}
private long? lastEOFMarker = null;
private void bfSearchForLastEOFMarker(IRandomAccessRead source)
private long? lastEndOfFileMarker;
private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
{
if (lastEOFMarker == null)
if (lastEndOfFileMarker != null)
{
long originOffset = source.GetPosition();
source.Seek(MINIMUM_SEARCH_OFFSET);
while (!source.IsEof())
return;
}
long startOffset = source.CurrentOffset;
source.Seek(MINIMUM_SEARCH_OFFSET);
while (!source.IsAtEnd())
{
// search for EOF marker
if (ReadHelper.IsString(source, "%%EOF"))
{
// search for EOF marker
if (ReadHelper.IsString(source, "%%EOF"))
long tempMarker = source.CurrentOffset;
if (tempMarker >= source.Length)
{
long tempMarker = source.GetPosition();
source.Seek(tempMarker + 5);
try
{
// check if the following data is some valid pdf content
// which most likely indicates that the pdf is linearized,
// updated or just cut off somewhere in the middle
ReadHelper.SkipSpaces(source);
ObjectHelper.ReadObjectNumber(source);
ObjectHelper.ReadGenerationNumber(source);
}
catch (InvalidOperationException)
{
// save the EOF marker as the following data is most likely some garbage
lastEOFMarker = tempMarker;
}
lastEndOfFileMarker = tempMarker;
break;
}
try
{
source.Seek(tempMarker + 5);
// check if the following data is some valid pdf content
// which most likely indicates that the pdf is linearized,
// updated or just cut off somewhere in the middle
ReadHelper.SkipSpaces(source);
ObjectHelper.ReadObjectNumber(source);
ObjectHelper.ReadGenerationNumber(source);
}
catch (Exception)
{
// save the EOF marker as the following data is most likely some garbage
lastEndOfFileMarker = tempMarker;
}
source.Read();
}
source.Seek(originOffset);
// no EOF marker found
if (lastEOFMarker == null)
{
lastEOFMarker = long.MaxValue;
}
source.MoveNext();
}
source.Seek(startOffset);
// no EOF marker found
if (lastEndOfFileMarker == null)
{
lastEndOfFileMarker = long.MaxValue;
}
}
}

View File

@@ -22,7 +22,7 @@
this.log = log;
}
public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing)
{
// repair mode isn't available in non-lenient mode
if (!isLenientParsing)
@@ -30,14 +30,15 @@
return startXRefOffset;
}
reader.Seek(startXRefOffset);
scanner.Seek(startXRefOffset);
ReadHelper.SkipSpaces(reader);
scanner.MoveNext();
if (reader.Peek() == 'x' && ReadHelper.IsString(reader, "xref"))
if (ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref))
{
return startXRefOffset;
}
if (startXRefOffset > 0)
{
if (CheckXRefStreamOffset(startXRefOffset, scanner, true))
@@ -45,14 +46,14 @@
return startXRefOffset;
}
return CalculateXRefFixedOffset(startXRefOffset, scanner, reader);
return CalculateXRefFixedOffset(startXRefOffset, scanner, inputBytes);
}
// can't find a valid offset
return -1;
}
private long CalculateXRefFixedOffset(long objectOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
private long CalculateXRefFixedOffset(long objectOffset, ISeekableTokenScanner scanner, IInputBytes inputBytes)
{
if (objectOffset < 0)
{
@@ -61,10 +62,12 @@
}
// start a brute force search for all xref tables and try to find the offset we are looking for
long newOffset = BfSearchForXRef(objectOffset, scanner, reader);
var newOffset = BruteForceSearchForXref(objectOffset, scanner, inputBytes);
if (newOffset > -1)
{
log.Debug($"Fixed reference for xref table/stream {objectOffset} -> {newOffset}");
return newOffset;
}
@@ -73,90 +76,18 @@
return 0;
}
/// <summary>
/// Scans the reader for cross reference stream candidates and stores the offsets found in
/// <c>bfSearchXRefStreamsOffsets</c>. The search runs at most once: if the list already exists the
/// method returns immediately. The reader position is restored to its entry value afterwards.
/// </summary>
/// <param name="reader">The reader to scan; it is seeked repeatedly during the search.</param>
private void BfSearchForXRefStreams(IRandomAccessRead reader)
{
if (bfSearchXRefStreamsOffsets != null)
{
return;
}
// a pdf may contain more than one /XRef entry
bfSearchXRefStreamsOffsets = new List<long>();
long originOffset = reader.GetPosition();
reader.Seek(MinimumSearchOffset);
// search for XRef streams
var objString = " obj";
while (!reader.IsEof())
{
// NOTE(review): the comments refer to /XRef entries but the literal matched is "xref" - confirm this is intended.
if (ReadHelper.IsString(reader, "xref"))
{
// search backwards for the beginning of the stream
long newOffset = -1;
long xrefOffset = reader.GetPosition();
bool objFound = false;
// look back in windows of 10 bytes, up to 39 windows (390 bytes) before the match
for (int i = 1; i < 40 && !objFound; i++)
{
long currentOffset = xrefOffset - (i * 10);
if (currentOffset > 0)
{
reader.Seek(currentOffset);
// test each of the 10 positions in this window for the " obj" keyword
for (int j = 0; j < 10; j++)
{
if (ReadHelper.IsString(reader, objString))
{
long tempOffset = currentOffset - 1;
reader.Seek(tempOffset);
int genId = reader.Peek();
// is the next char a digit?
if (ReadHelper.IsDigit(genId))
{
tempOffset--;
reader.Seek(tempOffset);
if (ReadHelper.IsSpace(reader))
{
// count the digits of the object number while walking backwards
int length = 0;
reader.Seek(--tempOffset);
while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
{
reader.Seek(--tempOffset);
length++;
}
if (length > 0)
{
// step over the byte in front of the digits; the position is then the candidate offset
reader.Read();
newOffset = reader.GetPosition();
}
}
}
// stop looking back once " obj" was seen, whether or not a valid header preceded it
objFound = true;
break;
}
else
{
currentOffset++;
reader.Read();
}
}
}
}
if (newOffset > -1)
{
bfSearchXRefStreamsOffsets.Add(newOffset);
}
// continue the forward scan after the matched keyword
reader.Seek(xrefOffset + 5);
}
reader.Read();
}
reader.Seek(originOffset);
}
private long BfSearchForXRef(long xrefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader)
{
long newOffset = -1;
long newOffsetTable = -1;
long newOffsetStream = -1;
BfSearchForXRefTables(reader);
BruteForceSearchForTables(reader);
BfSearchForXRefStreams(reader);
if (bfSearchXRefTablesOffsets != null)
{
// TODO to be optimized, this won't work in every case
@@ -196,35 +127,143 @@
return newOffset;
}
private void BfSearchForXRefTables(IRandomAccessRead reader)
private void BruteForceSearchForTables(IInputBytes bytes)
{
if (bfSearchXRefTablesOffsets == null)
if (bfSearchXRefTablesOffsets != null)
{
// a pdf may contain more than one xref entry
bfSearchXRefTablesOffsets = new List<long>();
long originOffset = reader.GetPosition();
reader.Seek(MinimumSearchOffset);
// search for xref tables
while (!reader.IsEof())
{
if (ReadHelper.IsString(reader, "xref"))
{
long newOffset = reader.GetPosition();
reader.Seek(newOffset - 1);
// ensure that we don't read "startxref" instead of "xref"
if (ReadHelper.IsWhitespace(reader))
{
bfSearchXRefTablesOffsets.Add(newOffset);
}
reader.Seek(newOffset + 4);
}
reader.Read();
}
reader.Seek(originOffset);
return;
}
// a pdf may contain more than one xref entry
bfSearchXRefTablesOffsets = new List<long>();
var startOffset = bytes.CurrentOffset;
bytes.Seek(MinimumSearchOffset);
// search for xref tables
while (bytes.MoveNext() && !bytes.IsAtEnd())
{
if (ReadHelper.IsString(bytes, "xref"))
{
var newOffset = bytes.CurrentOffset;
bytes.Seek(newOffset - 1);
// ensure that we don't read "startxref" instead of "xref"
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
bfSearchXRefTablesOffsets.Add(newOffset);
}
bytes.Seek(newOffset + 4);
}
}
bytes.Seek(startOffset);
}
private long SearchNearestValue(List<long> values, long offset)
/// <summary>
/// Scans the input once for cross reference stream objects and caches the offsets found in
/// <c>bfSearchXRefStreamsOffsets</c>. Subsequent calls return immediately.
/// The offset the input had on entry is restored before returning.
/// </summary>
/// <remarks>
/// A cross reference stream is recognised by the <c>/XRef</c> name in its dictionary. The previous
/// implementation searched for lowercase <c>xref</c>, which is the keyword of a cross reference
/// <em>table</em> (and the tail of <c>startxref</c>) and never matches <c>/XRef</c> because the
/// comparison is byte-exact.
/// </remarks>
/// <param name="bytes">The input to scan, starting from <c>MinimumSearchOffset</c>.</param>
private void BfSearchForXRefStreams(IInputBytes bytes)
{
    // A non-null list means the search has already run; reuse the cached result.
    if (bfSearchXRefStreamsOffsets != null)
    {
        return;
    }

    // Marker that identifies a cross reference stream dictionary ("/Type /XRef").
    const string xrefStreamMarker = "/XRef";
    const string objString = " obj";

    // a pdf may contain more than one /XRef entry
    bfSearchXRefStreamsOffsets = new List<long>();

    // Remember where the caller was so the offset can be put back at the end.
    var startOffset = bytes.CurrentOffset;

    bytes.Seek(MinimumSearchOffset);

    // search for XRef streams
    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, xrefStreamMarker))
        {
            continue;
        }

        // search backwards for the beginning of the stream
        long newOffset = -1;
        long xrefOffset = bytes.CurrentOffset;
        bool objFound = false;

        // Probe windows that start 10, 20, ... 390 positions before the marker; stop at the first " obj" hit.
        for (var i = 1; i < 40 && !objFound; i++)
        {
            long currentOffset = xrefOffset - (i * 10);

            if (currentOffset <= 0)
            {
                continue;
            }

            bytes.Seek(currentOffset);

            // Each window is scanned forwards one position at a time, at most 10 positions.
            for (int j = 0; j < 10; j++)
            {
                if (ReadHelper.IsString(bytes, objString))
                {
                    // Step back over what should be "<object number> <generation digit>" in front of " obj".
                    long tempOffset = currentOffset - 1;
                    bytes.Seek(tempOffset);

                    var generationNumber = bytes.Peek();

                    // is the next char a digit?
                    if (generationNumber.HasValue && ReadHelper.IsDigit(generationNumber.Value))
                    {
                        tempOffset--;
                        bytes.Seek(tempOffset);

                        // is the digit preceded by a space?
                        if (ReadHelper.IsSpace(bytes.CurrentByte))
                        {
                            int length = 0;
                            bytes.Seek(--tempOffset);

                            // Walk backwards over the digits of the object number, never going below MinimumSearchOffset.
                            while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(bytes.CurrentByte))
                            {
                                bytes.Seek(--tempOffset);
                                length++;
                            }

                            if (length > 0)
                            {
                                // The walk stopped one position before the first digit; advance once to step onto it.
                                bytes.MoveNext();
                                newOffset = bytes.CurrentOffset;
                            }
                        }
                    }

                    // " obj" was found, so stop probing further back even if no valid header preceded it.
                    objFound = true;
                    break;
                }

                currentOffset++;
                bytes.MoveNext();
            }
        }

        // Only record an offset when a well-formed object header was located.
        if (newOffset > -1)
        {
            bfSearchXRefStreamsOffsets.Add(newOffset);
        }

        // Continue the outer scan just past the marker that was matched.
        bytes.Seek(xrefOffset + xrefStreamMarker.Length);
    }

    bytes.Seek(startOffset);
}
private static long SearchNearestValue(List<long> values, long offset)
{
long newValue = -1;
long? currentDifference = null;
@@ -255,8 +294,9 @@
{
return true;
}
// seek to offset-1
scanner.Seek(startXRefOffset - 1);
scanner.Seek(startXRefOffset);
if (scanner.TryReadToken(out NumericToken objectNumber))
{
try
@@ -280,7 +320,6 @@
if (!scanner.TryReadToken(out DictionaryToken dictionary))
{
scanner.Seek(startXRefOffset);
}
if (dictionary.TryGet(NameToken.Type, out var type) && NameToken.Xref.Equals(type))
@@ -293,6 +332,11 @@
log.Error("Couldn't read the xref stream object.", ex);
}
}
else
{
log.Error($"When looking for the cross reference stream object we sought a number but found: {scanner.CurrentToken}.");
}
return false;
}
}

View File

@@ -31,7 +31,7 @@
this.pdfScanner = pdfScanner;
}
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader,
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
bool isLenientParsing)
{
if (dictionary == null)

View File

@@ -1,38 +0,0 @@
namespace UglyToad.PdfPig.Parser
{
    using System;
    using System.Diagnostics;
    using Cos;
    using IO;
    using Logging;
    using Util;

    /// <summary>
    /// Immutable bundle of the collaborators handed to parsing code: the byte reader, the cross
    /// reference table, the caching providers, the service container and the leniency flag.
    /// </summary>
    internal class ParsingArguments
    {
        /// <summary>
        /// Creates the bundle. Every reference argument is required.
        /// </summary>
        /// <exception cref="ArgumentNullException">Any of the reference arguments is null.</exception>
        public ParsingArguments(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable, ParsingCachingProviders cachingProviders, IContainer container, bool isLenientParsing)
        {
            if (reader == null)
            {
                throw new ArgumentNullException(nameof(reader));
            }

            if (crossReferenceTable == null)
            {
                throw new ArgumentNullException(nameof(crossReferenceTable));
            }

            if (cachingProviders == null)
            {
                throw new ArgumentNullException(nameof(cachingProviders));
            }

            if (container == null)
            {
                throw new ArgumentNullException(nameof(container));
            }

            Reader = reader;
            CrossReferenceTable = crossReferenceTable;
            CachingProviders = cachingProviders;
            Container = container;
            IsLenientParsing = isLenientParsing;

            // Logging is always a no-op for this type; no logger is accepted from the caller.
            Log = new NoOpLog();
        }

        public IRandomAccessRead Reader { get; }

        public CrossReferenceTable CrossReferenceTable { get; }

        public ParsingCachingProviders CachingProviders { get; }

        public IContainer Container { get; }

        public bool IsLenientParsing { get; }

        public ILog Log { get; }

        /// <summary>
        /// Resolves <typeparamref name="T"/> from <see cref="Container"/>.
        /// </summary>
        [DebuggerStepThrough]
        public T Get<T>()
        {
            return Container.Get<T>();
        }
    }
}

View File

@@ -2,25 +2,26 @@
{
using System;
using System.Collections.Generic;
using System.Text;
using ContentStream;
using IO;
using Util;
using Util.JetBrains.Annotations;
/// <summary>
/// Store the results of a brute force search for all Cos Objects in the document so we only do it once.
/// Store the results of a brute force search for all objects in the document so we only do it once.
/// </summary>
internal class BruteForceSearcher
{
private const int MinimumSearchOffset = 6;
private readonly IRandomAccessRead reader;
private readonly IInputBytes bytes;
private Dictionary<IndirectReference, long> objectLocations;
public BruteForceSearcher([NotNull] IRandomAccessRead reader)
public BruteForceSearcher([NotNull] IInputBytes bytes)
{
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
}
[NotNull]
@@ -35,70 +36,86 @@
var results = new Dictionary<IndirectReference, long>();
var originPosition = reader.GetPosition();
var originPosition = bytes.CurrentOffset;
long currentOffset = MinimumSearchOffset;
long lastObjectId = long.MinValue;
int lastGenerationId = int.MinValue;
long lastObjOffset = long.MinValue;
byte[] objString = OtherEncodings.StringAsLatin1Bytes(" obj");
byte[] endobjString = OtherEncodings.StringAsLatin1Bytes("endobj");
bool inObject = false;
bool endobjFound = false;
do
{
reader.Seek(currentOffset);
if (ReadHelper.IsString(reader, objString))
{
long tempOffset = currentOffset - 1;
reader.Seek(tempOffset);
int generationId = reader.Peek();
bytes.Seek(currentOffset);
// is the next char a digit?
if (ReadHelper.IsDigit(generationId))
if (inObject)
{
if (ReadHelper.IsString(bytes, "endobj"))
{
generationId -= 48;
tempOffset--;
reader.Seek(tempOffset);
if (ReadHelper.IsSpace(reader))
{
while (tempOffset > MinimumSearchOffset && ReadHelper.IsSpace(reader))
{
reader.Seek(--tempOffset);
}
bool objectIdFound = false;
while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
{
reader.Seek(--tempOffset);
objectIdFound = true;
}
if (objectIdFound)
{
reader.Read();
long objectId = ObjectHelper.ReadObjectNumber(reader);
if (lastObjOffset > 0)
{
// add the former object ID only if there was a subsequent object ID
results[new IndirectReference(lastObjectId, lastGenerationId)] = lastObjOffset;
}
lastObjectId = objectId;
lastGenerationId = generationId;
lastObjOffset = tempOffset + 1;
currentOffset += objString.Length - 1;
endobjFound = false;
}
}
inObject = false;
endobjFound = true;
currentOffset += "endobj".Length;
}
else
{
currentOffset++;
}
continue;
}
else if (ReadHelper.IsString(reader, "endobj"))
if (!ReadHelper.IsString(bytes, " obj"))
{
endobjFound = true;
currentOffset += endobjString.Length - 1;
currentOffset++;
continue;
}
// Current byte is ' '[obj]
var offset = currentOffset - 1;
bytes.Seek(offset);
var generationBytes = new StringBuilder();
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
generationBytes.Insert(0, (char)bytes.CurrentByte);
offset--;
bytes.Seek(offset);
}
// We should now be at the space between object and generation number.
if (!ReadHelper.IsSpace(bytes.CurrentByte))
{
continue;
}
bytes.Seek(--offset);
var objectNumberBytes = new StringBuilder();
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
objectNumberBytes.Insert(0, (char)bytes.CurrentByte);
offset--;
bytes.Seek(offset);
}
if (!ReadHelper.IsWhitespace(bytes.CurrentByte))
{
continue;
}
var obj = long.Parse(objectNumberBytes.ToString());
var generation = int.Parse(generationBytes.ToString());
results[new IndirectReference(obj, generation)] = bytes.CurrentOffset + 1;
inObject = true;
endobjFound = false;
currentOffset++;
} while (currentOffset < lastEndOfFile && !reader.IsEof());
} while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());
if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)
{
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
@@ -107,7 +124,7 @@
}
// reestablish origin position
reader.Seek(originPosition);
bytes.Seek(originPosition);
objectLocations = results;
@@ -116,27 +133,29 @@
private long GetLastEndOfFileMarker()
{
var originalOffset = reader.GetPosition();
var originalOffset = bytes.CurrentOffset;
var searchTerm = OtherEncodings.StringAsLatin1Bytes("%%EOF");
const string searchTerm = "%%EOF";
var minimumEndOffset = reader.Length() - searchTerm.Length;
var minimumEndOffset = bytes.Length - searchTerm.Length;
reader.Seek(minimumEndOffset);
bytes.Seek(minimumEndOffset);
while (reader.GetPosition() > 0)
while (bytes.CurrentOffset > 0)
{
if (ReadHelper.IsString(reader, searchTerm))
if (ReadHelper.IsString(bytes, searchTerm))
{
var position = reader.GetPosition();
reader.Seek(originalOffset);
var position = bytes.CurrentOffset;
bytes.Seek(originalOffset);
return position;
}
reader.Seek(minimumEndOffset--);
bytes.Seek(minimumEndOffset--);
}
reader.Seek(originalOffset);
bytes.Seek(originalOffset);
return long.MaxValue;
}
}

View File

@@ -46,11 +46,11 @@
/// <summary>
/// Attempts to read the <see cref="TableSubsectionDefinition"/> from the current line of the source.
/// </summary>
public static bool TryRead(ILog log, IRandomAccessRead source, out TableSubsectionDefinition definition)
public static bool TryRead(ILog log, IInputBytes bytes, out TableSubsectionDefinition definition)
{
definition = default(TableSubsectionDefinition);
var line = ReadHelper.ReadLine(source);
var line = ReadHelper.ReadLine(bytes);
var parts = line.Split(Splitters, StringSplitOptions.RemoveEmptyEntries);

View File

@@ -8,30 +8,31 @@
private const long ObjectNumberThreshold = 10000000000L;
private const long GenerationNumberThreshold = 65535;
public static long ReadObjectNumber(IRandomAccessRead reader)
public static long ReadObjectNumber(IInputBytes bytes)
{
long retval = ReadHelper.ReadLong(reader);
if (retval < 0 || retval >= ObjectNumberThreshold)
long result = ReadHelper.ReadLong(bytes);
if (result < 0 || result >= ObjectNumberThreshold)
{
throw new FormatException($"Object Number \'{retval}\' has more than 10 digits or is negative");
throw new FormatException($"Object Number \'{result}\' has more than 10 digits or is negative");
}
return retval;
return result;
}
public static int ReadGenerationNumber(IRandomAccessRead reader)
public static int ReadGenerationNumber(IInputBytes bytes)
{
int retval = ReadHelper.ReadInt(reader);
if (retval < 0 || retval > GenerationNumberThreshold)
int result = ReadHelper.ReadInt(bytes);
if (result < 0 || result > GenerationNumberThreshold)
{
throw new FormatException("Generation Number '" + retval + "' has more than 5 digits");
throw new FormatException("Generation Number '" + result + "' has more than 5 digits");
}
return retval;
return result;
}
public static string createObjectString(long objectID, long genID)
public static string CreateObjectString(long objectId, long genId)
{
return $"{objectID} {genID} obj";
return $"{objectId} {genId} obj";
}
}
}

View File

@@ -2,9 +2,8 @@
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Exceptions;
using IO;
using Util;
@@ -13,23 +12,25 @@
public const byte AsciiLineFeed = 10;
public const byte AsciiCarriageReturn = 13;
public static string ReadLine(IRandomAccessRead reader)
public static string ReadLine(IInputBytes bytes)
{
if (reader == null)
if (bytes == null)
{
throw new ArgumentNullException(nameof(reader));
throw new ArgumentNullException(nameof(bytes));
}
if (reader.IsEof())
if (bytes.IsAtEnd())
{
throw new InvalidOperationException("Error: End-of-File, expected line");
}
var buffer = new StringBuilder(11);
int c;
while ((c = reader.Read()) != -1)
byte c = 0;
while (bytes.MoveNext())
{
c = bytes.CurrentByte;
// CR and LF are valid EOLs
if (IsEndOfLine(c))
{
@@ -40,56 +41,43 @@
}
// CR+LF is also a valid EOL
if (IsCarriageReturn(c) && IsLineFeed(reader.Peek()))
if (IsCarriageReturn(c) && IsLineFeed(bytes.Peek()))
{
reader.Read();
bytes.MoveNext();
}
return buffer.ToString();
}
/// <summary>
/// Skips leading whitespace/comments, then collects characters until an end-of-name character or
/// the end of the input (a read of -1) is met. A terminating character is pushed back onto the reader.
/// </summary>
public static string ReadString(IRandomAccessRead reader)
{
    SkipSpaces(reader);

    var text = new StringBuilder();
    int next;

    for (next = reader.Read(); ; next = reader.Read())
    {
        // Same test order as before: the end-of-name check runs first, then the end-of-input check.
        if (IsEndOfName((char)next) || next == -1)
        {
            break;
        }

        text.Append((char)next);
    }

    // Leave the terminator for the next consumer; there is nothing to push back at end of input.
    if (next != -1)
    {
        reader.Unread(next);
    }

    return text.ToString();
}
/// <summary>
/// Advances <paramref name="bytes"/> past any run of whitespace and '%' comments.
/// When a byte that is neither is read, the input is stepped back one position with
/// <c>Seek(CurrentOffset - 1)</c> so that byte is left for the next consumer.
/// </summary>
/// <remarks>
/// Every advance checks the result of <c>MoveNext</c>. Without that check a comment or a run of
/// whitespace that reaches the end of the input would spin forever, because the current byte no
/// longer changes once the input is exhausted.
/// </remarks>
public static void SkipSpaces(IInputBytes bytes)
{
    const byte commentCharacter = 37;

    while (bytes.MoveNext())
    {
        var c = bytes.CurrentByte;

        if (c == commentCharacter)
        {
            // skip past the comment section: consume up to and including the end-of-line byte,
            // or stop when the input runs out.
            while (bytes.MoveNext() && !IsEndOfLine(bytes.CurrentByte))
            {
            }

            continue;
        }

        if (IsWhitespace(c))
        {
            continue;
        }

        // First byte that is not whitespace or part of a comment: step back so it is not consumed.
        bytes.Seek(bytes.CurrentOffset - 1);
        return;
    }
}
@@ -114,15 +102,7 @@
{
return EndOfNameCharacters.Contains(ch);
}
/// <summary>
/// Reports whether the value returned by peeking the reader is whitespace.
/// </summary>
public static bool IsWhitespace(IRandomAccessRead reader) => IsWhitespace(reader.Peek());
/// <summary>
/// Determines if a character is whitespace or not.
/// </summary>
@@ -135,50 +115,50 @@
|| c == AsciiCarriageReturn || c == ' ';
}
/// <summary>
/// Reports whether the value is a line feed or a carriage return.
/// </summary>
public static bool IsEndOfLine(int c) => IsLineFeed(c) || IsCarriageReturn(c);
/// <summary>
/// Reports whether the character is a line feed (10) or a carriage return (13).
/// The character is compared directly rather than narrowed to a byte, because narrowing keeps only
/// the low 8 bits and would make characters such as U+010A look like a line feed.
/// </summary>
public static bool IsEndOfLine(char c) => c == '\n' || c == '\r';
/// <summary>
/// Reports whether the byte is a line feed or a carriage return.
/// </summary>
public static bool IsEndOfLine(byte b) => IsLineFeed(b) || IsCarriageReturn(b);
public static bool IsLineFeed(int c)
public static bool IsLineFeed(byte? c)
{
return AsciiLineFeed == c;
}
public static bool IsCarriageReturn(int c)
public static bool IsCarriageReturn(byte c)
{
return AsciiCarriageReturn == c;
}
public static bool IsString(IRandomAccessRead reader, string str) => IsString(reader, str.Select(x => (byte)x));
public static bool IsString(IRandomAccessRead reader, IEnumerable<byte> str)
public static bool IsString(IInputBytes bytes, string s)
{
bool bytesMatching = true;
long originOffset = reader.GetPosition();
foreach (var c in str)
bool found = true;
var startOffset = bytes.CurrentOffset;
foreach (var c in s)
{
if (reader.Read() != c)
if (bytes.CurrentByte != c)
{
bytesMatching = false;
found = false;
break;
}
bytes.MoveNext();
}
reader.Seek(originOffset);
return bytesMatching;
bytes.Seek(startOffset);
return found;
}
public static long ReadLong(IRandomAccessRead reader)
public static long ReadLong(IInputBytes bytes)
{
SkipSpaces(reader);
SkipSpaces(bytes);
long retval;
StringBuilder longBuffer = ReadStringNumber(reader);
StringBuilder longBuffer = ReadStringNumber(bytes);
try
{
@@ -187,46 +167,45 @@
catch (FormatException e)
{
var bytesToReverse = OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString());
reader.Unread(bytesToReverse);
bytes.Seek(bytes.CurrentOffset - bytesToReverse.Length);
throw new InvalidOperationException($"Error: Expected a long type at offset {reader.GetPosition()}, instead got \'{longBuffer}\'", e);
throw new InvalidOperationException($"Error: Expected a long type at offset {bytes.CurrentOffset}, instead got \'{longBuffer}\'", e);
}
return retval;
}
private static StringBuilder ReadStringNumber(IRandomAccessRead reader)
private static readonly int MaximumNumberStringLength = long.MaxValue.ToString("D").Length;
private static StringBuilder ReadStringNumber(IInputBytes reader)
{
int lastByte = 0;
byte lastByte;
StringBuilder buffer = new StringBuilder();
while ((lastByte = reader.Read()) != ' ' &&
while (reader.MoveNext() && (lastByte = reader.CurrentByte) != ' ' &&
lastByte != AsciiLineFeed &&
lastByte != AsciiCarriageReturn &&
lastByte != 60 && //see sourceforge bug 1714707
lastByte != '[' && // PDFBOX-1845
lastByte != '(' && // PDFBOX-2579
lastByte != 0 && //See sourceforge bug 853328
lastByte != -1)
lastByte != 0)
{
buffer.Append((char)lastByte);
if (buffer.Length > long.MaxValue.ToString("D").Length)
if (buffer.Length > MaximumNumberStringLength)
{
throw new IOException("Number '" + buffer + "' is getting too long, stop reading at offset " + reader.GetPosition());
throw new InvalidOperationException($"Number \'{buffer}\' is getting too long, stop reading at offset {reader.CurrentOffset}");
}
}
if (lastByte != -1)
if (!reader.IsAtEnd())
{
reader.Unread(lastByte);
reader.Seek(reader.CurrentOffset - 1);
}
return buffer;
}
/// <summary>
/// Reports whether the value returned by peeking the reader is a digit.
/// </summary>
public static bool IsDigit(IRandomAccessRead reader) => IsDigit(reader.Peek());
/// <summary>
/// This will tell if the given value is a digit or not.
/// </summary>
@@ -235,17 +214,17 @@
return c >= '0' && c <= '9';
}
public static int ReadInt(IRandomAccessRead reader)
public static int ReadInt(IInputBytes bytes)
{
if (reader == null)
if (bytes == null)
{
throw new ArgumentNullException(nameof(reader));
throw new ArgumentNullException(nameof(bytes));
}
SkipSpaces(reader);
SkipSpaces(bytes);
int result;
var intBuffer = ReadStringNumber(reader);
var intBuffer = ReadStringNumber(bytes);
try
{
@@ -253,45 +232,14 @@
}
catch (Exception e)
{
reader.Unread(OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString()));
throw new IOException("Error: Expected an integer type at offset " + reader.GetPosition(), e);
bytes.Seek(bytes.CurrentOffset - OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString()).Length);
throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}", e);
}
return result;
}
/// <summary>
/// Reads <paramref name="expectedstring"/> from the reader, delegating to the three-argument
/// overload with <c>skipSpaces</c> set to false.
/// </summary>
public static void ReadExpectedString(IRandomAccessRead reader, string expectedstring)
    => ReadExpectedString(reader, expectedstring, false);
/// <summary>
/// Reads the given pattern from the reader, character by character.
/// Whitespace is skipped before the pattern and again after it.
/// </summary>
/// <param name="reader">The source to read from.</param>
/// <param name="expectedstring">The pattern that must appear next in the reader.</param>
/// <param name="skipSpaces">Not consulted by this method: whitespace is skipped regardless of its value.</param>
/// <exception cref="IOException">A character read from the reader differs from the pattern.</exception>
public static void ReadExpectedString(IRandomAccessRead reader, string expectedstring, bool skipSpaces)
{
    SkipSpaces(reader);

    for (var index = 0; index < expectedstring.Length; index++)
    {
        var c = expectedstring[index];

        if (reader.Read() == c)
        {
            continue;
        }

        throw new IOException($"Expected string \'{expectedstring}\' but missed character \'{c}\' at offset {reader.GetPosition()}");
    }

    SkipSpaces(reader);
}
/// <summary>
/// Reports whether the value returned by peeking the reader is a space.
/// </summary>
public static bool IsSpace(IRandomAccessRead reader) => IsSpace(reader.Peek());
/**
* This will tell if the given value is a space or not.
*
@@ -302,17 +250,7 @@
{
return ' ' == c;
}
/// <summary>
/// Reads one value from the reader, converts it to a character and requires it to equal
/// <paramref name="ec"/>.
/// </summary>
/// <exception cref="InvalidOperationException">The character read differs from <paramref name="ec"/>.</exception>
public static void ReadExpectedChar(IRandomAccessRead reader, char ec)
{
    var c = (char)reader.Read();

    if (c == ec)
    {
        return;
    }

    throw new InvalidOperationException($"expected=\'{ec}\' actual=\'{c}\' at offset {reader.GetPosition()}");
}
public static bool IsHexDigit(char ch)
{
return char.IsDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');

View File

@@ -29,14 +29,12 @@
var container = Bootstrapper.GenerateContainer(options?.Logger);
var isLenientParsing = options?.UseLenientParsing ?? true;
var reader = new RandomAccessBuffer(fileBytes);
var inputBytes = new ByteArrayInputBytes(fileBytes);
var tokenScanner = new CoreTokenScanner(inputBytes);
var document = OpenDocument(reader, inputBytes, tokenScanner, container, isLenientParsing);
var document = OpenDocument(inputBytes, tokenScanner, container, isLenientParsing);
return document;
}
@@ -51,23 +49,27 @@
return Open(File.ReadAllBytes(filename), options);
}
private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
private static PdfDocument OpenDocument(IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
{
var log = container.Get<ILog>();
var filterProvider = container.Get<IFilterProvider>();
var bruteForceSearcher = new BruteForceSearcher(reader);
var catalogFactory = new CatalogFactory();
var cMapCache = new CMapCache(new CMapParser());
CrossReferenceTable crossReferenceTable = null;
var bruteForceSearcher = new BruteForceSearcher(inputBytes);
var xrefValidator = new XrefOffsetValidator(log);
var objectChecker = new XrefCosOffsetChecker(log, bruteForceSearcher);
// We're ok with this since our intent is to lazily load the cross reference table.
// ReSharper disable once AccessToModifiedClosure
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, bruteForceSearcher);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider);
var xrefValidator = new XrefOffsetValidator(log);
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser, new CrossReferenceTableParser());
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, objectChecker, crossReferenceStreamParser, new CrossReferenceTableParser());
var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
@@ -76,22 +78,20 @@
// TODO: make this use the scanner.
var validator = new CrossReferenceOffsetValidator(xrefValidator);
crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, reader, isLenientParsing);
crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, inputBytes, isLenientParsing);
crossReferenceTable = crossReferenceParser.Parse(reader, isLenientParsing, crossReferenceOffset, pdfScanner, scanner);
crossReferenceTable = crossReferenceParser.Parse(inputBytes, isLenientParsing, crossReferenceOffset, pdfScanner, scanner);
var trueTypeFontParser = new TrueTypeFontParser();
var fontDescriptorFactory = new FontDescriptorFactory();
var cidFontFactory = new CidFontFactory(pdfScanner, fontDescriptorFactory, trueTypeFontParser, filterProvider);
var encodingReader = new EncodingReader(pdfScanner);
var cMapCache = new CMapCache(new CMapParser());
var fontFactory = new FontFactory(log, new Type0FontHandler(cidFontFactory,
cMapCache,
filterProvider, pdfScanner),
new TrueTypeFontHandler(pdfScanner, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
new TrueTypeFontHandler(log, pdfScanner, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
new Type1FontHandler(pdfScanner, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, new Type1FontParser()),
new Type3FontHandler(pdfScanner, cMapCache, filterProvider, encodingReader));
@@ -99,17 +99,17 @@
var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
var informationFactory = new DocumentInformationFactory();
var catalogFactory = new CatalogFactory(pdfScanner);
var rootDictionary = ParseTrailer(crossReferenceTable, isLenientParsing, pdfScanner);
var information = informationFactory.Create(pdfScanner, crossReferenceTable.Dictionary);
var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);
var catalog = catalogFactory.Create(pdfScanner, rootDictionary);
var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer);
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
return new PdfDocument(log, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
pdfScanner);
}

View File

@@ -15,8 +15,6 @@
/// </summary>
public class PdfDocument : IDisposable
{
[NotNull]
private readonly IRandomAccessRead reader;
[NotNull]
private readonly HeaderVersion version;
[NotNull]
@@ -51,7 +49,7 @@
/// </summary>
public int NumberOfPages => Pages.Count;
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
internal PdfDocument(ILog log, HeaderVersion version, CrossReferenceTable crossReferenceTable,
bool isLenientParsing,
ParsingCachingProviders cachingProviders,
IPageFactory pageFactory,
@@ -59,7 +57,6 @@
DocumentInformation information, IPdfTokenScanner pdfScanner)
{
this.log = log;
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
this.version = version ?? throw new ArgumentNullException(nameof(version));
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
this.isLenientParsing = isLenientParsing;
@@ -67,7 +64,7 @@
this.pdfScanner = pdfScanner;
Information = information ?? throw new ArgumentNullException(nameof(information));
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
Pages = new Pages(log, Catalog, pageFactory, reader, isLenientParsing, pdfScanner);
Pages = new Pages(log, Catalog, pageFactory, isLenientParsing, pdfScanner);
}
/// <summary>
@@ -103,7 +100,6 @@
{
try
{
reader.Dispose();
}
catch
{

View File

@@ -18,6 +18,7 @@
public static readonly OperatorToken Dup = new OperatorToken("dup");
public static readonly OperatorToken For = new OperatorToken("for");
public static readonly OperatorToken Put = new OperatorToken("put");
public static readonly OperatorToken Xref = new OperatorToken("xref");
public string Data { get; }
@@ -60,6 +61,8 @@
return For;
case "put":
return Put;
case "xref":
return Xref;
default:
return new OperatorToken(data);
}