mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
unify raw byte access method
This commit is contained in:
@@ -9,18 +9,9 @@ namespace UglyToad.PdfPig.Tests.Parser.Parts
|
||||
|
||||
public class BruteForceSearcherTests
|
||||
{
|
||||
[Fact]
|
||||
public void ReaderNull_Throws()
|
||||
{
|
||||
// ReSharper disable once ConvertToLocalFunction
|
||||
Action action = () => new BruteForceSearcher(null);
|
||||
|
||||
Assert.Throws<ArgumentNullException>(action);
|
||||
}
|
||||
|
||||
private const string TestData = @"%PDF-1.5
|
||||
%¿÷¢þ
|
||||
2 0 obj
|
||||
2 17 obj
|
||||
<< /Linearized 1 /L 26082 /H [ 722 130 ] /O 6 /E 25807 /N 1 /T 25806 >>
|
||||
endobj
|
||||
|
||||
@@ -44,14 +35,21 @@ startxref
|
||||
216
|
||||
%%EOF";
|
||||
|
||||
[Fact]
|
||||
public void ReaderNull_Throws()
|
||||
{
|
||||
Action action = () => new BruteForceSearcher(null);
|
||||
|
||||
Assert.Throws<ArgumentNullException>(action);
|
||||
}
|
||||
|
||||
|
||||
[Fact]
|
||||
public void SearcherFindsCorrectObjects()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes(TestData);
|
||||
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
|
||||
|
||||
var reader = new RandomAccessBuffer(bytes);
|
||||
|
||||
var searcher = new BruteForceSearcher(reader);
|
||||
var searcher = new BruteForceSearcher(input);
|
||||
|
||||
var locations = searcher.GetObjectLocations();
|
||||
|
||||
@@ -59,28 +57,24 @@ startxref
|
||||
|
||||
Assert.Equal(locations.Values, new long[]
|
||||
{
|
||||
TestData.IndexOf("2 0 obj", StringComparison.OrdinalIgnoreCase),
|
||||
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
|
||||
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
|
||||
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
|
||||
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase) + 1,
|
||||
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase) + 1,
|
||||
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase) + 1,
|
||||
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase) + 1
|
||||
});
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReaderOnlyCallsOnce()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes(TestData);
|
||||
var reader = StringBytesTestConverter.Convert(TestData, false);
|
||||
|
||||
var reader = new ThrowingReader(new RandomAccessBuffer(bytes));
|
||||
|
||||
var searcher = new BruteForceSearcher(reader);
|
||||
var searcher = new BruteForceSearcher(reader.Bytes);
|
||||
|
||||
var locations = searcher.GetObjectLocations();
|
||||
|
||||
Assert.Equal(4, locations.Count);
|
||||
|
||||
reader.Throw = true;
|
||||
|
||||
|
||||
var newLocations = searcher.GetObjectLocations();
|
||||
|
||||
Assert.Equal(4, locations.Count);
|
||||
|
@@ -1,9 +1,7 @@
|
||||
namespace UglyToad.PdfPig.Tests.Parser.Parts.CrossReference
|
||||
{
|
||||
using System;
|
||||
using IO;
|
||||
using PdfPig.Parser.Parts.CrossReference;
|
||||
using PdfPig.Util;
|
||||
using Xunit;
|
||||
|
||||
public class TableSubsectionDefinitionTests
|
||||
@@ -39,11 +37,9 @@
|
||||
[Fact]
|
||||
public void TryReadIncorrectFormatSinglePartFalse()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes(@"76362");
|
||||
var input = StringBytesTestConverter.Convert("76362", false);
|
||||
|
||||
var input = new RandomAccessBuffer(bytes);
|
||||
|
||||
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
|
||||
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
|
||||
|
||||
Assert.False(result);
|
||||
}
|
||||
@@ -51,11 +47,9 @@
|
||||
[Fact]
|
||||
public void TryReadIncorrectFormatMultiplePartsFalse()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes(@"76362 100 1000");
|
||||
var input = StringBytesTestConverter.Convert("76362 100 1000", false);
|
||||
|
||||
var input = new RandomAccessBuffer(bytes);
|
||||
|
||||
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
|
||||
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
|
||||
|
||||
Assert.False(result);
|
||||
}
|
||||
@@ -63,11 +57,9 @@
|
||||
[Fact]
|
||||
public void FirstPartInvalidFormatFalse()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes("00adb85 97");
|
||||
var input = StringBytesTestConverter.Convert("00adb85 97", false);
|
||||
|
||||
var input = new RandomAccessBuffer(bytes);
|
||||
|
||||
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
|
||||
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
|
||||
|
||||
Assert.False(result);
|
||||
}
|
||||
@@ -75,11 +67,9 @@
|
||||
[Fact]
|
||||
public void SecondPartInvalidFormatFalse()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes("85 9t");
|
||||
|
||||
var input = new RandomAccessBuffer(bytes);
|
||||
|
||||
var result = TableSubsectionDefinition.TryRead(log, input, out var _);
|
||||
var input = StringBytesTestConverter.Convert("85 9t", false);
|
||||
|
||||
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var _);
|
||||
|
||||
Assert.False(result);
|
||||
}
|
||||
@@ -87,11 +77,9 @@
|
||||
[Fact]
|
||||
public void ValidTrue()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes("12 32");
|
||||
var input = StringBytesTestConverter.Convert("12 32", false);
|
||||
|
||||
var input = new RandomAccessBuffer(bytes);
|
||||
|
||||
var result = TableSubsectionDefinition.TryRead(log, input, out var definition);
|
||||
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var definition);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
@@ -102,11 +90,9 @@
|
||||
[Fact]
|
||||
public void ValidWithLongTrue()
|
||||
{
|
||||
var bytes = OtherEncodings.StringAsLatin1Bytes("214748364700 6");
|
||||
var input = StringBytesTestConverter.Convert("214748364700 6", false);
|
||||
|
||||
var input = new RandomAccessBuffer(bytes);
|
||||
|
||||
var result = TableSubsectionDefinition.TryRead(log, input, out var definition);
|
||||
var result = TableSubsectionDefinition.TryRead(log, input.Bytes, out var definition);
|
||||
|
||||
Assert.True(result);
|
||||
|
||||
|
@@ -1,120 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Tests
|
||||
{
|
||||
using System;
|
||||
using IO;
|
||||
|
||||
internal class ThrowingReader : IRandomAccessRead
|
||||
{
|
||||
private readonly IRandomAccessRead reader;
|
||||
|
||||
public bool Throw { get; set; }
|
||||
|
||||
public ThrowingReader(IRandomAccessRead reader)
|
||||
{
|
||||
this.reader = reader;
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
|
||||
reader.Dispose();
|
||||
}
|
||||
|
||||
public int Read()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.Read();
|
||||
}
|
||||
|
||||
public int Read(byte[] b)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.Read(b);
|
||||
}
|
||||
|
||||
public int Read(byte[] b, int offset, int length)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.Read(b, offset, length);
|
||||
}
|
||||
|
||||
public long GetPosition()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.GetPosition();
|
||||
}
|
||||
|
||||
public void Seek(long position)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
reader.Seek(position);
|
||||
}
|
||||
|
||||
public long Length()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.Length();
|
||||
}
|
||||
|
||||
public bool IsClosed()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.IsClosed();
|
||||
}
|
||||
|
||||
public int Peek()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.Peek();
|
||||
}
|
||||
|
||||
public void Rewind(int bytes)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
reader.Rewind(bytes);
|
||||
}
|
||||
|
||||
public byte[] ReadFully(int length)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.ReadFully(length);
|
||||
}
|
||||
|
||||
public bool IsEof()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.IsEof();
|
||||
}
|
||||
|
||||
public int Available()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
return reader.Available();
|
||||
}
|
||||
|
||||
public void ReturnToBeginning()
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
reader.ReturnToBeginning();
|
||||
}
|
||||
|
||||
public void Unread(int b)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
reader.Unread(b);
|
||||
}
|
||||
|
||||
public void Unread(byte[] bytes)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
reader.Unread(bytes);
|
||||
}
|
||||
|
||||
public void Unread(byte[] bytes, int start, int length)
|
||||
{
|
||||
if (Throw) throw new InvalidOperationException();
|
||||
reader.Unread(bytes, start, length);
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,11 +1,10 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using IO;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal interface IPageFactory
|
||||
{
|
||||
Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader,
|
||||
Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
|
||||
bool isLenientParsing);
|
||||
|
||||
void LoadResources(DictionaryToken dictionary, bool isLenientParsing);
|
||||
|
@@ -2,7 +2,6 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parser.Parts;
|
||||
using Tokenization.Scanner;
|
||||
@@ -14,7 +13,6 @@
|
||||
private readonly ILog log;
|
||||
private readonly Catalog catalog;
|
||||
private readonly IPageFactory pageFactory;
|
||||
private readonly IRandomAccessRead reader;
|
||||
private readonly bool isLenientParsing;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly DictionaryToken rootPageDictionary;
|
||||
@@ -22,8 +20,7 @@
|
||||
|
||||
public int Count { get; }
|
||||
|
||||
internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory,
|
||||
IRandomAccessRead reader, bool isLenientParsing, IPdfTokenScanner pdfScanner)
|
||||
internal Pages(ILog log, Catalog catalog, IPageFactory pageFactory, bool isLenientParsing, IPdfTokenScanner pdfScanner)
|
||||
{
|
||||
if (catalog == null)
|
||||
{
|
||||
@@ -37,7 +34,6 @@
|
||||
this.log = log;
|
||||
this.catalog = catalog;
|
||||
this.pageFactory = pageFactory;
|
||||
this.reader = reader;
|
||||
this.isLenientParsing = isLenientParsing;
|
||||
this.pdfScanner = pdfScanner;
|
||||
}
|
||||
@@ -47,7 +43,7 @@
|
||||
if (locatedPages.TryGetValue(pageNumber, out DictionaryToken targetPageDictionary))
|
||||
{
|
||||
// TODO: cache the page
|
||||
return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader,
|
||||
return pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(),
|
||||
isLenientParsing);
|
||||
}
|
||||
|
||||
@@ -61,7 +57,7 @@
|
||||
throw new ArgumentOutOfRangeException("Could not find the page with number: " + pageNumber);
|
||||
}
|
||||
|
||||
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);
|
||||
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), isLenientParsing);
|
||||
|
||||
locatedPages[pageNumber] = targetPageDictionary;
|
||||
|
||||
|
@@ -44,7 +44,7 @@
|
||||
|
||||
var code = CMap.ReadCode(bytes);
|
||||
|
||||
codeLength = bytes.CurrentOffset - current;
|
||||
codeLength = (int)(bytes.CurrentOffset - current);
|
||||
|
||||
return code;
|
||||
}
|
||||
|
@@ -1,10 +1,12 @@
|
||||
namespace UglyToad.PdfPig.Fonts.Parser.Handlers
|
||||
{
|
||||
using System;
|
||||
using Cmap;
|
||||
using Encodings;
|
||||
using Exceptions;
|
||||
using Filters;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
using PdfPig.Parser.Parts;
|
||||
using Simple;
|
||||
@@ -15,6 +17,7 @@
|
||||
|
||||
internal class TrueTypeFontHandler : IFontHandler
|
||||
{
|
||||
private readonly ILog log;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly CMapCache cMapCache;
|
||||
private readonly FontDescriptorFactory fontDescriptorFactory;
|
||||
@@ -22,12 +25,13 @@
|
||||
private readonly IEncodingReader encodingReader;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
|
||||
public TrueTypeFontHandler(IPdfTokenScanner pdfScanner, IFilterProvider filterProvider,
|
||||
public TrueTypeFontHandler(ILog log, IPdfTokenScanner pdfScanner, IFilterProvider filterProvider,
|
||||
CMapCache cMapCache,
|
||||
FontDescriptorFactory fontDescriptorFactory,
|
||||
TrueTypeFontParser trueTypeFontParser,
|
||||
IEncodingReader encodingReader)
|
||||
{
|
||||
this.log = log;
|
||||
this.filterProvider = filterProvider;
|
||||
this.cMapCache = cMapCache;
|
||||
this.fontDescriptorFactory = fontDescriptorFactory;
|
||||
@@ -47,7 +51,7 @@
|
||||
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfScanner, fontDescriptorFactory, dictionary, isLenientParsing);
|
||||
|
||||
// TODO: use the parsed font fully.
|
||||
//var font = ParseTrueTypeFont(descriptor, reader, isLenientParsing);
|
||||
var font = ParseTrueTypeFont(descriptor);
|
||||
|
||||
var name = FontDictionaryAccessHelper.GetName(pdfScanner, dictionary, descriptor, isLenientParsing);
|
||||
|
||||
@@ -69,8 +73,7 @@
|
||||
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
|
||||
}
|
||||
|
||||
private TrueTypeFont ParseTrueTypeFont(FontDescriptor descriptor, IRandomAccessRead reader,
|
||||
bool isLenientParsing)
|
||||
private TrueTypeFont ParseTrueTypeFont(FontDescriptor descriptor)
|
||||
{
|
||||
if (descriptor?.FontFile == null)
|
||||
{
|
||||
@@ -83,18 +86,23 @@
|
||||
$"Expected a TrueType font in the TrueType font descriptor, instead it was {descriptor.FontFile.FileType}.");
|
||||
}
|
||||
|
||||
//var fontFileStream = pdfObjectParser.Parse(descriptor.FontFile.ObjectKey, reader, isLenientParsing) as PdfRawStream;
|
||||
|
||||
//if (fontFileStream == null)
|
||||
try
|
||||
{
|
||||
|
||||
var fontFileStream = DirectObjectFinder.Get<StreamToken>(descriptor.FontFile.ObjectKey, pdfScanner);
|
||||
|
||||
var fontFile = fontFileStream.Decode(filterProvider);
|
||||
|
||||
var font = trueTypeFontParser.Parse(new TrueTypeDataBytes(new ByteArrayInputBytes(fontFile)));
|
||||
|
||||
return font;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
log.Error("Could not parse the TrueType font.", ex);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
//var fontFile = fontFileStream.Decode(filterProvider);
|
||||
|
||||
//var font = trueTypeFontParser.Parse(new TrueTypeDataBytes(new ByteArrayInputBytes(fontFile)));
|
||||
|
||||
//return font;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -12,8 +12,8 @@
|
||||
currentOffset = -1;
|
||||
}
|
||||
|
||||
private int currentOffset;
|
||||
public int CurrentOffset => currentOffset + 1;
|
||||
private long currentOffset;
|
||||
public long CurrentOffset => currentOffset + 1;
|
||||
|
||||
public bool MoveNext()
|
||||
{
|
||||
@@ -23,7 +23,7 @@
|
||||
}
|
||||
|
||||
currentOffset++;
|
||||
CurrentByte = bytes[currentOffset];
|
||||
CurrentByte = bytes[(int)currentOffset];
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
return null;
|
||||
}
|
||||
|
||||
return bytes[currentOffset + 1];
|
||||
return bytes[(int)currentOffset + 1];
|
||||
}
|
||||
|
||||
public bool IsAtEnd()
|
||||
@@ -49,7 +49,7 @@
|
||||
public void Seek(long position)
|
||||
{
|
||||
currentOffset = (int)position - 1;
|
||||
CurrentByte = currentOffset < 0 ? (byte)0 : bytes[currentOffset];
|
||||
CurrentByte = currentOffset < 0 ? (byte)0 : bytes[(int)currentOffset];
|
||||
}
|
||||
}
|
||||
}
|
@@ -2,7 +2,7 @@
|
||||
{
|
||||
internal interface IInputBytes
|
||||
{
|
||||
int CurrentOffset { get; }
|
||||
long CurrentOffset { get; }
|
||||
|
||||
bool MoveNext();
|
||||
|
||||
|
@@ -1,7 +0,0 @@
|
||||
namespace UglyToad.PdfPig.IO
|
||||
{
|
||||
internal interface RandomAccess : IRandomAccessRead, RandomAccessWrite
|
||||
{
|
||||
// super interface for both read and write
|
||||
}
|
||||
}
|
@@ -4,7 +4,7 @@
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
|
||||
internal class RandomAccessBuffer : RandomAccess
|
||||
internal class RandomAccessBuffer : IDisposable
|
||||
{
|
||||
// default chunk size is 1kb
|
||||
private static readonly int DefaultChunkSize = 1024;
|
||||
|
@@ -1,123 +0,0 @@
|
||||
namespace UglyToad.PdfPig.IO
|
||||
{
|
||||
using System;
|
||||
|
||||
internal interface IRandomAccessRead : IDisposable
|
||||
{
|
||||
/**
|
||||
* Read a single byte of data.
|
||||
*
|
||||
* @return The byte of data that is being read.
|
||||
*
|
||||
* @throws IOException If there is an error while reading the data.
|
||||
*/
|
||||
int Read();
|
||||
|
||||
/**
|
||||
* Read a buffer of data.
|
||||
*
|
||||
* @param b The buffer to write the data to.
|
||||
* @return The number of bytes that were actually read.
|
||||
* @throws IOException If there was an error while reading the data.
|
||||
*/
|
||||
int Read(byte[]
|
||||
b);
|
||||
|
||||
/**
|
||||
* Read a buffer of data.
|
||||
*
|
||||
* @param b The buffer to write the data to.
|
||||
* @param offset Offset into the buffer to start writing.
|
||||
* @param length The amount of data to attempt to read.
|
||||
* @return The number of bytes that were actually read.
|
||||
* @throws IOException If there was an error while reading the data.
|
||||
*/
|
||||
int Read(byte[]
|
||||
b, int offset, int length);
|
||||
|
||||
/**
|
||||
* Returns offset of next byte to be returned by a read method.
|
||||
*
|
||||
* @return offset of next byte which will be returned with next {@link #read()}
|
||||
* (if no more bytes are left it returns a value >= length of source)
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
long GetPosition();
|
||||
|
||||
/**
|
||||
* Seek to a position in the data.
|
||||
*
|
||||
* @param position The position to seek to.
|
||||
* @throws IOException If there is an error while seeking.
|
||||
*/
|
||||
|
||||
void Seek(long position);
|
||||
|
||||
/**
|
||||
* The total number of bytes that are available.
|
||||
*
|
||||
* @return The number of bytes available.
|
||||
*
|
||||
* @throws IOException If there is an IO error while determining the
|
||||
* length of the data stream.
|
||||
*/
|
||||
long Length();
|
||||
|
||||
/**
|
||||
* Returns true if this stream has been closed.
|
||||
*/
|
||||
bool IsClosed();
|
||||
|
||||
/**
|
||||
* This will peek at the next byte.
|
||||
*
|
||||
* @return The next byte on the stream, leaving it as available to read.
|
||||
*
|
||||
* @throws IOException If there is an error reading the next byte.
|
||||
*/
|
||||
int Peek();
|
||||
|
||||
/**
|
||||
* Seek backwards the given number of bytes.
|
||||
*
|
||||
* @param bytes the number of bytes to be seeked backwards
|
||||
* @throws IOException If there is an error while seeking
|
||||
*/
|
||||
void Rewind(int bytes);
|
||||
|
||||
/**
|
||||
* Reads a given number of bytes.
|
||||
* @param length the number of bytes to be read
|
||||
* @return a byte array containing the bytes just read
|
||||
* @throws IOException if an I/O error occurs while reading data
|
||||
*/
|
||||
byte[]
|
||||
ReadFully(int length);
|
||||
|
||||
/**
|
||||
* A simple test to see if we are at the end of the data.
|
||||
*
|
||||
* @return true if we are at the end of the data.
|
||||
*
|
||||
* @throws IOException If there is an error reading the next byte.
|
||||
*/
|
||||
bool IsEof();
|
||||
|
||||
/**
|
||||
* Returns an estimate of the number of bytes that can be read.
|
||||
*
|
||||
* @return the number of bytes that can be read
|
||||
* @throws IOException if this random access has been closed
|
||||
*/
|
||||
int Available();
|
||||
|
||||
void ReturnToBeginning();
|
||||
|
||||
void Unread(int b);
|
||||
|
||||
void Unread(byte[] bytes);
|
||||
|
||||
void Unread(byte[] bytes, int start, int length);
|
||||
}
|
||||
}
|
@@ -1,40 +0,0 @@
|
||||
namespace UglyToad.PdfPig.IO
|
||||
{
|
||||
using System;
|
||||
|
||||
internal interface RandomAccessWrite : IDisposable
|
||||
{
|
||||
/**
|
||||
* Write a byte to the stream.
|
||||
*
|
||||
* @param b The byte to write.
|
||||
* @throws IOException If there is an IO error while writing.
|
||||
*/
|
||||
void write(int b);
|
||||
|
||||
/**
|
||||
* Write a buffer of data to the stream.
|
||||
*
|
||||
* @param b The buffer to get the data from.
|
||||
* @throws IOException If there is an error while writing the data.
|
||||
*/
|
||||
void write(byte[]
|
||||
b);
|
||||
|
||||
/**
|
||||
* Write a buffer of data to the stream.
|
||||
*
|
||||
* @param b The buffer to get the data from.
|
||||
* @param offset An offset into the buffer to get the data from.
|
||||
* @param length The length of data to write.
|
||||
* @throws IOException If there is an error while writing the data.
|
||||
*/
|
||||
void write(byte[]
|
||||
b, int offset, int length);
|
||||
|
||||
/**
|
||||
* Clears all data of the buffer.
|
||||
*/
|
||||
void clear();
|
||||
}
|
||||
}
|
@@ -3,21 +3,13 @@
|
||||
using System;
|
||||
using Content;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Parts;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class CatalogFactory
|
||||
{
|
||||
private readonly IPdfTokenScanner scanner;
|
||||
|
||||
public CatalogFactory(IPdfTokenScanner scanner)
|
||||
{
|
||||
this.scanner = scanner;
|
||||
}
|
||||
|
||||
public Catalog Create(DictionaryToken dictionary, IRandomAccessRead reader, bool isLenientParsing)
|
||||
public Catalog Create(IPdfTokenScanner scanner, DictionaryToken dictionary)
|
||||
{
|
||||
if (dictionary == null)
|
||||
{
|
||||
|
@@ -12,9 +12,9 @@
|
||||
this.offsetValidator = offsetValidator;
|
||||
}
|
||||
|
||||
public long Validate(long crossReferenceOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
|
||||
public long Validate(long crossReferenceOffset, ISeekableTokenScanner scanner, IInputBytes bytes, bool isLenientParsing)
|
||||
{
|
||||
long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, scanner, reader, isLenientParsing);
|
||||
long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, scanner, bytes, isLenientParsing);
|
||||
if (fixedOffset > -1)
|
||||
{
|
||||
crossReferenceOffset = fixedOffset;
|
||||
|
@@ -19,6 +19,7 @@
|
||||
private readonly XrefCosOffsetChecker xrefCosChecker;
|
||||
|
||||
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
|
||||
XrefCosOffsetChecker xrefCosChecker,
|
||||
CrossReferenceStreamParser crossReferenceStreamParser,
|
||||
CrossReferenceTableParser crossReferenceTableParser)
|
||||
{
|
||||
@@ -26,13 +27,12 @@
|
||||
this.offsetValidator = offsetValidator;
|
||||
this.crossReferenceStreamParser = crossReferenceStreamParser;
|
||||
this.crossReferenceTableParser = crossReferenceTableParser;
|
||||
|
||||
xrefCosChecker = new XrefCosOffsetChecker();
|
||||
this.xrefCosChecker = xrefCosChecker;
|
||||
}
|
||||
|
||||
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
|
||||
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long xrefLocation, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
|
||||
{
|
||||
long fixedOffset = offsetValidator.CheckXRefOffset(xrefLocation, tokenScanner, reader, isLenientParsing);
|
||||
long fixedOffset = offsetValidator.CheckXRefOffset(xrefLocation, tokenScanner, bytes, isLenientParsing);
|
||||
if (fixedOffset > -1)
|
||||
{
|
||||
xrefLocation = fixedOffset;
|
||||
@@ -76,7 +76,7 @@
|
||||
int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;
|
||||
|
||||
// check the xref stream reference
|
||||
fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, reader, isLenientParsing);
|
||||
fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != streamOffset)
|
||||
{
|
||||
log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");
|
||||
@@ -143,7 +143,7 @@
|
||||
if (previousCrossReferenceLocation > 0)
|
||||
{
|
||||
// check the xref table reference
|
||||
fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, reader, isLenientParsing);
|
||||
fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, bytes, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
|
||||
{
|
||||
previousCrossReferenceLocation = fixedOffset;
|
||||
@@ -170,7 +170,7 @@
|
||||
var resolved = table.Build(xrefLocation, log);
|
||||
|
||||
// check the offsets of all referenced objects
|
||||
xrefCosChecker.checkXrefOffsets(reader, resolved, isLenientParsing);
|
||||
xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing);
|
||||
|
||||
return resolved;
|
||||
}
|
||||
|
@@ -6,174 +6,122 @@
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parts;
|
||||
using Util;
|
||||
|
||||
internal class XrefCosOffsetChecker
|
||||
{
|
||||
private static readonly long MINIMUM_SEARCH_OFFSET = 6;
|
||||
|
||||
private Dictionary<IndirectReference, long> bfSearchCOSObjectKeyOffsets;
|
||||
|
||||
private bool validateXrefOffsets(IRandomAccessRead reader, Dictionary<IndirectReference, long> xrefOffset)
|
||||
private readonly ILog log;
|
||||
private readonly BruteForceSearcher bruteForceSearcher;
|
||||
|
||||
private IReadOnlyDictionary<IndirectReference, long> objectKeyOffsets;
|
||||
|
||||
public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher)
|
||||
{
|
||||
this.log = log;
|
||||
this.bruteForceSearcher = bruteForceSearcher;
|
||||
}
|
||||
|
||||
private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary<IndirectReference, long> xrefOffset)
|
||||
{
|
||||
if (xrefOffset == null)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
foreach (var objectEntry in xrefOffset)
|
||||
{
|
||||
IndirectReference objectKey = objectEntry.Key;
|
||||
long objectOffset = objectEntry.Value;
|
||||
|
||||
// a negative offset number represents a object number itself
|
||||
// see type 2 entry in xref stream
|
||||
if (objectOffset >= 0
|
||||
&& !checkObjectKeys(reader, objectKey, objectOffset))
|
||||
if (objectOffset >= 0 && !CheckObjectKeys(bytes, objectKey, objectOffset))
|
||||
{
|
||||
//LOG.debug("Stop checking xref offsets as at least one (" + objectKey
|
||||
// + ") couldn't be dereferenced");
|
||||
log.Debug($"Stop checking xref offsets as at least one ({objectKey}) couldn't be dereferenced");
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private bool checkObjectKeys(IRandomAccessRead source, IndirectReference objectKey, long offset)
|
||||
private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
|
||||
{
|
||||
// there can't be any object at the very beginning of a pdf
|
||||
if (offset < MINIMUM_SEARCH_OFFSET)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
long objectNr = objectKey.ObjectNumber;
|
||||
long objectGen = objectKey.Generation;
|
||||
long originOffset = source.GetPosition();
|
||||
string objectString = ObjectHelper.createObjectString(objectNr, objectGen);
|
||||
long originOffset = bytes.CurrentOffset;
|
||||
|
||||
string objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);
|
||||
|
||||
try
|
||||
{
|
||||
source.Seek(offset);
|
||||
if (ReadHelper.IsString(source, OtherEncodings.StringAsLatin1Bytes(objectString)))
|
||||
bytes.Seek(offset);
|
||||
|
||||
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
|
||||
{
|
||||
bytes.MoveNext();
|
||||
}
|
||||
|
||||
if (ReadHelper.IsString(bytes, objectString))
|
||||
{
|
||||
// everything is ok, return origin object key
|
||||
source.Seek(originOffset);
|
||||
bytes.Seek(originOffset);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
catch (InvalidOperationException)
|
||||
catch (Exception)
|
||||
{
|
||||
// Swallow the exception, obviously there isn't any valid object number
|
||||
}
|
||||
finally
|
||||
{
|
||||
source.Seek(originOffset);
|
||||
bytes.Seek(originOffset);
|
||||
}
|
||||
|
||||
// no valid object number found
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private Dictionary<IndirectReference, long> getBFCosObjectOffsets(IRandomAccessRead reader)
|
||||
private IReadOnlyDictionary<IndirectReference, long> getBFCosObjectOffsets()
|
||||
{
|
||||
if (bfSearchCOSObjectKeyOffsets == null)
|
||||
if (objectKeyOffsets == null)
|
||||
{
|
||||
bfSearchForObjects(reader);
|
||||
var offsets = bruteForceSearcher.GetObjectLocations();
|
||||
|
||||
objectKeyOffsets = offsets;
|
||||
}
|
||||
return bfSearchCOSObjectKeyOffsets;
|
||||
|
||||
return objectKeyOffsets;
|
||||
}
|
||||
|
||||
private void bfSearchForObjects(IRandomAccessRead source)
|
||||
{
|
||||
bfSearchForLastEOFMarker(source);
|
||||
bfSearchCOSObjectKeyOffsets = new Dictionary<IndirectReference, long>();
|
||||
long originOffset = source.GetPosition();
|
||||
long currentOffset = MINIMUM_SEARCH_OFFSET;
|
||||
long lastObjectId = long.MinValue;
|
||||
int lastGenID = int.MinValue;
|
||||
long lastObjOffset = long.MinValue;
|
||||
char[] objString = " obj".ToCharArray();
|
||||
char[] endobjString = "endobj".ToCharArray();
|
||||
bool endobjFound = false;
|
||||
do
|
||||
{
|
||||
source.Seek(currentOffset);
|
||||
if (ReadHelper.IsString(source, "obj"))
|
||||
{
|
||||
long tempOffset = currentOffset - 1;
|
||||
source.Seek(tempOffset);
|
||||
int genID = source.Peek();
|
||||
// is the next char a digit?
|
||||
if (ReadHelper.IsDigit(genID))
|
||||
{
|
||||
genID -= 48;
|
||||
tempOffset--;
|
||||
source.Seek(tempOffset);
|
||||
if (ReadHelper.IsSpace(source))
|
||||
{
|
||||
while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsSpace(source))
|
||||
{
|
||||
source.Seek(--tempOffset);
|
||||
}
|
||||
bool objectIDFound = false;
|
||||
while (tempOffset > MINIMUM_SEARCH_OFFSET && ReadHelper.IsDigit(source))
|
||||
{
|
||||
source.Seek(--tempOffset);
|
||||
objectIDFound = true;
|
||||
}
|
||||
if (objectIDFound)
|
||||
{
|
||||
source.Read();
|
||||
long objectId = ObjectHelper.ReadObjectNumber(source);
|
||||
if (lastObjOffset > 0)
|
||||
{
|
||||
// add the former object ID only if there was a subsequent object ID
|
||||
bfSearchCOSObjectKeyOffsets[new IndirectReference(lastObjectId, lastGenID)] = lastObjOffset;
|
||||
}
|
||||
lastObjectId = objectId;
|
||||
lastGenID = genID;
|
||||
lastObjOffset = tempOffset + 1;
|
||||
currentOffset += objString.Length - 1;
|
||||
endobjFound = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (ReadHelper.IsString(source, "endobj"))
|
||||
{
|
||||
endobjFound = true;
|
||||
currentOffset += endobjString.Length - 1;
|
||||
}
|
||||
currentOffset++;
|
||||
} while (currentOffset < lastEOFMarker && !source.IsEof());
|
||||
if ((lastEOFMarker < long.MaxValue || endobjFound) && lastObjOffset > 0)
|
||||
{
|
||||
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
|
||||
// the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
|
||||
bfSearchCOSObjectKeyOffsets[new IndirectReference(lastObjectId, lastGenID)] = lastObjOffset;
|
||||
}
|
||||
// reestablish origin position
|
||||
|
||||
source.Seek(originOffset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check the XRef table by dereferencing all objects and fixing the offset if necessary.
|
||||
*
|
||||
* @throws InvalidOperationException if something went wrong.
|
||||
*/
|
||||
public void checkXrefOffsets(IRandomAccessRead reader, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
|
||||
|
||||
/// <summary>
|
||||
/// Check that the offsets in the cross reference are correct.
|
||||
/// </summary>
|
||||
public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
|
||||
{
|
||||
// repair mode isn't available in non-lenient mode
|
||||
if (!isLenientParsing)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
Dictionary<IndirectReference, long> xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value);
|
||||
if (validateXrefOffsets(reader, xrefOffset))
|
||||
if (ValidateXrefOffsets(bytes, xrefOffset))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
Dictionary<IndirectReference, long> bfCOSObjectKeyOffsets = getBFCosObjectOffsets(reader);
|
||||
IReadOnlyDictionary<IndirectReference, long> bfCOSObjectKeyOffsets = getBFCosObjectOffsets();
|
||||
if (bfCOSObjectKeyOffsets.Count > 0)
|
||||
{
|
||||
List<IndirectReference> objStreams = new List<IndirectReference>();
|
||||
@@ -225,49 +173,64 @@
|
||||
|
||||
foreach (var item in bfCOSObjectKeyOffsets)
|
||||
{
|
||||
xrefOffset.Add(item.Key, item.Value);
|
||||
xrefOffset[item.Key] = item.Value;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private long? lastEOFMarker = null;
|
||||
private void bfSearchForLastEOFMarker(IRandomAccessRead source)
|
||||
private long? lastEndOfFileMarker;
|
||||
|
||||
private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
|
||||
{
|
||||
if (lastEOFMarker == null)
|
||||
if (lastEndOfFileMarker != null)
|
||||
{
|
||||
long originOffset = source.GetPosition();
|
||||
source.Seek(MINIMUM_SEARCH_OFFSET);
|
||||
while (!source.IsEof())
|
||||
return;
|
||||
}
|
||||
|
||||
long startOffset = source.CurrentOffset;
|
||||
|
||||
source.Seek(MINIMUM_SEARCH_OFFSET);
|
||||
|
||||
while (!source.IsAtEnd())
|
||||
{
|
||||
// search for EOF marker
|
||||
if (ReadHelper.IsString(source, "%%EOF"))
|
||||
{
|
||||
// search for EOF marker
|
||||
if (ReadHelper.IsString(source, "%%EOF"))
|
||||
long tempMarker = source.CurrentOffset;
|
||||
|
||||
if (tempMarker >= source.Length)
|
||||
{
|
||||
long tempMarker = source.GetPosition();
|
||||
source.Seek(tempMarker + 5);
|
||||
try
|
||||
{
|
||||
// check if the following data is some valid pdf content
|
||||
// which most likely indicates that the pdf is linearized,
|
||||
// updated or just cut off somewhere in the middle
|
||||
ReadHelper.SkipSpaces(source);
|
||||
ObjectHelper.ReadObjectNumber(source);
|
||||
ObjectHelper.ReadGenerationNumber(source);
|
||||
}
|
||||
catch (InvalidOperationException)
|
||||
{
|
||||
// save the EOF marker as the following data is most likely some garbage
|
||||
lastEOFMarker = tempMarker;
|
||||
}
|
||||
lastEndOfFileMarker = tempMarker;
|
||||
break;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
source.Seek(tempMarker + 5);
|
||||
// check if the following data is some valid pdf content
|
||||
// which most likely indicates that the pdf is linearized,
|
||||
// updated or just cut off somewhere in the middle
|
||||
ReadHelper.SkipSpaces(source);
|
||||
ObjectHelper.ReadObjectNumber(source);
|
||||
ObjectHelper.ReadGenerationNumber(source);
|
||||
}
|
||||
catch (Exception)
|
||||
{
|
||||
// save the EOF marker as the following data is most likely some garbage
|
||||
lastEndOfFileMarker = tempMarker;
|
||||
}
|
||||
source.Read();
|
||||
}
|
||||
source.Seek(originOffset);
|
||||
// no EOF marker found
|
||||
if (lastEOFMarker == null)
|
||||
{
|
||||
lastEOFMarker = long.MaxValue;
|
||||
}
|
||||
|
||||
source.MoveNext();
|
||||
}
|
||||
|
||||
source.Seek(startOffset);
|
||||
|
||||
// no EOF marker found
|
||||
if (lastEndOfFileMarker == null)
|
||||
{
|
||||
lastEndOfFileMarker = long.MaxValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -22,7 +22,7 @@
|
||||
this.log = log;
|
||||
}
|
||||
|
||||
public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
|
||||
public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IInputBytes inputBytes, bool isLenientParsing)
|
||||
{
|
||||
// repair mode isn't available in non-lenient mode
|
||||
if (!isLenientParsing)
|
||||
@@ -30,14 +30,15 @@
|
||||
return startXRefOffset;
|
||||
}
|
||||
|
||||
reader.Seek(startXRefOffset);
|
||||
scanner.Seek(startXRefOffset);
|
||||
|
||||
ReadHelper.SkipSpaces(reader);
|
||||
scanner.MoveNext();
|
||||
|
||||
if (reader.Peek() == 'x' && ReadHelper.IsString(reader, "xref"))
|
||||
if (ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref))
|
||||
{
|
||||
return startXRefOffset;
|
||||
}
|
||||
|
||||
if (startXRefOffset > 0)
|
||||
{
|
||||
if (CheckXRefStreamOffset(startXRefOffset, scanner, true))
|
||||
@@ -45,14 +46,14 @@
|
||||
return startXRefOffset;
|
||||
}
|
||||
|
||||
return CalculateXRefFixedOffset(startXRefOffset, scanner, reader);
|
||||
return CalculateXRefFixedOffset(startXRefOffset, scanner, inputBytes);
|
||||
}
|
||||
|
||||
// can't find a valid offset
|
||||
return -1;
|
||||
}
|
||||
|
||||
private long CalculateXRefFixedOffset(long objectOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
|
||||
private long CalculateXRefFixedOffset(long objectOffset, ISeekableTokenScanner scanner, IInputBytes inputBytes)
|
||||
{
|
||||
if (objectOffset < 0)
|
||||
{
|
||||
@@ -61,10 +62,12 @@
|
||||
}
|
||||
|
||||
// start a brute force search for all xref tables and try to find the offset we are looking for
|
||||
long newOffset = BfSearchForXRef(objectOffset, scanner, reader);
|
||||
var newOffset = BruteForceSearchForXref(objectOffset, scanner, inputBytes);
|
||||
|
||||
if (newOffset > -1)
|
||||
{
|
||||
log.Debug($"Fixed reference for xref table/stream {objectOffset} -> {newOffset}");
|
||||
|
||||
return newOffset;
|
||||
}
|
||||
|
||||
@@ -73,90 +76,18 @@
|
||||
return 0;
|
||||
}
|
||||
|
||||
private void BfSearchForXRefStreams(IRandomAccessRead reader)
|
||||
{
|
||||
if (bfSearchXRefStreamsOffsets != null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// a pdf may contain more than one /XRef entry
|
||||
bfSearchXRefStreamsOffsets = new List<long>();
|
||||
long originOffset = reader.GetPosition();
|
||||
reader.Seek(MinimumSearchOffset);
|
||||
// search for XRef streams
|
||||
var objString = " obj";
|
||||
while (!reader.IsEof())
|
||||
{
|
||||
if (ReadHelper.IsString(reader, "xref"))
|
||||
{
|
||||
// search backwards for the beginning of the stream
|
||||
long newOffset = -1;
|
||||
long xrefOffset = reader.GetPosition();
|
||||
bool objFound = false;
|
||||
for (int i = 1; i < 40 && !objFound; i++)
|
||||
{
|
||||
long currentOffset = xrefOffset - (i * 10);
|
||||
if (currentOffset > 0)
|
||||
{
|
||||
reader.Seek(currentOffset);
|
||||
for (int j = 0; j < 10; j++)
|
||||
{
|
||||
if (ReadHelper.IsString(reader, objString))
|
||||
{
|
||||
long tempOffset = currentOffset - 1;
|
||||
reader.Seek(tempOffset);
|
||||
int genId = reader.Peek();
|
||||
// is the next char a digit?
|
||||
if (ReadHelper.IsDigit(genId))
|
||||
{
|
||||
tempOffset--;
|
||||
reader.Seek(tempOffset);
|
||||
if (ReadHelper.IsSpace(reader))
|
||||
{
|
||||
int length = 0;
|
||||
reader.Seek(--tempOffset);
|
||||
while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
|
||||
{
|
||||
reader.Seek(--tempOffset);
|
||||
length++;
|
||||
}
|
||||
if (length > 0)
|
||||
{
|
||||
reader.Read();
|
||||
newOffset = reader.GetPosition();
|
||||
}
|
||||
}
|
||||
}
|
||||
objFound = true;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
currentOffset++;
|
||||
reader.Read();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (newOffset > -1)
|
||||
{
|
||||
bfSearchXRefStreamsOffsets.Add(newOffset);
|
||||
}
|
||||
reader.Seek(xrefOffset + 5);
|
||||
}
|
||||
reader.Read();
|
||||
}
|
||||
reader.Seek(originOffset);
|
||||
}
|
||||
|
||||
private long BfSearchForXRef(long xrefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
|
||||
private long BruteForceSearchForXref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader)
|
||||
{
|
||||
long newOffset = -1;
|
||||
long newOffsetTable = -1;
|
||||
long newOffsetStream = -1;
|
||||
BfSearchForXRefTables(reader);
|
||||
|
||||
BruteForceSearchForTables(reader);
|
||||
|
||||
BfSearchForXRefStreams(reader);
|
||||
|
||||
if (bfSearchXRefTablesOffsets != null)
|
||||
{
|
||||
// TODO to be optimized, this won't work in every case
|
||||
@@ -196,35 +127,143 @@
|
||||
return newOffset;
|
||||
}
|
||||
|
||||
private void BfSearchForXRefTables(IRandomAccessRead reader)
|
||||
private void BruteForceSearchForTables(IInputBytes bytes)
|
||||
{
|
||||
if (bfSearchXRefTablesOffsets == null)
|
||||
if (bfSearchXRefTablesOffsets != null)
|
||||
{
|
||||
// a pdf may contain more than one xref entry
|
||||
bfSearchXRefTablesOffsets = new List<long>();
|
||||
long originOffset = reader.GetPosition();
|
||||
reader.Seek(MinimumSearchOffset);
|
||||
// search for xref tables
|
||||
while (!reader.IsEof())
|
||||
{
|
||||
if (ReadHelper.IsString(reader, "xref"))
|
||||
{
|
||||
long newOffset = reader.GetPosition();
|
||||
reader.Seek(newOffset - 1);
|
||||
// ensure that we don't read "startxref" instead of "xref"
|
||||
if (ReadHelper.IsWhitespace(reader))
|
||||
{
|
||||
bfSearchXRefTablesOffsets.Add(newOffset);
|
||||
}
|
||||
reader.Seek(newOffset + 4);
|
||||
}
|
||||
reader.Read();
|
||||
}
|
||||
reader.Seek(originOffset);
|
||||
return;
|
||||
}
|
||||
|
||||
// a pdf may contain more than one xref entry
|
||||
bfSearchXRefTablesOffsets = new List<long>();
|
||||
|
||||
var startOffset = bytes.CurrentOffset;
|
||||
|
||||
bytes.Seek(MinimumSearchOffset);
|
||||
|
||||
// search for xref tables
|
||||
while (bytes.MoveNext() && !bytes.IsAtEnd())
|
||||
{
|
||||
if (ReadHelper.IsString(bytes, "xref"))
|
||||
{
|
||||
var newOffset = bytes.CurrentOffset;
|
||||
|
||||
bytes.Seek(newOffset - 1);
|
||||
|
||||
// ensure that we don't read "startxref" instead of "xref"
|
||||
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
|
||||
{
|
||||
bfSearchXRefTablesOffsets.Add(newOffset);
|
||||
}
|
||||
|
||||
bytes.Seek(newOffset + 4);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.Seek(startOffset);
|
||||
}
|
||||
|
||||
private long SearchNearestValue(List<long> values, long offset)
|
||||
private void BfSearchForXRefStreams(IInputBytes bytes)
|
||||
{
|
||||
if (bfSearchXRefStreamsOffsets != null)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// a pdf may contain more than one /XRef entry
|
||||
bfSearchXRefStreamsOffsets = new List<long>();
|
||||
|
||||
var startOffset = bytes.CurrentOffset;
|
||||
|
||||
bytes.Seek(MinimumSearchOffset);
|
||||
|
||||
// search for XRef streams
|
||||
var objString = " obj";
|
||||
|
||||
while (bytes.MoveNext() && !bytes.IsAtEnd())
|
||||
{
|
||||
if (!ReadHelper.IsString(bytes, "xref"))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// search backwards for the beginning of the stream
|
||||
long newOffset = -1;
|
||||
long xrefOffset = bytes.CurrentOffset;
|
||||
|
||||
bool objFound = false;
|
||||
for (var i = 1; i < 40; i++)
|
||||
{
|
||||
if (objFound)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
long currentOffset = xrefOffset - (i * 10);
|
||||
|
||||
if (currentOffset > 0)
|
||||
{
|
||||
bytes.Seek(currentOffset);
|
||||
|
||||
for (int j = 0; j < 10; j++)
|
||||
{
|
||||
if (ReadHelper.IsString(bytes, objString))
|
||||
{
|
||||
long tempOffset = currentOffset - 1;
|
||||
|
||||
bytes.Seek(tempOffset);
|
||||
|
||||
var generationNumber = bytes.Peek();
|
||||
|
||||
// is the next char a digit?
|
||||
if (generationNumber.HasValue && ReadHelper.IsDigit(generationNumber.Value))
|
||||
{
|
||||
tempOffset--;
|
||||
bytes.Seek(tempOffset);
|
||||
|
||||
// is the digit preceded by a space?
|
||||
if (ReadHelper.IsSpace(bytes.CurrentByte))
|
||||
{
|
||||
int length = 0;
|
||||
bytes.Seek(--tempOffset);
|
||||
|
||||
while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(bytes.CurrentByte))
|
||||
{
|
||||
bytes.Seek(--tempOffset);
|
||||
length++;
|
||||
}
|
||||
|
||||
if (length > 0)
|
||||
{
|
||||
bytes.MoveNext();
|
||||
newOffset = bytes.CurrentOffset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
objFound = true;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
currentOffset++;
|
||||
bytes.MoveNext();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (newOffset > -1)
|
||||
{
|
||||
bfSearchXRefStreamsOffsets.Add(newOffset);
|
||||
}
|
||||
|
||||
bytes.Seek(xrefOffset + 5);
|
||||
}
|
||||
|
||||
bytes.Seek(startOffset);
|
||||
}
|
||||
|
||||
private static long SearchNearestValue(List<long> values, long offset)
|
||||
{
|
||||
long newValue = -1;
|
||||
long? currentDifference = null;
|
||||
@@ -255,8 +294,9 @@
|
||||
{
|
||||
return true;
|
||||
}
|
||||
// seek to offset-1
|
||||
scanner.Seek(startXRefOffset - 1);
|
||||
|
||||
scanner.Seek(startXRefOffset);
|
||||
|
||||
if (scanner.TryReadToken(out NumericToken objectNumber))
|
||||
{
|
||||
try
|
||||
@@ -280,7 +320,6 @@
|
||||
if (!scanner.TryReadToken(out DictionaryToken dictionary))
|
||||
{
|
||||
scanner.Seek(startXRefOffset);
|
||||
|
||||
}
|
||||
|
||||
if (dictionary.TryGet(NameToken.Type, out var type) && NameToken.Xref.Equals(type))
|
||||
@@ -293,6 +332,11 @@
|
||||
log.Error("Couldn't read the xref stream object.", ex);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log.Error($"When looking for the cross reference stream object we sought a number but found: {scanner.CurrentToken}.");
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@@ -31,7 +31,7 @@
|
||||
this.pdfScanner = pdfScanner;
|
||||
}
|
||||
|
||||
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, IRandomAccessRead reader,
|
||||
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers,
|
||||
bool isLenientParsing)
|
||||
{
|
||||
if (dictionary == null)
|
||||
|
@@ -1,38 +0,0 @@
|
||||
namespace UglyToad.PdfPig.Parser
|
||||
{
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using Cos;
|
||||
using IO;
|
||||
using Logging;
|
||||
using Util;
|
||||
|
||||
internal class ParsingArguments
|
||||
{
|
||||
public IRandomAccessRead Reader { get; }
|
||||
|
||||
public CrossReferenceTable CrossReferenceTable { get; }
|
||||
|
||||
public ParsingCachingProviders CachingProviders { get; }
|
||||
|
||||
public IContainer Container { get; }
|
||||
|
||||
public bool IsLenientParsing { get; }
|
||||
|
||||
public ILog Log { get; }
|
||||
|
||||
[DebuggerStepThrough]
|
||||
public T Get<T>() => Container.Get<T>();
|
||||
|
||||
public ParsingArguments(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable, ParsingCachingProviders cachingProviders, IContainer container, bool isLenientParsing)
|
||||
{
|
||||
Reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||
CrossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
|
||||
CachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
||||
Container = container ?? throw new ArgumentNullException(nameof(container));
|
||||
IsLenientParsing = isLenientParsing;
|
||||
Log = new NoOpLog();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@@ -2,25 +2,26 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Text;
|
||||
using ContentStream;
|
||||
using IO;
|
||||
using Util;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <summary>
|
||||
/// Store the results of a brute force search for all Cos Objects in the document so we only do it once.
|
||||
/// Store the results of a brute force search for all objects in the document so we only do it once.
|
||||
/// </summary>
|
||||
internal class BruteForceSearcher
|
||||
{
|
||||
private const int MinimumSearchOffset = 6;
|
||||
|
||||
private readonly IRandomAccessRead reader;
|
||||
private readonly IInputBytes bytes;
|
||||
|
||||
private Dictionary<IndirectReference, long> objectLocations;
|
||||
|
||||
public BruteForceSearcher([NotNull] IRandomAccessRead reader)
|
||||
public BruteForceSearcher([NotNull] IInputBytes bytes)
|
||||
{
|
||||
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||
this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
|
||||
}
|
||||
|
||||
[NotNull]
|
||||
@@ -35,70 +36,86 @@
|
||||
|
||||
var results = new Dictionary<IndirectReference, long>();
|
||||
|
||||
var originPosition = reader.GetPosition();
|
||||
var originPosition = bytes.CurrentOffset;
|
||||
|
||||
long currentOffset = MinimumSearchOffset;
|
||||
long lastObjectId = long.MinValue;
|
||||
int lastGenerationId = int.MinValue;
|
||||
long lastObjOffset = long.MinValue;
|
||||
byte[] objString = OtherEncodings.StringAsLatin1Bytes(" obj");
|
||||
byte[] endobjString = OtherEncodings.StringAsLatin1Bytes("endobj");
|
||||
|
||||
bool inObject = false;
|
||||
bool endobjFound = false;
|
||||
do
|
||||
{
|
||||
reader.Seek(currentOffset);
|
||||
if (ReadHelper.IsString(reader, objString))
|
||||
{
|
||||
long tempOffset = currentOffset - 1;
|
||||
reader.Seek(tempOffset);
|
||||
int generationId = reader.Peek();
|
||||
bytes.Seek(currentOffset);
|
||||
|
||||
// is the next char a digit?
|
||||
if (ReadHelper.IsDigit(generationId))
|
||||
if (inObject)
|
||||
{
|
||||
if (ReadHelper.IsString(bytes, "endobj"))
|
||||
{
|
||||
generationId -= 48;
|
||||
tempOffset--;
|
||||
reader.Seek(tempOffset);
|
||||
if (ReadHelper.IsSpace(reader))
|
||||
{
|
||||
while (tempOffset > MinimumSearchOffset && ReadHelper.IsSpace(reader))
|
||||
{
|
||||
reader.Seek(--tempOffset);
|
||||
}
|
||||
|
||||
bool objectIdFound = false;
|
||||
while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
|
||||
{
|
||||
reader.Seek(--tempOffset);
|
||||
objectIdFound = true;
|
||||
}
|
||||
|
||||
if (objectIdFound)
|
||||
{
|
||||
reader.Read();
|
||||
long objectId = ObjectHelper.ReadObjectNumber(reader);
|
||||
if (lastObjOffset > 0)
|
||||
{
|
||||
// add the former object ID only if there was a subsequent object ID
|
||||
results[new IndirectReference(lastObjectId, lastGenerationId)] = lastObjOffset;
|
||||
}
|
||||
lastObjectId = objectId;
|
||||
lastGenerationId = generationId;
|
||||
lastObjOffset = tempOffset + 1;
|
||||
currentOffset += objString.Length - 1;
|
||||
endobjFound = false;
|
||||
}
|
||||
}
|
||||
inObject = false;
|
||||
endobjFound = true;
|
||||
currentOffset += "endobj".Length;
|
||||
}
|
||||
else
|
||||
{
|
||||
currentOffset++;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
else if (ReadHelper.IsString(reader, "endobj"))
|
||||
|
||||
if (!ReadHelper.IsString(bytes, " obj"))
|
||||
{
|
||||
endobjFound = true;
|
||||
currentOffset += endobjString.Length - 1;
|
||||
currentOffset++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Current byte is ' '[obj]
|
||||
var offset = currentOffset - 1;
|
||||
|
||||
bytes.Seek(offset);
|
||||
|
||||
var generationBytes = new StringBuilder();
|
||||
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
|
||||
{
|
||||
generationBytes.Insert(0, (char)bytes.CurrentByte);
|
||||
offset--;
|
||||
bytes.Seek(offset);
|
||||
}
|
||||
|
||||
// We should now be at the space between object and generation number.
|
||||
if (!ReadHelper.IsSpace(bytes.CurrentByte))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
bytes.Seek(--offset);
|
||||
|
||||
var objectNumberBytes = new StringBuilder();
|
||||
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
|
||||
{
|
||||
objectNumberBytes.Insert(0, (char)bytes.CurrentByte);
|
||||
offset--;
|
||||
bytes.Seek(offset);
|
||||
}
|
||||
|
||||
if (!ReadHelper.IsWhitespace(bytes.CurrentByte))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var obj = long.Parse(objectNumberBytes.ToString());
|
||||
var generation = int.Parse(generationBytes.ToString());
|
||||
|
||||
results[new IndirectReference(obj, generation)] = bytes.CurrentOffset + 1;
|
||||
|
||||
inObject = true;
|
||||
endobjFound = false;
|
||||
|
||||
currentOffset++;
|
||||
} while (currentOffset < lastEndOfFile && !reader.IsEof());
|
||||
} while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());
|
||||
|
||||
if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)
|
||||
{
|
||||
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
|
||||
@@ -107,7 +124,7 @@
|
||||
}
|
||||
|
||||
// reestablish origin position
|
||||
reader.Seek(originPosition);
|
||||
bytes.Seek(originPosition);
|
||||
|
||||
objectLocations = results;
|
||||
|
||||
@@ -116,27 +133,29 @@
|
||||
|
||||
private long GetLastEndOfFileMarker()
|
||||
{
|
||||
var originalOffset = reader.GetPosition();
|
||||
var originalOffset = bytes.CurrentOffset;
|
||||
|
||||
var searchTerm = OtherEncodings.StringAsLatin1Bytes("%%EOF");
|
||||
const string searchTerm = "%%EOF";
|
||||
|
||||
var minimumEndOffset = reader.Length() - searchTerm.Length;
|
||||
var minimumEndOffset = bytes.Length - searchTerm.Length;
|
||||
|
||||
reader.Seek(minimumEndOffset);
|
||||
bytes.Seek(minimumEndOffset);
|
||||
|
||||
while (reader.GetPosition() > 0)
|
||||
while (bytes.CurrentOffset > 0)
|
||||
{
|
||||
if (ReadHelper.IsString(reader, searchTerm))
|
||||
if (ReadHelper.IsString(bytes, searchTerm))
|
||||
{
|
||||
var position = reader.GetPosition();
|
||||
reader.Seek(originalOffset);
|
||||
var position = bytes.CurrentOffset;
|
||||
|
||||
bytes.Seek(originalOffset);
|
||||
|
||||
return position;
|
||||
}
|
||||
|
||||
reader.Seek(minimumEndOffset--);
|
||||
bytes.Seek(minimumEndOffset--);
|
||||
}
|
||||
|
||||
reader.Seek(originalOffset);
|
||||
bytes.Seek(originalOffset);
|
||||
return long.MaxValue;
|
||||
}
|
||||
}
|
||||
|
@@ -46,11 +46,11 @@
|
||||
/// <summary>
|
||||
/// Attempts to read the <see cref="TableSubsectionDefinition"/> from the current line of the source.
|
||||
/// </summary>
|
||||
public static bool TryRead(ILog log, IRandomAccessRead source, out TableSubsectionDefinition definition)
|
||||
public static bool TryRead(ILog log, IInputBytes bytes, out TableSubsectionDefinition definition)
|
||||
{
|
||||
definition = default(TableSubsectionDefinition);
|
||||
|
||||
var line = ReadHelper.ReadLine(source);
|
||||
var line = ReadHelper.ReadLine(bytes);
|
||||
|
||||
var parts = line.Split(Splitters, StringSplitOptions.RemoveEmptyEntries);
|
||||
|
||||
|
@@ -8,30 +8,31 @@
|
||||
private const long ObjectNumberThreshold = 10000000000L;
|
||||
private const long GenerationNumberThreshold = 65535;
|
||||
|
||||
public static long ReadObjectNumber(IRandomAccessRead reader)
|
||||
public static long ReadObjectNumber(IInputBytes bytes)
|
||||
{
|
||||
long retval = ReadHelper.ReadLong(reader);
|
||||
if (retval < 0 || retval >= ObjectNumberThreshold)
|
||||
long result = ReadHelper.ReadLong(bytes);
|
||||
if (result < 0 || result >= ObjectNumberThreshold)
|
||||
{
|
||||
throw new FormatException($"Object Number \'{retval}\' has more than 10 digits or is negative");
|
||||
throw new FormatException($"Object Number \'{result}\' has more than 10 digits or is negative");
|
||||
}
|
||||
|
||||
return retval;
|
||||
return result;
|
||||
}
|
||||
|
||||
public static int ReadGenerationNumber(IRandomAccessRead reader)
|
||||
public static int ReadGenerationNumber(IInputBytes bytes)
|
||||
{
|
||||
int retval = ReadHelper.ReadInt(reader);
|
||||
if (retval < 0 || retval > GenerationNumberThreshold)
|
||||
int result = ReadHelper.ReadInt(bytes);
|
||||
if (result < 0 || result > GenerationNumberThreshold)
|
||||
{
|
||||
throw new FormatException("Generation Number '" + retval + "' has more than 5 digits");
|
||||
throw new FormatException("Generation Number '" + result + "' has more than 5 digits");
|
||||
}
|
||||
return retval;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static string createObjectString(long objectID, long genID)
|
||||
public static string CreateObjectString(long objectId, long genId)
|
||||
{
|
||||
return $"{objectID} {genID} obj";
|
||||
return $"{objectId} {genId} obj";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -2,9 +2,8 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
using Util;
|
||||
|
||||
@@ -13,23 +12,25 @@
|
||||
public const byte AsciiLineFeed = 10;
|
||||
public const byte AsciiCarriageReturn = 13;
|
||||
|
||||
public static string ReadLine(IRandomAccessRead reader)
|
||||
public static string ReadLine(IInputBytes bytes)
|
||||
{
|
||||
if (reader == null)
|
||||
if (bytes == null)
|
||||
{
|
||||
throw new ArgumentNullException(nameof(reader));
|
||||
throw new ArgumentNullException(nameof(bytes));
|
||||
}
|
||||
|
||||
if (reader.IsEof())
|
||||
if (bytes.IsAtEnd())
|
||||
{
|
||||
throw new InvalidOperationException("Error: End-of-File, expected line");
|
||||
}
|
||||
|
||||
var buffer = new StringBuilder(11);
|
||||
|
||||
int c;
|
||||
while ((c = reader.Read()) != -1)
|
||||
byte c = 0;
|
||||
while (bytes.MoveNext())
|
||||
{
|
||||
c = bytes.CurrentByte;
|
||||
|
||||
// CR and LF are valid EOLs
|
||||
if (IsEndOfLine(c))
|
||||
{
|
||||
@@ -40,56 +41,43 @@
|
||||
}
|
||||
|
||||
// CR+LF is also a valid EOL
|
||||
if (IsCarriageReturn(c) && IsLineFeed(reader.Peek()))
|
||||
if (IsCarriageReturn(c) && IsLineFeed(bytes.Peek()))
|
||||
{
|
||||
reader.Read();
|
||||
bytes.MoveNext();
|
||||
}
|
||||
|
||||
return buffer.ToString();
|
||||
}
|
||||
|
||||
public static string ReadString(IRandomAccessRead reader)
|
||||
{
|
||||
SkipSpaces(reader);
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
int c = reader.Read();
|
||||
while (!IsEndOfName((char)c) && c != -1)
|
||||
{
|
||||
buffer.Append((char)c);
|
||||
c = reader.Read();
|
||||
}
|
||||
if (c != -1)
|
||||
{
|
||||
reader.Unread(c);
|
||||
}
|
||||
|
||||
return buffer.ToString();
|
||||
}
|
||||
|
||||
public static void SkipSpaces(IRandomAccessRead reader)
|
||||
|
||||
public static void SkipSpaces(IInputBytes bytes)
|
||||
{
|
||||
const int commentCharacter = 37;
|
||||
int c = reader.Read();
|
||||
bytes.MoveNext();
|
||||
byte c = bytes.CurrentByte;
|
||||
|
||||
while (IsWhitespace(c) || c == 37)
|
||||
{
|
||||
if (c == commentCharacter)
|
||||
{
|
||||
// skip past the comment section
|
||||
c = reader.Read();
|
||||
while (!IsEndOfLine(c) && c != -1)
|
||||
bytes.MoveNext();
|
||||
c = bytes.CurrentByte;
|
||||
while (!IsEndOfLine(c))
|
||||
{
|
||||
c = reader.Read();
|
||||
bytes.MoveNext();
|
||||
c = bytes.CurrentByte;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
c = reader.Read();
|
||||
bytes.MoveNext();
|
||||
c = bytes.CurrentByte;
|
||||
}
|
||||
}
|
||||
if (c != -1)
|
||||
|
||||
if (!bytes.IsAtEnd())
|
||||
{
|
||||
reader.Unread(c);
|
||||
bytes.Seek(bytes.CurrentOffset - 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,15 +102,7 @@
|
||||
{
|
||||
return EndOfNameCharacters.Contains(ch);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Determines if the current character in the reader is a whitespace.
|
||||
/// </summary>
|
||||
public static bool IsWhitespace(IRandomAccessRead reader)
|
||||
{
|
||||
return IsWhitespace(reader.Peek());
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Determines if a character is whitespace or not.
|
||||
/// </summary>
|
||||
@@ -135,50 +115,50 @@
|
||||
|| c == AsciiCarriageReturn || c == ' ';
|
||||
}
|
||||
|
||||
public static bool IsEndOfLine(int c)
|
||||
{
|
||||
return IsLineFeed(c) || IsCarriageReturn(c);
|
||||
}
|
||||
|
||||
public static bool IsEndOfLine(char c) => IsEndOfLine((byte) c);
|
||||
public static bool IsEndOfLine(byte b)
|
||||
{
|
||||
return IsLineFeed(b) || IsCarriageReturn(b);
|
||||
}
|
||||
|
||||
public static bool IsLineFeed(int c)
|
||||
public static bool IsLineFeed(byte? c)
|
||||
{
|
||||
return AsciiLineFeed == c;
|
||||
}
|
||||
|
||||
public static bool IsCarriageReturn(int c)
|
||||
public static bool IsCarriageReturn(byte c)
|
||||
{
|
||||
return AsciiCarriageReturn == c;
|
||||
}
|
||||
|
||||
public static bool IsString(IRandomAccessRead reader, string str) => IsString(reader, str.Select(x => (byte)x));
|
||||
public static bool IsString(IRandomAccessRead reader, IEnumerable<byte> str)
|
||||
public static bool IsString(IInputBytes bytes, string s)
|
||||
{
|
||||
bool bytesMatching = true;
|
||||
long originOffset = reader.GetPosition();
|
||||
foreach (var c in str)
|
||||
bool found = true;
|
||||
|
||||
var startOffset = bytes.CurrentOffset;
|
||||
|
||||
foreach (var c in s)
|
||||
{
|
||||
if (reader.Read() != c)
|
||||
if (bytes.CurrentByte != c)
|
||||
{
|
||||
bytesMatching = false;
|
||||
found = false;
|
||||
break;
|
||||
}
|
||||
|
||||
bytes.MoveNext();
|
||||
}
|
||||
reader.Seek(originOffset);
|
||||
|
||||
return bytesMatching;
|
||||
bytes.Seek(startOffset);
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
public static long ReadLong(IRandomAccessRead reader)
|
||||
|
||||
public static long ReadLong(IInputBytes bytes)
|
||||
{
|
||||
SkipSpaces(reader);
|
||||
SkipSpaces(bytes);
|
||||
long retval;
|
||||
|
||||
StringBuilder longBuffer = ReadStringNumber(reader);
|
||||
StringBuilder longBuffer = ReadStringNumber(bytes);
|
||||
|
||||
try
|
||||
{
|
||||
@@ -187,46 +167,45 @@
|
||||
catch (FormatException e)
|
||||
{
|
||||
var bytesToReverse = OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString());
|
||||
reader.Unread(bytesToReverse);
|
||||
bytes.Seek(bytes.CurrentOffset - bytesToReverse.Length);
|
||||
|
||||
throw new InvalidOperationException($"Error: Expected a long type at offset {reader.GetPosition()}, instead got \'{longBuffer}\'", e);
|
||||
throw new InvalidOperationException($"Error: Expected a long type at offset {bytes.CurrentOffset}, instead got \'{longBuffer}\'", e);
|
||||
}
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
||||
private static StringBuilder ReadStringNumber(IRandomAccessRead reader)
|
||||
private static readonly int MaximumNumberStringLength = long.MaxValue.ToString("D").Length;
|
||||
|
||||
private static StringBuilder ReadStringNumber(IInputBytes reader)
|
||||
{
|
||||
int lastByte = 0;
|
||||
byte lastByte;
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
while ((lastByte = reader.Read()) != ' ' &&
|
||||
|
||||
while (reader.MoveNext() && (lastByte = reader.CurrentByte) != ' ' &&
|
||||
lastByte != AsciiLineFeed &&
|
||||
lastByte != AsciiCarriageReturn &&
|
||||
lastByte != 60 && //see sourceforge bug 1714707
|
||||
lastByte != '[' && // PDFBOX-1845
|
||||
lastByte != '(' && // PDFBOX-2579
|
||||
lastByte != 0 && //See sourceforge bug 853328
|
||||
lastByte != -1)
|
||||
lastByte != 0)
|
||||
{
|
||||
buffer.Append((char)lastByte);
|
||||
if (buffer.Length > long.MaxValue.ToString("D").Length)
|
||||
|
||||
if (buffer.Length > MaximumNumberStringLength)
|
||||
{
|
||||
throw new IOException("Number '" + buffer + "' is getting too long, stop reading at offset " + reader.GetPosition());
|
||||
throw new InvalidOperationException($"Number \'{buffer}\' is getting too long, stop reading at offset {reader.CurrentOffset}");
|
||||
}
|
||||
}
|
||||
if (lastByte != -1)
|
||||
|
||||
if (!reader.IsAtEnd())
|
||||
{
|
||||
reader.Unread(lastByte);
|
||||
reader.Seek(reader.CurrentOffset - 1);
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
public static bool IsDigit(IRandomAccessRead reader)
|
||||
{
|
||||
return IsDigit(reader.Peek());
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// This will tell if the given value is a digit or not.
|
||||
/// </summary>
|
||||
@@ -235,17 +214,17 @@
|
||||
return c >= '0' && c <= '9';
|
||||
}
|
||||
|
||||
public static int ReadInt(IRandomAccessRead reader)
|
||||
public static int ReadInt(IInputBytes bytes)
|
||||
{
|
||||
if (reader == null)
|
||||
if (bytes == null)
|
||||
{
|
||||
throw new ArgumentNullException(nameof(reader));
|
||||
throw new ArgumentNullException(nameof(bytes));
|
||||
}
|
||||
|
||||
SkipSpaces(reader);
|
||||
SkipSpaces(bytes);
|
||||
int result;
|
||||
|
||||
var intBuffer = ReadStringNumber(reader);
|
||||
var intBuffer = ReadStringNumber(bytes);
|
||||
|
||||
try
|
||||
{
|
||||
@@ -253,45 +232,14 @@
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
reader.Unread(OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString()));
|
||||
throw new IOException("Error: Expected an integer type at offset " + reader.GetPosition(), e);
|
||||
bytes.Seek(bytes.CurrentOffset - OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString()).Length);
|
||||
|
||||
throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}", e);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static void ReadExpectedString(IRandomAccessRead reader, string expectedstring)
|
||||
{
|
||||
ReadExpectedString(reader, expectedstring, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads given pattern from {@link #seqSource}. Skipping whitespace at start and end if wanted.
|
||||
*
|
||||
* @param expectedstring pattern to be skipped
|
||||
* @param skipSpaces if set to true spaces before and after the string will be skipped
|
||||
* @throws IOException if pattern could not be read
|
||||
*/
|
||||
public static void ReadExpectedString(IRandomAccessRead reader, string expectedstring, bool skipSpaces)
|
||||
{
|
||||
SkipSpaces(reader);
|
||||
|
||||
foreach (var c in expectedstring)
|
||||
{
|
||||
if (reader.Read() != c)
|
||||
{
|
||||
throw new IOException($"Expected string \'{expectedstring}\' but missed character \'{c}\' at offset {reader.GetPosition()}");
|
||||
}
|
||||
}
|
||||
|
||||
SkipSpaces(reader);
|
||||
}
|
||||
|
||||
public static bool IsSpace(IRandomAccessRead reader)
|
||||
{
|
||||
return IsSpace(reader.Peek());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This will tell if the given value is a space or not.
|
||||
*
|
||||
@@ -302,17 +250,7 @@
|
||||
{
|
||||
return ' ' == c;
|
||||
}
|
||||
|
||||
public static void ReadExpectedChar(IRandomAccessRead reader, char ec)
|
||||
{
|
||||
char c = (char)reader.Read();
|
||||
|
||||
if (c != ec)
|
||||
{
|
||||
throw new InvalidOperationException($"expected=\'{ec}\' actual=\'{c}\' at offset {reader.GetPosition()}");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static bool IsHexDigit(char ch)
|
||||
{
|
||||
return char.IsDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
|
||||
|
@@ -29,14 +29,12 @@
|
||||
var container = Bootstrapper.GenerateContainer(options?.Logger);
|
||||
|
||||
var isLenientParsing = options?.UseLenientParsing ?? true;
|
||||
|
||||
var reader = new RandomAccessBuffer(fileBytes);
|
||||
|
||||
|
||||
var inputBytes = new ByteArrayInputBytes(fileBytes);
|
||||
|
||||
var tokenScanner = new CoreTokenScanner(inputBytes);
|
||||
|
||||
var document = OpenDocument(reader, inputBytes, tokenScanner, container, isLenientParsing);
|
||||
var document = OpenDocument(inputBytes, tokenScanner, container, isLenientParsing);
|
||||
|
||||
return document;
|
||||
}
|
||||
@@ -51,23 +49,27 @@
|
||||
return Open(File.ReadAllBytes(filename), options);
|
||||
}
|
||||
|
||||
private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
|
||||
private static PdfDocument OpenDocument(IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
|
||||
{
|
||||
var log = container.Get<ILog>();
|
||||
var filterProvider = container.Get<IFilterProvider>();
|
||||
var bruteForceSearcher = new BruteForceSearcher(reader);
|
||||
var catalogFactory = new CatalogFactory();
|
||||
var cMapCache = new CMapCache(new CMapParser());
|
||||
|
||||
CrossReferenceTable crossReferenceTable = null;
|
||||
|
||||
var bruteForceSearcher = new BruteForceSearcher(inputBytes);
|
||||
var xrefValidator = new XrefOffsetValidator(log);
|
||||
var objectChecker = new XrefCosOffsetChecker(log, bruteForceSearcher);
|
||||
|
||||
// We're ok with this since our intent is to lazily load the cross reference table.
|
||||
// ReSharper disable once AccessToModifiedClosure
|
||||
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, bruteForceSearcher);
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider);
|
||||
|
||||
var xrefValidator = new XrefOffsetValidator(log);
|
||||
|
||||
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
||||
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser, new CrossReferenceTableParser());
|
||||
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, objectChecker, crossReferenceStreamParser, new CrossReferenceTableParser());
|
||||
|
||||
var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
|
||||
|
||||
@@ -76,22 +78,20 @@
|
||||
// TODO: make this use the scanner.
|
||||
var validator = new CrossReferenceOffsetValidator(xrefValidator);
|
||||
|
||||
crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, reader, isLenientParsing);
|
||||
crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, inputBytes, isLenientParsing);
|
||||
|
||||
crossReferenceTable = crossReferenceParser.Parse(reader, isLenientParsing, crossReferenceOffset, pdfScanner, scanner);
|
||||
crossReferenceTable = crossReferenceParser.Parse(inputBytes, isLenientParsing, crossReferenceOffset, pdfScanner, scanner);
|
||||
|
||||
var trueTypeFontParser = new TrueTypeFontParser();
|
||||
var fontDescriptorFactory = new FontDescriptorFactory();
|
||||
|
||||
var cidFontFactory = new CidFontFactory(pdfScanner, fontDescriptorFactory, trueTypeFontParser, filterProvider);
|
||||
var encodingReader = new EncodingReader(pdfScanner);
|
||||
|
||||
var cMapCache = new CMapCache(new CMapParser());
|
||||
|
||||
|
||||
var fontFactory = new FontFactory(log, new Type0FontHandler(cidFontFactory,
|
||||
cMapCache,
|
||||
filterProvider, pdfScanner),
|
||||
new TrueTypeFontHandler(pdfScanner, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
|
||||
new TrueTypeFontHandler(log, pdfScanner, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
|
||||
new Type1FontHandler(pdfScanner, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, new Type1FontParser()),
|
||||
new Type3FontHandler(pdfScanner, cMapCache, filterProvider, encodingReader));
|
||||
|
||||
@@ -99,17 +99,17 @@
|
||||
|
||||
var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
|
||||
var informationFactory = new DocumentInformationFactory();
|
||||
var catalogFactory = new CatalogFactory(pdfScanner);
|
||||
|
||||
|
||||
var rootDictionary = ParseTrailer(crossReferenceTable, isLenientParsing, pdfScanner);
|
||||
|
||||
var information = informationFactory.Create(pdfScanner, crossReferenceTable.Dictionary);
|
||||
|
||||
var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);
|
||||
var catalog = catalogFactory.Create(pdfScanner, rootDictionary);
|
||||
|
||||
var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer);
|
||||
|
||||
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
|
||||
return new PdfDocument(log, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
|
||||
pdfScanner);
|
||||
}
|
||||
|
||||
|
@@ -15,8 +15,6 @@
|
||||
/// </summary>
|
||||
public class PdfDocument : IDisposable
|
||||
{
|
||||
[NotNull]
|
||||
private readonly IRandomAccessRead reader;
|
||||
[NotNull]
|
||||
private readonly HeaderVersion version;
|
||||
[NotNull]
|
||||
@@ -51,7 +49,7 @@
|
||||
/// </summary>
|
||||
public int NumberOfPages => Pages.Count;
|
||||
|
||||
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
|
||||
internal PdfDocument(ILog log, HeaderVersion version, CrossReferenceTable crossReferenceTable,
|
||||
bool isLenientParsing,
|
||||
ParsingCachingProviders cachingProviders,
|
||||
IPageFactory pageFactory,
|
||||
@@ -59,7 +57,6 @@
|
||||
DocumentInformation information, IPdfTokenScanner pdfScanner)
|
||||
{
|
||||
this.log = log;
|
||||
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||
this.version = version ?? throw new ArgumentNullException(nameof(version));
|
||||
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
|
||||
this.isLenientParsing = isLenientParsing;
|
||||
@@ -67,7 +64,7 @@
|
||||
this.pdfScanner = pdfScanner;
|
||||
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
|
||||
Pages = new Pages(log, Catalog, pageFactory, reader, isLenientParsing, pdfScanner);
|
||||
Pages = new Pages(log, Catalog, pageFactory, isLenientParsing, pdfScanner);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -103,7 +100,6 @@
|
||||
{
|
||||
try
|
||||
{
|
||||
reader.Dispose();
|
||||
}
|
||||
catch
|
||||
{
|
||||
|
@@ -18,6 +18,7 @@
|
||||
public static readonly OperatorToken Dup = new OperatorToken("dup");
|
||||
public static readonly OperatorToken For = new OperatorToken("for");
|
||||
public static readonly OperatorToken Put = new OperatorToken("put");
|
||||
public static readonly OperatorToken Xref = new OperatorToken("xref");
|
||||
|
||||
public string Data { get; }
|
||||
|
||||
@@ -60,6 +61,8 @@
|
||||
return For;
|
||||
case "put":
|
||||
return Put;
|
||||
case "xref":
|
||||
return Xref;
|
||||
default:
|
||||
return new OperatorToken(data);
|
||||
}
|
||||
|
Reference in New Issue
Block a user