substitute the token scanner into the file trailer parsing and test

This commit is contained in:
Eliot Jones
2018-01-03 22:29:09 +00:00
parent f09ef85e5a
commit 21be34a938
6 changed files with 287 additions and 126 deletions

View File

@@ -0,0 +1,161 @@
namespace UglyToad.Pdf.Tests.Parser.Parts
{
using System;
using Exceptions;
using Pdf.Parser.Parts;
using Pdf.Tokenization.Scanner;
using Xunit;
/// <summary>
/// Exercises <see cref="FileTrailerParser.GetFirstCrossReferenceOffset"/> against small hand-written PDF fragments.
/// </summary>
public class FileTrailerParserTests
{
    private readonly FileTrailerParser trailerParser = new FileTrailerParser();

    /// <summary>
    /// A single 'startxref' followed by its offset and '%%EOF' yields that offset.
    /// </summary>
    [Fact]
    public void FindsCompliantStartXref()
    {
        const string pdf = @"sta455%r endstream
endobj
12 0 obj
1234 %eof
endobj
startxref
456
%%EOF";

        Assert.Equal(456, ReadOffset(pdf));
    }

    /// <summary>
    /// A 'startxref' which appears after the '%%EOF' marker is not used.
    /// </summary>
    [Fact]
    public void IgnoresStartXrefFollowingEndOfFile()
    {
        const string pdf = @"11 0 obj
<< /Type/Something /W[12 0 5 6] >>
endobj
12 0 obj
1234 %eof
endobj
startxref
1384733
%%EOF
% I decided to put some nonsense here:
% because I could hahaha
startxref
17";

        Assert.Equal(1384733, ReadOffset(pdf));
    }

    /// <summary>
    /// Misspelled markers ('startref', 'start_rexf') do not count as 'startxref'.
    /// </summary>
    [Fact]
    public void MissingStartXrefThrows()
    {
        const string pdf = @"11 0 obj
<< /Type/Something /W[12 0 5 6] >>
endobj
12 0 obj
1234 %eof
endobj
startref
1384733
%%EOF
% I decided to put some nonsense here:
% because I could hahaha
start_rexf
17";

        // Statement lambda so that only the Action overload of Assert.Throws applies.
        Assert.Throws<PdfDocumentFormatException>(() => { ReadOffset(pdf); });
    }

    /// <summary>
    /// Passing null for the input bytes is rejected.
    /// </summary>
    [Fact]
    public void NullInputBytesThrows()
    {
        var converted = StringBytesTestConverter.Convert("11 0 obj", false);

        Assert.Throws<ArgumentNullException>(() =>
        {
            trailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(converted.Bytes), false);
        });
    }

    /// <summary>
    /// Passing null for the token scanner is rejected.
    /// </summary>
    [Fact]
    public void NullScannerThrows()
    {
        var converted = StringBytesTestConverter.Convert("11 0 obj", false);

        Assert.Throws<ArgumentNullException>(() =>
        {
            trailerParser.GetFirstCrossReferenceOffset(converted.Bytes, null, false);
        });
    }

    /// <summary>
    /// Anything other than a comment or a number directly after 'startxref' is a format error.
    /// </summary>
    [Fact]
    public void InvalidTokensAfterStartXrefThrows()
    {
        const string pdf = @"11 0 obj
<< /Type/Font >>
endobj
startxref
<< /Why (am i here?) >> 69
%EOF";

        Assert.Throws<PdfDocumentFormatException>(() => { ReadOffset(pdf); });
    }

    /// <summary>
    /// When several 'startxref' entries precede '%%EOF' the last one is used.
    /// </summary>
    [Fact]
    public void TakesLastStartXrefPrecedingEndOfFile()
    {
        const string pdf = @"11 0 obj
<< /Type/Something /W[12 0 5 6] >>
endobj
12 0 obj
1234 %eof
endobj
startxref
1384733
%actually I changed my mind
startxref
1274665676543
%%EOF";

        Assert.Equal(1274665676543, ReadOffset(pdf));
    }

    /// <summary>
    /// A comment between 'startxref' and its number is skipped.
    /// </summary>
    [Fact]
    public void CanReadStartXrefIfCommentsPresent()
    {
        const string pdf = @"
startxref %Commented here
57695
%%EOF";

        Assert.Equal(57695, ReadOffset(pdf));
    }

    /// <summary>
    /// Converts the text to input bytes and runs the parser over it with a fresh scanner and lenient parsing off.
    /// </summary>
    private long ReadOffset(string pdf)
    {
        var converted = StringBytesTestConverter.Convert(pdf, false);

        return trailerParser.GetFirstCrossReferenceOffset(converted.Bytes, new CoreTokenScanner(converted.Bytes), false);
    }
}
}

View File

@@ -29,6 +29,8 @@
/// <summary>
/// The current byte. Only this class can assign it.
/// NOTE(review): presumably updated each time the input is advanced - confirm against the rest of the class.
/// </summary>
public byte CurrentByte { get; private set; }
/// <summary>
/// The total number of bytes, taken from the count of the underlying byte collection.
/// </summary>
public long Length => bytes.Count;
public byte? Peek()
{
if (currentOffset == bytes.Count - 1)

View File

@@ -8,6 +8,8 @@
/// <summary>
/// The current byte.
/// NOTE(review): presumably the byte most recently moved to - confirm against implementations.
/// </summary>
byte CurrentByte { get; }
/// <summary>
/// The total number of bytes in the input.
/// </summary>
long Length { get; }
/// <summary>
/// Returns a byte or null.
/// NOTE(review): presumably the next byte without advancing, and null when there is none - confirm against implementations.
/// </summary>
byte? Peek();
/// <summary>
/// Reports whether the input is at its end.
/// NOTE(review): presumably true once no further bytes can be read - confirm against implementations.
/// </summary>
bool IsAtEnd();

View File

@@ -1,8 +1,11 @@
namespace UglyToad.Pdf.Parser.Parts
{
using System;
using System.Linq;
using System.Collections.Generic;
using Exceptions;
using IO;
using Tokenization.Scanner;
using Tokenization.Tokens;
/*
* The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects.
@@ -19,156 +22,136 @@
internal class FileTrailerParser
{
    /// <summary>
    /// Acrobat viewers require the EOF to be in the last 1024 bytes instead of at the end.
    /// </summary>
    private const int EndOfFileSearchRange = 1024;

    /// <summary>
    /// The bytes of the end of file marker "%%EOF".
    /// </summary>
    private static readonly byte[] EndOfFileBytes =
    {
        (byte)'%',
        (byte)'%',
        (byte)'E',
        (byte)'O',
        (byte)'F'
    };

    /// <summary>
    /// The bytes of the "startxref" keyword.
    /// </summary>
    private static readonly byte[] StartXRefBytes =
    {
        (byte)'s',
        (byte)'t',
        (byte)'a',
        (byte)'r',
        (byte)'t',
        (byte)'x',
        (byte)'r',
        (byte)'e',
        (byte)'f'
    };

    /// <summary>
    /// Finds the last "startxref" in the final <see cref="EndOfFileSearchRange"/> bytes of the input
    /// (stopping at "%%EOF" if present) and returns the number which follows it.
    /// </summary>
    /// <param name="bytes">The bytes of the document, used to search for the "startxref" keyword.</param>
    /// <param name="scanner">The scanner used to read the tokens starting from the "startxref" keyword.</param>
    /// <param name="isLenientParsing">Currently not used by this method.</param>
    /// <returns>The numeric value following "startxref", as a long.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="scanner"/> is null.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// No "startxref" was found in the search range, no number follows it,
    /// or a token other than a comment appears between it and the number.
    /// </exception>
    /// <exception cref="InvalidOperationException">The scanner did not read "startxref" at the position found in the bytes.</exception>
    public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }

        if (scanner == null)
        {
            throw new ArgumentNullException(nameof(scanner));
        }

        var fileLength = bytes.Length;

        // Search the whole input when it is shorter than the search range.
        var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;

        var startPosition = fileLength - offsetFromEnd;

        bytes.Seek(startPosition);

        var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);

        scanner.Seek(startXrefPosition);

        if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref")
        {
            throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
        }

        NumericToken numeric = null;

        // Only comments may sit between 'startxref' and the number.
        while (scanner.MoveNext())
        {
            if (scanner.CurrentToken is NumericToken token)
            {
                numeric = token;
                break;
            }

            if (!(scanner.CurrentToken is CommentToken))
            {
                throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
            }
        }

        if (numeric == null)
        {
            throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
        }

        return numeric.Long;
    }

    /// <summary>
    /// Reads forward from the current position of <paramref name="bytes"/> and returns the position of the
    /// last "startxref" seen before "%%EOF" (or before the bytes run out).
    /// </summary>
    /// <param name="bytes">The input, already positioned <paramref name="offsetFromEnd"/> bytes before its end.</param>
    /// <param name="offsetFromEnd">How far from the end of the input the search started.</param>
    /// <returns>The length of the input minus the distance from the end to the first 's' of the keyword.</returns>
    /// <exception cref="PdfDocumentFormatException">No "startxref" was found.</exception>
    private static long GetStartXrefPosition(IInputBytes bytes, int offsetFromEnd)
    {
        // Distance from the end of the document to the first 's' of the most recent "startxref".
        int? lastStartXrefFromEnd = null;

        var startXrefMatched = 0;
        var endOfFileMatched = 0;
        var bytesRead = 0;

        // Start scanning the last 1024 bytes.
        while (bytes.MoveNext())
        {
            bytesRead++;

            var current = bytes.CurrentByte;

            // The two markers are tracked independently so a partial match of one cannot disturb the other.
            startXrefMatched = AdvanceMatch(StartXRefBytes, startXrefMatched, current);
            endOfFileMatched = AdvanceMatch(EndOfFileBytes, endOfFileMatched, current);

            if (startXrefMatched == StartXRefBytes.Length)
            {
                lastStartXrefFromEnd = offsetFromEnd - (bytesRead - StartXRefBytes.Length);

                // Continue scanning in case there are further "startxref"s, the last one wins.
                startXrefMatched = 0;
            }
            else if (endOfFileMatched == EndOfFileBytes.Length)
            {
                // Stop at the EOF if present.
                break;
            }
        }

        if (!lastStartXrefFromEnd.HasValue)
        {
            throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters.");
        }

        return bytes.Length - lastStartXrefFromEnd.Value;
    }

    /// <summary>
    /// Advances an incremental search for <paramref name="pattern"/> by one byte.
    /// </summary>
    /// <remarks>
    /// On a mismatch the byte is not simply discarded: the longest prefix of the pattern which is still
    /// matched by the bytes seen so far (including <paramref name="current"/>) is kept. Without this,
    /// inputs such as "sstartxref" or "%%%EOF" would be missed because the byte which breaks one
    /// partial match is also the start (or continuation) of the real one.
    /// </remarks>
    /// <param name="pattern">The marker being searched for.</param>
    /// <param name="matched">How many leading bytes of the pattern are currently matched, always less than the pattern length.</param>
    /// <param name="current">The byte just read.</param>
    /// <returns>The number of leading pattern bytes matched after consuming <paramref name="current"/>.</returns>
    private static int AdvanceMatch(byte[] pattern, int matched, byte current)
    {
        if (current == pattern[matched])
        {
            return matched + 1;
        }

        // The previous 'matched' bytes equal pattern[0..matched), so candidates can be checked against the pattern itself.
        for (var length = matched; length > 0; length--)
        {
            if (pattern[length - 1] != current)
            {
                continue;
            }

            var prefixMatches = true;
            var tailStart = matched - (length - 1);

            for (var i = 0; i < length - 1; i++)
            {
                if (pattern[i] != pattern[tailStart + i])
                {
                    prefixMatches = false;
                    break;
                }
            }

            if (prefixMatches)
            {
                return length;
            }
        }

        return 0;
    }
}
}

View File

@@ -29,9 +29,11 @@
var reader = new RandomAccessBuffer(fileBytes);
var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(fileBytes));
var inputBytes = new ByteArrayInputBytes(fileBytes);
var document = OpenDocument(reader,tokenScanner, container, isLenientParsing);
var tokenScanner = new CoreTokenScanner(inputBytes);
var document = OpenDocument(reader, inputBytes, tokenScanner, container, isLenientParsing);
return document;
}
@@ -46,13 +48,13 @@
return Open(File.ReadAllBytes(filename), options);
}
private static PdfDocument OpenDocument(IRandomAccessRead reader, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
{
var log = container.Get<ILog>();
var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing);
var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
var pool = new CosObjectPool();

View File

@@ -1,5 +1,6 @@
namespace UglyToad.Pdf.Tokenization.Tokens
{
using System;
using System.Globalization;
public class NumericToken : IDataToken<decimal>
@@ -10,14 +11,24 @@
/// <summary>
/// The value converted to an int. Left at its default when the value does not fit in an int.
/// </summary>
public int Int { get; }
/// <summary>
/// Set to true by the constructor when converting the value to an int overflows.
/// </summary>
public bool IsBiggerThanInt { get; }
/// <summary>
/// The value converted to a long.
/// </summary>
public long Long { get; }
/// <summary>
/// Creates a numeric token for the given value and pre-computes its integer representations.
/// </summary>
/// <param name="value">The numeric value of the token.</param>
/// <exception cref="OverflowException">The value is outside the range of a long.</exception>
public NumericToken(decimal value)
{
    Data = value;
    IsWhole = decimal.Floor(value) == value;

    // Converting a decimal to an integral type discards the fraction before the range is checked,
    // so the range test below uses the truncated value to give the same result as the cast.
    var truncated = decimal.Truncate(value);

    // Still throws for values beyond the range of a long.
    Long = (long) truncated;

    // Test the range up front rather than casting and catching the overflow, an unguarded cast
    // would throw for every value which only fits in a long.
    if (truncated >= int.MinValue && truncated <= int.MaxValue)
    {
        Int = (int) truncated;
    }
    else
    {
        IsBiggerThanInt = true;
    }
}
public override string ToString()