Substitute the token scanner into the file trailer parsing and add tests

This commit is contained in:
Eliot Jones
2018-01-03 22:29:09 +00:00
parent f09ef85e5a
commit 21be34a938
6 changed files with 287 additions and 126 deletions

View File

@@ -0,0 +1,161 @@
namespace UglyToad.Pdf.Tests.Parser.Parts
{
using System;
using Exceptions;
using Pdf.Parser.Parts;
using Pdf.Tokenization.Scanner;
using Xunit;
/// <summary>
/// Tests for <see cref="FileTrailerParser"/>: locating the "startxref" marker near the end
/// of the input and reading the numeric cross-reference offset which follows it.
/// </summary>
public class FileTrailerParserTests
{
    private readonly FileTrailerParser trailerParser = new FileTrailerParser();

    /// <summary>
    /// Converts the text to input bytes, builds a token scanner over those same bytes and
    /// returns the offset reported by the parser (lenient parsing disabled).
    /// </summary>
    private long ReadOffset(string pdfText)
    {
        var converted = StringBytesTestConverter.Convert(pdfText, false);
        var tokenScanner = new CoreTokenScanner(converted.Bytes);

        return trailerParser.GetFirstCrossReferenceOffset(converted.Bytes, tokenScanner, false);
    }

    /// <summary>
    /// A well-formed trailer yields the offset after "startxref", even when partial
    /// look-alikes of the markers appear earlier in the data.
    /// </summary>
    [Fact]
    public void FindsCompliantStartXref()
    {
        var offset = ReadOffset(@"sta455%r endstream
endobj
12 0 obj
1234 %eof
endobj
startxref
456
%%EOF");

        Assert.Equal(456, offset);
    }

    /// <summary>
    /// A "startxref" which appears after the end-of-file marker is not used.
    /// </summary>
    [Fact]
    public void IgnoresStartXrefFollowingEndOfFile()
    {
        var offset = ReadOffset(@"11 0 obj
<< /Type/Something /W[12 0 5 6] >>
endobj
12 0 obj
1234 %eof
endobj
startxref
1384733
%%EOF
% I decided to put some nonsense here:
% because I could hahaha
startxref
17");

        Assert.Equal(1384733, offset);
    }

    /// <summary>
    /// Misspelled markers do not count as "startxref", so the parser reports a format error.
    /// </summary>
    [Fact]
    public void MissingStartXrefThrows()
    {
        var converted = StringBytesTestConverter.Convert(@"11 0 obj
<< /Type/Something /W[12 0 5 6] >>
endobj
12 0 obj
1234 %eof
endobj
startref
1384733
%%EOF
% I decided to put some nonsense here:
% because I could hahaha
start_rexf
17", false);

        Action parse = () => trailerParser.GetFirstCrossReferenceOffset(converted.Bytes, new CoreTokenScanner(converted.Bytes), false);

        Assert.Throws<PdfDocumentFormatException>(parse);
    }

    /// <summary>
    /// Passing null for the input bytes is rejected.
    /// </summary>
    [Fact]
    public void NullInputBytesThrows()
    {
        var converted = StringBytesTestConverter.Convert("11 0 obj", false);

        Action parse = () => trailerParser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(converted.Bytes), false);

        Assert.Throws<ArgumentNullException>(parse);
    }

    /// <summary>
    /// Passing null for the token scanner is rejected.
    /// </summary>
    [Fact]
    public void NullScannerThrows()
    {
        var converted = StringBytesTestConverter.Convert("11 0 obj", false);

        Action parse = () => trailerParser.GetFirstCrossReferenceOffset(converted.Bytes, null, false);

        Assert.Throws<ArgumentNullException>(parse);
    }

    /// <summary>
    /// Anything other than a comment or a number directly after "startxref" is a format error.
    /// </summary>
    [Fact]
    public void InvalidTokensAfterStartXrefThrows()
    {
        var converted = StringBytesTestConverter.Convert(@"11 0 obj
<< /Type/Font >>
endobj
startxref
<< /Why (am i here?) >> 69
%EOF", false);

        Action parse = () => trailerParser.GetFirstCrossReferenceOffset(converted.Bytes, new CoreTokenScanner(converted.Bytes), false);

        Assert.Throws<PdfDocumentFormatException>(parse);
    }

    /// <summary>
    /// When several "startxref" markers precede the end-of-file marker, the last one is used.
    /// </summary>
    [Fact]
    public void TakesLastStartXrefPrecedingEndOfFile()
    {
        var offset = ReadOffset(@"11 0 obj
<< /Type/Something /W[12 0 5 6] >>
endobj
12 0 obj
1234 %eof
endobj
startxref
1384733
%actually I changed my mind
startxref
1274665676543
%%EOF");

        Assert.Equal(1274665676543, offset);
    }

    /// <summary>
    /// A comment between "startxref" and its number is skipped.
    /// </summary>
    [Fact]
    public void CanReadStartXrefIfCommentsPresent()
    {
        var offset = ReadOffset(@"
startxref %Commented here
57695
%%EOF");

        Assert.Equal(57695, offset);
    }
}
}

View File

@@ -29,6 +29,8 @@
public byte CurrentByte { get; private set; } public byte CurrentByte { get; private set; }
public long Length => bytes.Count;
public byte? Peek() public byte? Peek()
{ {
if (currentOffset == bytes.Count - 1) if (currentOffset == bytes.Count - 1)

View File

@@ -8,6 +8,8 @@
byte CurrentByte { get; } byte CurrentByte { get; }
long Length { get; }
byte? Peek(); byte? Peek();
bool IsAtEnd(); bool IsAtEnd();

View File

@@ -1,8 +1,11 @@
namespace UglyToad.Pdf.Parser.Parts namespace UglyToad.Pdf.Parser.Parts
{ {
using System; using System;
using System.Linq; using System.Collections.Generic;
using Exceptions;
using IO; using IO;
using Tokenization.Scanner;
using Tokenization.Tokens;
/* /*
* The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects. * The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects.
@@ -19,156 +22,136 @@
internal class FileTrailerParser internal class FileTrailerParser
{ {
private const int DefaultTrailerByteLength = 2048; /// <summary>
/// Acrobat viewers require the EOF to be in the last 1024 bytes instead of at the end.
/// </summary>
private const int EndOfFileSearchRange = 1024;
private readonly byte[] endOfFileBytes; private static readonly byte[] EndOfFileBytes =
private readonly byte[] startXRefBytes;
public FileTrailerParser()
{ {
endOfFileBytes = "%%EOF".Select(x => (byte)x).ToArray(); (byte)'%',
startXRefBytes = "startxref".Select(x => (byte)x).ToArray(); (byte)'%',
} (byte)'E',
(byte)'O',
(byte)'F'
};
public long GetXrefOffset(IRandomAccessRead reader, bool isLenientParsing) private static readonly byte[] StartXRefBytes =
{ {
var startXrefOffset = GetByteOffsetForStartXref(reader, (int)reader.Length(), isLenientParsing); (byte) 's',
(byte) 't',
reader.Seek(startXrefOffset); (byte) 'a',
(byte) 'r',
long actualXrefOffset = Math.Max(0, ParseXrefStartPosition(reader)); (byte) 't',
(byte) 'x',
return actualXrefOffset; (byte) 'r',
} (byte) 'e',
(byte) 'f'
private long ParseXrefStartPosition(IRandomAccessRead reader) };
public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
{ {
long startXref = -1; if (bytes == null)
if (ReadHelper.IsString(reader, startXRefBytes))
{ {
ReadHelper.ReadString(reader); throw new ArgumentNullException(nameof(bytes));
ReadHelper.SkipSpaces(reader);
// This integer is the byte offset of the first object referenced by the xref or xref stream
startXref = ReadHelper.ReadLong(reader);
} }
return startXref;
}
private long GetByteOffsetForStartXref(IRandomAccessRead reader, int fileLength, bool isLenientParsing) if (scanner == null)
{
byte[] buf;
long skipBytes;
// read trailing bytes into buffer
try
{ {
var trailByteCount = fileLength < DefaultTrailerByteLength ? fileLength : DefaultTrailerByteLength; throw new ArgumentNullException(nameof(scanner));
buf = new byte[trailByteCount]; }
skipBytes = fileLength - trailByteCount; var fileLength = bytes.Length;
reader.Seek(skipBytes); var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;
int off = 0;
while (off < trailByteCount) var startPosition = fileLength - offsetFromEnd;
bytes.Seek(startPosition);
var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);
scanner.Seek(startXrefPosition);
if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref")
{
throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
}
NumericToken numeric = null;
while (scanner.MoveNext())
{
if (scanner.CurrentToken is NumericToken token)
{ {
var readBytes = reader.Read(buf, off, trailByteCount - off); numeric = token;
break;
}
// in order to not get stuck in a loop we check readBytes (this should never happen) if (!(scanner.CurrentToken is CommentToken))
if (readBytes < 1) {
{ throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
throw new InvalidOperationException(
"No more bytes to read for trailing buffer, but expected: "
+ (trailByteCount - off));
}
off += readBytes;
} }
} }
finally
if (numeric == null)
{ {
reader.ReturnToBeginning(); throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
} }
// find last '%%EOF' return numeric.Long;
int bufOff = LastIndexOf(endOfFileBytes, buf, buf.Length); }
if (bufOff < 0)
private static long GetStartXrefPosition(IInputBytes bytes, int offsetFromEnd)
{
var startXrefs = new List<int>();
var index = 0;
var eofIndex = 0;
var offset = 0;
// Starting scanning the last 1024 bytes.
while (bytes.MoveNext())
{ {
if (isLenientParsing) offset++;
if (bytes.CurrentByte == StartXRefBytes[index])
{ {
// in lenient mode the '%%EOF' isn't needed // We might be reading "startxref".
bufOff = buf.Length; eofIndex = 0;
//LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'"); index++;
}
else if (bytes.CurrentByte == EndOfFileBytes[eofIndex])
{
// We might be reading "%%EOF".
eofIndex++;
index = 0;
} }
else else
{ {
throw new InvalidOperationException("Missing end of file marker '%%EOF'"); eofIndex = 0;
index = 0;
} }
}
// find last startxref preceding EOF marker
bufOff = LastIndexOf(startXRefBytes, buf, bufOff);
long startXRefOffset = skipBytes + bufOff;
if (bufOff < 0) if (index == StartXRefBytes.Length)
{
throw new NotImplementedException();
//if (isLenientParsing)
//{
// //LOG.debug("Performing brute force search for last startxref entry");
// long bfOffset = bfSearchForLastStartxrefEntry();
// bool offsetIsValid = false;
// if (bfOffset > -1)
// {
// reader.Seek(bfOffset);
// long bfXref = ParseXrefStartPosition();
// if (bfXref > -1)
// {
// offsetIsValid = checkXRefOffset(bfXref) == bfXref;
// }
// }
// reader.ReturnToBeginning();
// // use the new offset only if it is a valid pointer to a xref table
// return offsetIsValid ? bfOffset : -1;
//}
throw new InvalidOperationException("Missing 'startxref' marker.");
}
return startXRefOffset;
}
private int LastIndexOf(byte[] pattern, byte[] bytes, int endOff)
{
int lastPatternByte = pattern.Length - 1;
int bufferOffset = endOff;
int patternByte = lastPatternByte;
byte targetByte = pattern[patternByte];
while (--bufferOffset >= 0)
{
if (bytes[bufferOffset] == targetByte)
{ {
if (--patternByte < 0) // Add this "startxref" (position from the end of the document to the first 's').
{ startXrefs.Add(offsetFromEnd - (offset - StartXRefBytes.Length));
// whole pattern matched
return bufferOffset; // Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
} index = 0;
// matched current byte, advance to preceding one
targetByte = pattern[patternByte];
} }
else if (patternByte < lastPatternByte) else if (eofIndex == EndOfFileBytes.Length)
{ {
// no byte match but already matched some chars; reset // Stop at the EOF if present.
patternByte = lastPatternByte; break;
targetByte = pattern[patternByte];
} }
} }
return -1; if (startXrefs.Count == 0)
{
throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters.");
}
return bytes.Length - startXrefs[startXrefs.Count - 1];
} }
} }
} }

View File

@@ -29,9 +29,11 @@
var reader = new RandomAccessBuffer(fileBytes); var reader = new RandomAccessBuffer(fileBytes);
var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(fileBytes)); var inputBytes = new ByteArrayInputBytes(fileBytes);
var document = OpenDocument(reader,tokenScanner, container, isLenientParsing); var tokenScanner = new CoreTokenScanner(inputBytes);
var document = OpenDocument(reader, inputBytes, tokenScanner, container, isLenientParsing);
return document; return document;
} }
@@ -46,13 +48,13 @@
return Open(File.ReadAllBytes(filename), options); return Open(File.ReadAllBytes(filename), options);
} }
private static PdfDocument OpenDocument(IRandomAccessRead reader, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing) private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
{ {
var log = container.Get<ILog>(); var log = container.Get<ILog>();
var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing); var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing); var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
var pool = new CosObjectPool(); var pool = new CosObjectPool();

View File

@@ -1,5 +1,6 @@
namespace UglyToad.Pdf.Tokenization.Tokens namespace UglyToad.Pdf.Tokenization.Tokens
{ {
using System;
using System.Globalization; using System.Globalization;
public class NumericToken : IDataToken<decimal> public class NumericToken : IDataToken<decimal>
@@ -10,14 +11,24 @@
public int Int { get; } public int Int { get; }
public bool IsBiggerThanInt { get; }
public long Long { get; } public long Long { get; }
public NumericToken(decimal value) public NumericToken(decimal value)
{ {
Data = value; Data = value;
IsWhole = decimal.Floor(value) == value; IsWhole = decimal.Floor(value) == value;
Int = (int) value;
Long = (long) value; Long = (long) value;
try
{
Int = (int) value;
}
catch (OverflowException)
{
IsBiggerThanInt = true;
}
} }
public override string ToString() public override string ToString()