From 21be34a9382c1d5192c3e96b766e227fa2da386b Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Wed, 3 Jan 2018 22:29:09 +0000 Subject: [PATCH] substitute the token scanner into the file trailer parsing and test --- .../Parser/Parts/FileTrailerParserTests.cs | 161 +++++++++++++ src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs | 2 + src/UglyToad.Pdf/IO/IInputBytes.cs | 2 + .../Parser/Parts/FileTrailerParser.cs | 223 ++++++++---------- src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs | 12 +- .../Tokenization/Tokens/NumericToken.cs | 13 +- 6 files changed, 287 insertions(+), 126 deletions(-) create mode 100644 src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs diff --git a/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs b/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs new file mode 100644 index 00000000..ce59fade --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs @@ -0,0 +1,161 @@ +namespace UglyToad.Pdf.Tests.Parser.Parts +{ + using System; + using Exceptions; + using Pdf.Parser.Parts; + using Pdf.Tokenization.Scanner; + using Xunit; + + public class FileTrailerParserTests + { + private readonly FileTrailerParser parser = new FileTrailerParser(); + + [Fact] + public void FindsCompliantStartXref() + { + var input = StringBytesTestConverter.Convert(@"sta455%r endstream +endobj + +12 0 obj +1234 %eof +endobj + +startxref + 456 + +%%EOF", false); + + var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + + Assert.Equal(456, result); + } + + [Fact] + public void IgnoresStartXrefFollowingEndOfFile() + { + var input = StringBytesTestConverter.Convert(@"11 0 obj +<< /Type/Something /W[12 0 5 6] >> +endobj + +12 0 obj +1234 %eof +endobj + +startxref + 1384733 + +%%EOF + +% I decided to put some nonsense here: +% because I could hahaha +startxref +17", false); + + var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), 
false); + + Assert.Equal(1384733, result); + } + + [Fact] + public void MissingStartXrefThrows() + { + var input = StringBytesTestConverter.Convert(@"11 0 obj +<< /Type/Something /W[12 0 5 6] >> +endobj + +12 0 obj +1234 %eof +endobj + +startref + 1384733 + +%%EOF + +% I decided to put some nonsense here: +% because I could hahaha +start_rexf +17", false); + + Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + + Assert.Throws<PdfDocumentFormatException>(action); + } + + [Fact] + public void NullInputBytesThrows() + { + var input = StringBytesTestConverter.Convert("11 0 obj", false); + + Action action = () => parser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes), false); + + Assert.Throws<ArgumentNullException>(action); + } + + [Fact] + public void NullScannerThrows() + { + var input = StringBytesTestConverter.Convert("11 0 obj", false); + + Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, null, false); + + Assert.Throws<ArgumentNullException>(action); + } + + [Fact] + public void InvalidTokensAfterStartXrefThrows() + { + var input = StringBytesTestConverter.Convert(@"11 0 obj + << /Type/Font >> +endobj + +startxref +<< /Why (am i here?)
>> 69 +%EOF", false); + + Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + + Assert.Throws(action); + } + + [Fact] + public void TakesLastStartXrefPrecedingEndOfFile() + { + var input = StringBytesTestConverter.Convert(@"11 0 obj +<< /Type/Something /W[12 0 5 6] >> +endobj + +12 0 obj +1234 %eof +endobj + +startxref + 1384733 + +%actually I changed my mind + +startxref + 1274665676543 + +%%EOF", false); + + var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + + Assert.Equal(1274665676543, result); + } + + [Fact] + public void CanReadStartXrefIfCommentsPresent() + { + var input = StringBytesTestConverter.Convert(@" +startxref %Commented here + 57695 + +%%EOF", false); + + var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false); + + Assert.Equal(57695, result); + } + } +} diff --git a/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs b/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs index ccb8d162..744c8579 100644 --- a/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs +++ b/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs @@ -29,6 +29,8 @@ public byte CurrentByte { get; private set; } + public long Length => bytes.Count; + public byte? Peek() { if (currentOffset == bytes.Count - 1) diff --git a/src/UglyToad.Pdf/IO/IInputBytes.cs b/src/UglyToad.Pdf/IO/IInputBytes.cs index d09b6e45..f248d37a 100644 --- a/src/UglyToad.Pdf/IO/IInputBytes.cs +++ b/src/UglyToad.Pdf/IO/IInputBytes.cs @@ -8,6 +8,8 @@ byte CurrentByte { get; } + long Length { get; } + byte? 
Peek(); bool IsAtEnd(); diff --git a/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs b/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs index 07be39ad..0fcd448d 100644 --- a/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs +++ b/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs @@ -1,8 +1,11 @@ namespace UglyToad.Pdf.Parser.Parts { using System; - using System.Linq; + using System.Collections.Generic; + using Exceptions; using IO; + using Tokenization.Scanner; + using Tokenization.Tokens; /* * The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects. @@ -19,156 +22,136 @@ internal class FileTrailerParser { - private const int DefaultTrailerByteLength = 2048; + /// <summary> + /// Acrobat viewers require the EOF to be in the last 1024 bytes instead of at the end. + /// </summary> + private const int EndOfFileSearchRange = 1024; - private readonly byte[] endOfFileBytes; - private readonly byte[] startXRefBytes; - - public FileTrailerParser() + private static readonly byte[] EndOfFileBytes = { - endOfFileBytes = "%%EOF".Select(x => (byte)x).ToArray(); - startXRefBytes = "startxref".Select(x => (byte)x).ToArray(); - } + (byte)'%', + (byte)'%', + (byte)'E', + (byte)'O', + (byte)'F' + }; - public long GetXrefOffset(IRandomAccessRead reader, bool isLenientParsing) + private static readonly byte[] StartXRefBytes = { - var startXrefOffset = GetByteOffsetForStartXref(reader, (int)reader.Length(), isLenientParsing); - - reader.Seek(startXrefOffset); - - long actualXrefOffset = Math.Max(0, ParseXrefStartPosition(reader)); - - return actualXrefOffset; - } - - private long ParseXrefStartPosition(IRandomAccessRead reader) + (byte) 's', + (byte) 't', + (byte) 'a', + (byte) 'r', + (byte) 't', + (byte) 'x', + (byte) 'r', + (byte) 'e', + (byte) 'f' + }; + + public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing) { - long startXref = -1; - - if (ReadHelper.IsString(reader, startXRefBytes)) + if
(bytes == null) { - ReadHelper.ReadString(reader); - - ReadHelper.SkipSpaces(reader); - - // This integer is the byte offset of the first object referenced by the xref or xref stream - startXref = ReadHelper.ReadLong(reader); + throw new ArgumentNullException(nameof(bytes)); } - return startXref; - } - private long GetByteOffsetForStartXref(IRandomAccessRead reader, int fileLength, bool isLenientParsing) - { - byte[] buf; - long skipBytes; - // read trailing bytes into buffer - try + if (scanner == null) { - var trailByteCount = fileLength < DefaultTrailerByteLength ? fileLength : DefaultTrailerByteLength; - buf = new byte[trailByteCount]; + throw new ArgumentNullException(nameof(scanner)); + } - skipBytes = fileLength - trailByteCount; + var fileLength = bytes.Length; - reader.Seek(skipBytes); - int off = 0; - while (off < trailByteCount) + var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange; + + var startPosition = fileLength - offsetFromEnd; + + bytes.Seek(startPosition); + + var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd); + + scanner.Seek(startXrefPosition); + + if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref") + { + throw new InvalidOperationException($"The start xref position we found was not correct. 
Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}."); + } + + NumericToken numeric = null; + while (scanner.MoveNext()) + { + if (scanner.CurrentToken is NumericToken token) { - var readBytes = reader.Read(buf, off, trailByteCount - off); + numeric = token; + break; + } - // in order to not get stuck in a loop we check readBytes (this should never happen) - if (readBytes < 1) - { - throw new InvalidOperationException( - "No more bytes to read for trailing buffer, but expected: " - + (trailByteCount - off)); - } - - off += readBytes; + if (!(scanner.CurrentToken is CommentToken)) + { + throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}."); } } - finally + + if (numeric == null) { - reader.ReturnToBeginning(); + throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}."); } - // find last '%%EOF' - int bufOff = LastIndexOf(endOfFileBytes, buf, buf.Length); - if (bufOff < 0) + return numeric.Long; + } + + private static long GetStartXrefPosition(IInputBytes bytes, int offsetFromEnd) + { + var startXrefs = new List<int>(); + + var index = 0; + var eofIndex = 0; + var offset = 0; + + // Start scanning the last 1024 bytes. + while (bytes.MoveNext()) { - if (isLenientParsing) + offset++; + if (bytes.CurrentByte == StartXRefBytes[index]) { - // in lenient mode the '%%EOF' isn't needed - bufOff = buf.Length; - //LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'"); + // We might be reading "startxref". + eofIndex = 0; + index++; + } + else if (bytes.CurrentByte == EndOfFileBytes[eofIndex]) + { + // We might be reading "%%EOF".
+ eofIndex++; + index = 0; } else { - throw new InvalidOperationException("Missing end of file marker '%%EOF'"); + eofIndex = 0; + index = 0; } - } - // find last startxref preceding EOF marker - bufOff = LastIndexOf(startXRefBytes, buf, bufOff); - long startXRefOffset = skipBytes + bufOff; - if (bufOff < 0) - { - throw new NotImplementedException(); - //if (isLenientParsing) - //{ - // //LOG.debug("Performing brute force search for last startxref entry"); - // long bfOffset = bfSearchForLastStartxrefEntry(); - // bool offsetIsValid = false; - // if (bfOffset > -1) - // { - // reader.Seek(bfOffset); - // long bfXref = ParseXrefStartPosition(); - // if (bfXref > -1) - // { - // offsetIsValid = checkXRefOffset(bfXref) == bfXref; - // } - // } - - // reader.ReturnToBeginning(); - - // // use the new offset only if it is a valid pointer to a xref table - // return offsetIsValid ? bfOffset : -1; - //} - - throw new InvalidOperationException("Missing 'startxref' marker."); - } - - return startXRefOffset; - } - - private int LastIndexOf(byte[] pattern, byte[] bytes, int endOff) - { - int lastPatternByte = pattern.Length - 1; - - int bufferOffset = endOff; - int patternByte = lastPatternByte; - byte targetByte = pattern[patternByte]; - - while (--bufferOffset >= 0) - { - if (bytes[bufferOffset] == targetByte) + if (index == StartXRefBytes.Length) { - if (--patternByte < 0) - { - // whole pattern matched - return bufferOffset; - } - // matched current byte, advance to preceding one - targetByte = pattern[patternByte]; + // Add this "startxref" (position from the end of the document to the first 's'). + startXrefs.Add(offsetFromEnd - (offset - StartXRefBytes.Length)); + + // Continue scanning in case there are further "startxref"s. Not sure if this ever happens. 
+ index = 0; } - else if (patternByte < lastPatternByte) + else if (eofIndex == EndOfFileBytes.Length) { - // no byte match but already matched some chars; reset - patternByte = lastPatternByte; - targetByte = pattern[patternByte]; + // Stop at the EOF if present. + break; } } - return -1; + if (startXrefs.Count == 0) + { + throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters."); + } + + return bytes.Length - startXrefs[startXrefs.Count - 1]; } } } diff --git a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs index 593178d6..476cd064 100644 --- a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs @@ -29,9 +29,11 @@ var reader = new RandomAccessBuffer(fileBytes); - var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(fileBytes)); + var inputBytes = new ByteArrayInputBytes(fileBytes); - var document = OpenDocument(reader,tokenScanner, container, isLenientParsing); + var tokenScanner = new CoreTokenScanner(inputBytes); + + var document = OpenDocument(reader, inputBytes, tokenScanner, container, isLenientParsing); return document; } @@ -46,13 +48,13 @@ return Open(File.ReadAllBytes(filename), options); } - private static PdfDocument OpenDocument(IRandomAccessRead reader, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing) + private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing) { var log = container.Get(); var version = container.Get().Parse(scanner, isLenientParsing); - - var crossReferenceOffset = container.Get().GetXrefOffset(reader, isLenientParsing); + + var crossReferenceOffset = container.Get().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing); var pool = new CosObjectPool(); diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs 
b/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs index 0ba9f65c..3663eaca 100644 --- a/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs +++ b/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs @@ -1,5 +1,6 @@ namespace UglyToad.Pdf.Tokenization.Tokens { + using System; using System.Globalization; public class NumericToken : IDataToken @@ -10,14 +11,24 @@ public int Int { get; } + public bool IsBiggerThanInt { get; } + public long Long { get; } public NumericToken(decimal value) { Data = value; IsWhole = decimal.Floor(value) == value; - Int = (int) value; Long = (long) value; + + try + { + Int = (int) value; + } + catch (OverflowException) + { + IsBiggerThanInt = true; + } } public override string ToString()