From 21be34a9382c1d5192c3e96b766e227fa2da386b Mon Sep 17 00:00:00 2001
From: Eliot Jones <elioty@hotmail.co.uk>
Date: Wed, 3 Jan 2018 22:29:09 +0000
Subject: [PATCH] substitute the token scanner into the file trailer parsing
 and test

---
 .../Parser/Parts/FileTrailerParserTests.cs    | 161 +++++++++++++
 src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs    |   2 +
 src/UglyToad.Pdf/IO/IInputBytes.cs            |   2 +
 .../Parser/Parts/FileTrailerParser.cs         | 223 ++++++++----------
 src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs |  12 +-
 .../Tokenization/Tokens/NumericToken.cs       |  13 +-
 6 files changed, 287 insertions(+), 126 deletions(-)
 create mode 100644 src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs

diff --git a/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs b/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs
new file mode 100644
index 00000000..ce59fade
--- /dev/null
+++ b/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs
@@ -0,0 +1,161 @@
+﻿namespace UglyToad.Pdf.Tests.Parser.Parts
+{
+    using System;
+    using Exceptions;
+    using Pdf.Parser.Parts;
+    using Pdf.Tokenization.Scanner;
+    using Xunit;
+
+    public class FileTrailerParserTests
+    {
+        private readonly FileTrailerParser parser = new FileTrailerParser();
+
+        [Fact]
+        public void FindsCompliantStartXref()
+        {
+            var input = StringBytesTestConverter.Convert(@"sta455%r endstream
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startxref
+    456
+
+%%EOF", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(456, result);
+        }
+
+        [Fact]
+        public void IgnoresStartXrefFollowingEndOfFile()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+<< /Type/Something /W[12 0 5 6] >>
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startxref
+    1384733
+
+%%EOF
+
+% I decided to put some nonsense here:
+% because I could hahaha
+startxref
+17", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(1384733, result);
+        }
+
+        [Fact]
+        public void MissingStartXrefThrows()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+<< /Type/Something /W[12 0 5 6] >>
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startref
+    1384733
+
+%%EOF
+
+% I decided to put some nonsense here:
+% because I could hahaha
+start_rexf
+17", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Throws<PdfDocumentFormatException>(action);
+        }
+
+        [Fact]
+        public void NullInputBytesThrows()
+        {
+            var input = StringBytesTestConverter.Convert("11 0 obj", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Throws<ArgumentNullException>(action);
+        }
+
+        [Fact]
+        public void NullScannerThrows()
+        {
+            var input = StringBytesTestConverter.Convert("11 0 obj", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, null, false);
+
+            Assert.Throws<ArgumentNullException>(action);
+        }
+
+        [Fact]
+        public void InvalidTokensAfterStartXrefThrows()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+        << /Type/Font >>
+endobj
+
+startxref 
+<< /Why (am i here?) >> 69
+%EOF", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Throws<PdfDocumentFormatException>(action);
+        }
+
+        [Fact]
+        public void TakesLastStartXrefPrecedingEndOfFile()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+<< /Type/Something /W[12 0 5 6] >>
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startxref
+    1384733
+
+%actually I changed my mind
+
+startxref
+         1274665676543
+
+%%EOF", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(1274665676543, result);
+        }
+
+        [Fact]
+        public void CanReadStartXrefIfCommentsPresent()
+        {
+            var input = StringBytesTestConverter.Convert(@"
+startxref %Commented here
+    57695
+
+%%EOF", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(57695, result);
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs b/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs
index ccb8d162..744c8579 100644
--- a/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs
+++ b/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs
@@ -29,6 +29,8 @@
 
         public byte CurrentByte { get; private set; }
 
+        public long Length => bytes.Count;
+
         public byte? Peek()
         {
             if (currentOffset == bytes.Count - 1)
diff --git a/src/UglyToad.Pdf/IO/IInputBytes.cs b/src/UglyToad.Pdf/IO/IInputBytes.cs
index d09b6e45..f248d37a 100644
--- a/src/UglyToad.Pdf/IO/IInputBytes.cs
+++ b/src/UglyToad.Pdf/IO/IInputBytes.cs
@@ -8,6 +8,8 @@
 
         byte CurrentByte { get; }
 
+        long Length { get; }
+
         byte? Peek();
         
         bool IsAtEnd();
diff --git a/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs b/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs
index 07be39ad..0fcd448d 100644
--- a/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs
+++ b/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs
@@ -1,8 +1,11 @@
 ﻿namespace UglyToad.Pdf.Parser.Parts
 {
     using System;
-    using System.Linq;
+    using System.Collections.Generic;
+    using Exceptions;
     using IO;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;
 
     /*
      * The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects. 
@@ -19,156 +22,136 @@
 
     internal class FileTrailerParser
     {
-        private const int DefaultTrailerByteLength = 2048;
+        /// <summary>
+        /// Acrobat viewers only require the %%EOF marker to appear somewhere within the last 1024 bytes of the file, not at the very end.
+        /// </summary>
+        private const int EndOfFileSearchRange = 1024;
 
-        private readonly byte[] endOfFileBytes;
-        private readonly byte[] startXRefBytes;
-
-        public FileTrailerParser()
+        private static readonly byte[] EndOfFileBytes = 
         {
-            endOfFileBytes = "%%EOF".Select(x => (byte)x).ToArray();
-            startXRefBytes = "startxref".Select(x => (byte)x).ToArray();
-        }
+            (byte)'%',
+            (byte)'%',
+            (byte)'E',
+            (byte)'O',
+            (byte)'F'
+        };
 
-        public long GetXrefOffset(IRandomAccessRead reader, bool isLenientParsing)
+        private static readonly byte[] StartXRefBytes =
         {
-            var startXrefOffset = GetByteOffsetForStartXref(reader, (int)reader.Length(), isLenientParsing);
-
-            reader.Seek(startXrefOffset);
-
-            long actualXrefOffset = Math.Max(0, ParseXrefStartPosition(reader));
-
-            return actualXrefOffset;
-        }
-
-        private long ParseXrefStartPosition(IRandomAccessRead reader)
+            (byte) 's',
+            (byte) 't',
+            (byte) 'a',
+            (byte) 'r',
+            (byte) 't',
+            (byte) 'x',
+            (byte) 'r',
+            (byte) 'e',
+            (byte) 'f'
+        };
+        
+        public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
         {
-            long startXref = -1;
-
-            if (ReadHelper.IsString(reader, startXRefBytes))
+            if (bytes == null)
             {
-                ReadHelper.ReadString(reader);
-
-                ReadHelper.SkipSpaces(reader);
-
-                // This integer is the byte offset of the first object referenced by the xref or xref stream
-                startXref = ReadHelper.ReadLong(reader);
+                throw new ArgumentNullException(nameof(bytes));
             }
-            return startXref;
-        }

-        private long GetByteOffsetForStartXref(IRandomAccessRead reader, int fileLength, bool isLenientParsing)
-        {
-            byte[] buf;
-            long skipBytes;
-            // read trailing bytes into buffer
-            try
+            if (scanner == null)
             {
-                var trailByteCount = fileLength < DefaultTrailerByteLength ? fileLength : DefaultTrailerByteLength;
-                buf = new byte[trailByteCount];
+                throw new ArgumentNullException(nameof(scanner));
+            }

-                skipBytes = fileLength - trailByteCount;
+            var fileLength = bytes.Length;

-                reader.Seek(skipBytes);
-                int off = 0;
-                while (off < trailByteCount)
+            var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;
+            // Only the tail is searched: EndOfFileSearchRange bytes, or the whole input when it is shorter than that.
+            var startPosition = fileLength - offsetFromEnd;
+
+            bytes.Seek(startPosition);
+            // Find the "startxref" marker by scanning the raw bytes from the position just seeked to.
+            var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);
+            // Re-read that position through the token scanner to confirm it really holds the "startxref" operator.
+            scanner.Seek(startXrefPosition);
+
+            if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref")
+            {
+                throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
+            }
+            // The result is the first numeric token after the marker; only comment tokens may come before it.
+            NumericToken numeric = null;
+            while (scanner.MoveNext())
+            {
+                if (scanner.CurrentToken is NumericToken token)
                 {
-                    var readBytes = reader.Read(buf, off, trailByteCount - off);
+                    numeric = token;
+                    break;
+                }

-                    // in order to not get stuck in a loop we check readBytes (this should never happen)
-                    if (readBytes < 1)
-                    {
-                        throw new InvalidOperationException(
-                                "No more bytes to read for trailing buffer, but expected: "
-                                        + (trailByteCount - off));
-                    }
-
-                    off += readBytes;
+                if (!(scanner.CurrentToken is CommentToken))
+                {
+                    throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
                 }
             }
-            finally
+
+            if (numeric == null)
             {
-                reader.ReturnToBeginning();
+                throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
             }

-            // find last '%%EOF'
-            int bufOff = LastIndexOf(endOfFileBytes, buf, buf.Length);
-            if (bufOff < 0)
+            return numeric.Long;
+        }
+
+        /// <summary>
+        /// Scans the remaining input and returns the position of the 'startxref' closed by the last '%%EOF' (or of the last 'startxref' when no '%%EOF' is present).
+        /// </summary>
+        private static long GetStartXrefPosition(IInputBytes bytes, int offsetFromEnd)
+        {
+            // Position of the most recent "startxref", and of the one most recently followed by "%%EOF" (null until an "%%EOF" is seen).
+            long latest = -1;
+            long? confirmed = null;
+            int index = 0, eofIndex = 0, offset = 0;
+            while (bytes.MoveNext())
             {
-                if (isLenientParsing)
+                offset++;
+                if (bytes.CurrentByte == StartXRefBytes[index])
                 {
-                    // in lenient mode the '%%EOF' isn't needed
-                    bufOff = buf.Length;
-                    //LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'");
+                    // We might be reading "startxref".
+                    eofIndex = 0;
+                    index++;
+                }
+                else if (bytes.CurrentByte == EndOfFileBytes[eofIndex])
+                {
+                    // We might be reading "%%EOF".
+                    eofIndex++;
+                    index = 0;
                 }
                 else
                 {
-                    throw new InvalidOperationException("Missing end of file marker '%%EOF'");
+                    // Mismatch: this byte may itself start "startxref" or extend a run of '%' (e.g. "%%%EOF"), so retest it instead of dropping it.
+                    index = bytes.CurrentByte == StartXRefBytes[0] ? 1 : 0;
+                    eofIndex = bytes.CurrentByte != EndOfFileBytes[0] ? 0 : (eofIndex == 2 ? 2 : 1);
                 }
-            }
-            // find last startxref preceding EOF marker
-            bufOff = LastIndexOf(startXRefBytes, buf, bufOff);
-            long startXRefOffset = skipBytes + bufOff;

-            if (bufOff < 0)
-            {
-                throw new NotImplementedException();
-                //if (isLenientParsing)
-                //{
-                //    //LOG.debug("Performing brute force search for last startxref entry");
-                //    long bfOffset = bfSearchForLastStartxrefEntry();
-                //    bool offsetIsValid = false;
-                //    if (bfOffset > -1)
-                //    {
-                //        reader.Seek(bfOffset);
-                //        long bfXref = ParseXrefStartPosition();
-                //        if (bfXref > -1)
-                //        {
-                //            offsetIsValid = checkXRefOffset(bfXref) == bfXref;
-                //        }
-                //    }
-
-                //    reader.ReturnToBeginning();
-
-                //    // use the new offset only if it is a valid pointer to a xref table
-                //    return offsetIsValid ? bfOffset : -1;
-                //}
-
-                throw new InvalidOperationException("Missing 'startxref' marker.");
-            }
-
-            return startXRefOffset;
-        }
-
-        private int LastIndexOf(byte[] pattern, byte[] bytes, int endOff)
-        {
-            int lastPatternByte = pattern.Length - 1;
-
-            int bufferOffset = endOff;
-            int patternByte = lastPatternByte;
-            byte targetByte = pattern[patternByte];
-
-            while (--bufferOffset >= 0)
-            {
-                if (bytes[bufferOffset] == targetByte)
+                if (index == StartXRefBytes.Length)
                 {
-                    if (--patternByte < 0)
-                    {
-                        // whole pattern matched
-                        return bufferOffset;
-                    }
-                    // matched current byte, advance to preceding one
-                    targetByte = pattern[patternByte];
+                    // Record the absolute position of the leading 's' and keep scanning for later markers.
+                    latest = bytes.Length - (offsetFromEnd - (offset - StartXRefBytes.Length));
+                    index = 0;
                 }
-                else if (patternByte < lastPatternByte)
+                else if (eofIndex == EndOfFileBytes.Length)
                 {
-                    // no byte match but already matched some chars; reset
-                    patternByte = lastPatternByte;
-                    targetByte = pattern[patternByte];
+                    // Do not stop here: an incrementally updated file has one "%%EOF" per revision and the last one wins.
+                    confirmed = latest;
+                    eofIndex = 0;
                 }
             }

-            return -1;
+            var position = confirmed ?? latest;
+            if (position < 0)
+            {
+                throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters.");
+            }
+            return position;
         }
     }
 }
diff --git a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
index 593178d6..476cd064 100644
--- a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
@@ -29,9 +29,11 @@
 
             var reader = new RandomAccessBuffer(fileBytes);
 
-            var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(fileBytes));
+            var inputBytes = new ByteArrayInputBytes(fileBytes);
 
-            var document = OpenDocument(reader,tokenScanner, container,  isLenientParsing);
+            var tokenScanner = new CoreTokenScanner(inputBytes);
+
+            var document = OpenDocument(reader, inputBytes, tokenScanner, container,  isLenientParsing);
 
             return document;
         }
@@ -46,13 +48,13 @@
             return Open(File.ReadAllBytes(filename), options);
         }
 
-        private static PdfDocument OpenDocument(IRandomAccessRead reader, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
+        private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
         {
             var log = container.Get<ILog>();
 
             var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
-
-            var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing);
+            
+            var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
 
             var pool = new CosObjectPool();
             
diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs
index 0ba9f65c..3663eaca 100644
--- a/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs
@@ -1,5 +1,6 @@
 ﻿namespace UglyToad.Pdf.Tokenization.Tokens
 {
+    using System;
     using System.Globalization;
 
     public class NumericToken : IDataToken<decimal>
@@ -10,14 +11,24 @@
 
         public int Int { get; }
 
+        public bool IsBiggerThanInt { get; }
+
         public long Long { get; }
 
         public NumericToken(decimal value)
         {
             Data = value;
             IsWhole = decimal.Floor(value) == value;
-            Int = (int) value;
             Long = (long) value;
+
+            // An out-of-range decimal-to-int cast always throws, so test the range up front
+            // (on the truncated value, because the cast rounds towards zero) instead of catching.
+            var truncated = decimal.Truncate(value);
+            IsBiggerThanInt = truncated > int.MaxValue || truncated < int.MinValue;
+            if (!IsBiggerThanInt)
+            {
+                Int = (int) value;
+            }
         }
 
         public override string ToString()