substitute the token scanner into the file trailer parsing and test

2025-10-14 19:05:01 +08:00 · 2018-01-03 22:29:09 +00:00
parent f09ef85e5a
commit 21be34a938
6 changed files with 287 additions and 126 deletions
--- a/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs
+++ b/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs
@@ -0,0 +1,161 @@
+namespace UglyToad.Pdf.Tests.Parser.Parts
+{
+    using System;
+    using Exceptions;
+    using Pdf.Parser.Parts;
+    using Pdf.Tokenization.Scanner;
+    using Xunit;
+
+    public class FileTrailerParserTests
+    {
+        private readonly FileTrailerParser parser = new FileTrailerParser(); // The parser under test.
+
+        [Fact]
+        public void FindsCompliantStartXref()
+        {
+            var input = StringBytesTestConverter.Convert(@"sta455%r endstream
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startxref
+    456
+
+%%EOF", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(456, result); // 456 is the number following the only "startxref" in the input.
+        }
+
+        [Fact]
+        public void IgnoresStartXrefFollowingEndOfFile()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+<< /Type/Something /W[12 0 5 6] >>
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startxref
+    1384733
+
+%%EOF
+
+% I decided to put some nonsense here:
+% because I could hahaha
+startxref
+17", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(1384733, result); // Not 17: that "startxref" comes after "%%EOF".
+        }
+
+        [Fact]
+        public void MissingStartXrefThrows()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+<< /Type/Something /W[12 0 5 6] >>
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startref
+    1384733
+
+%%EOF
+
+% I decided to put some nonsense here:
+% because I could hahaha
+start_rexf
+17", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Throws<PdfDocumentFormatException>(action); // The input only contains the misspellings "startref" and "start_rexf".
+        }
+
+        [Fact]
+        public void NullInputBytesThrows()
+        {
+            var input = StringBytesTestConverter.Convert("11 0 obj", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Throws<ArgumentNullException>(action); // The bytes argument is null.
+        }
+
+        [Fact]
+        public void NullScannerThrows()
+        {
+            var input = StringBytesTestConverter.Convert("11 0 obj", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, null, false);
+
+            Assert.Throws<ArgumentNullException>(action); // The scanner argument is null.
+        }
+
+        [Fact]
+        public void InvalidTokensAfterStartXrefThrows()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+        << /Type/Font >>
+endobj
+
+startxref 
+<< /Why (am i here?) >> 69
+%EOF", false);
+
+            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Throws<PdfDocumentFormatException>(action); // A dictionary sits between "startxref" and the number 69.
+        }
+
+        [Fact]
+        public void TakesLastStartXrefPrecedingEndOfFile()
+        {
+            var input = StringBytesTestConverter.Convert(@"11 0 obj
+<< /Type/Something /W[12 0 5 6] >>
+endobj
+
+12 0 obj
+1234  %eof
+endobj
+
+startxref
+    1384733
+
+%actually I changed my mind
+
+startxref
+         1274665676543
+
+%%EOF", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(1274665676543, result); // The later of the two "startxref" values before "%%EOF"; it is also too large for an int.
+        }
+
+        [Fact]
+        public void CanReadStartXrefIfCommentsPresent()
+        {
+            var input = StringBytesTestConverter.Convert(@"
+startxref %Commented here
+    57695
+
+%%EOF", false);
+
+            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
+
+            Assert.Equal(57695, result); // The comment between "startxref" and the number is skipped.
+        }
+    }
+}
--- a/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs
+++ b/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs
@@ -29,6 +29,8 @@

        public byte CurrentByte { get; private set; }

+        public long Length => bytes.Count; // The number of bytes in the backing collection.
+
        public byte? Peek()
        {
            if (currentOffset == bytes.Count - 1)
--- a/src/UglyToad.Pdf/IO/IInputBytes.cs
+++ b/src/UglyToad.Pdf/IO/IInputBytes.cs
@@ -8,6 +8,8 @@

        byte CurrentByte { get; }

+        long Length { get; } // NOTE(review): presumably the total number of bytes in the input (ByteArrayInputBytes returns bytes.Count) - confirm for other implementations.
+
        byte? Peek();
        
        bool IsAtEnd();
--- a/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs
+++ b/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs
@@ -1,8 +1,11 @@
 namespace UglyToad.Pdf.Parser.Parts
 {
    using System;
-    using System.Linq;
+    using System.Collections.Generic;
+    using Exceptions;
    using IO;
+    using Tokenization.Scanner;
+    using Tokenization.Tokens;

    /*
     * The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects. 
@@ -19,156 +22,168 @@

    internal class FileTrailerParser
    {
-        private const int DefaultTrailerByteLength = 2048;
+        /// <summary>
+        /// Acrobat viewers require the EOF to be in the last 1024 bytes instead of at the end.
+        /// </summary>
+        private const int EndOfFileSearchRange = 1024;

-        private readonly byte[] endOfFileBytes;
-        private readonly byte[] startXRefBytes;
-
-        public FileTrailerParser()
+        private static readonly byte[] EndOfFileBytes = 
        {
-            endOfFileBytes = "%%EOF".Select(x => (byte)x).ToArray();
-            startXRefBytes = "startxref".Select(x => (byte)x).ToArray();
-        }
+            (byte)'%',
+            (byte)'%',
+            (byte)'E',
+            (byte)'O',
+            (byte)'F'
+        };

-        public long GetXrefOffset(IRandomAccessRead reader, bool isLenientParsing)
+        private static readonly byte[] StartXRefBytes =
        {
-            var startXrefOffset = GetByteOffsetForStartXref(reader, (int)reader.Length(), isLenientParsing);
-
-            reader.Seek(startXrefOffset);
-
-            long actualXrefOffset = Math.Max(0, ParseXrefStartPosition(reader));
-
-            return actualXrefOffset;
-        }
-
-        private long ParseXrefStartPosition(IRandomAccessRead reader)
+            (byte) 's',
+            (byte) 't',
+            (byte) 'a',
+            (byte) 'r',
+            (byte) 't',
+            (byte) 'x',
+            (byte) 'r',
+            (byte) 'e',
+            (byte) 'f'
+        };
+        
+        /// <summary>
+        /// Finds the "startxref" marker near the end of the input and reads the numeric offset which follows it.
+        /// </summary>
+        /// <param name="bytes">The input to search; its position is changed by this method.</param>
+        /// <param name="scanner">The scanner used to read the "startxref" operator and the tokens following it.</param>
+        /// <param name="isLenientParsing">Not used by this method at present.</param>
+        /// <returns>The value of the first numeric token following "startxref".</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> or <paramref name="scanner"/> is null.</exception>
+        /// <exception cref="PdfDocumentFormatException">
+        /// No "startxref" was found, or it was followed by something other than comments and a number.
+        /// </exception>
+        /// <exception cref="InvalidOperationException">The scanner did not read "startxref" at the position found.</exception>
+        public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
        {
-            long startXref = -1;
-
-            if (ReadHelper.IsString(reader, startXRefBytes))
+            if (bytes == null)
            {
-                ReadHelper.ReadString(reader);
-
-                ReadHelper.SkipSpaces(reader);
-
-                // This integer is the byte offset of the first object referenced by the xref or xref stream
-                startXref = ReadHelper.ReadLong(reader);
+                throw new ArgumentNullException(nameof(bytes));
            }
-            return startXref;
-        }

-        private long GetByteOffsetForStartXref(IRandomAccessRead reader, int fileLength, bool isLenientParsing)
-        {
-            byte[] buf;
-            long skipBytes;
-            // read trailing bytes into buffer
-            try
+            if (scanner == null)
            {
-                var trailByteCount = fileLength < DefaultTrailerByteLength ? fileLength : DefaultTrailerByteLength;
-                buf = new byte[trailByteCount];
+                throw new ArgumentNullException(nameof(scanner));
+            }

-                skipBytes = fileLength - trailByteCount;
+            var fileLength = bytes.Length;

-                reader.Seek(skipBytes);
-                int off = 0;
-                while (off < trailByteCount)
+            var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;
+
+            var startPosition = fileLength - offsetFromEnd;
+
+            bytes.Seek(startPosition);
+
+            var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);
+
+            scanner.Seek(startXrefPosition);
+
+            if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref")
+            {
+                throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
+            }
+
+            NumericToken numeric = null;
+            while (scanner.MoveNext())
+            {
+                if (scanner.CurrentToken is NumericToken token)
                {
-                    var readBytes = reader.Read(buf, off, trailByteCount - off);
+                    numeric = token;
+                    break;
+                }

-                    // in order to not get stuck in a loop we check readBytes (this should never happen)
-                    if (readBytes < 1)
-                    {
-                        throw new InvalidOperationException(
-                                "No more bytes to read for trailing buffer, but expected: "
-                                        + (trailByteCount - off));
-                    }
-
-                    off += readBytes;
+                if (!(scanner.CurrentToken is CommentToken))
+                {
+                    throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
                }
            }
-            finally
+
+            if (numeric == null)
            {
-                reader.ReturnToBeginning();
+                throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
            }

-            // find last '%%EOF'
-            int bufOff = LastIndexOf(endOfFileBytes, buf, buf.Length);
-            if (bufOff < 0)
+            return numeric.Long;
+        }
+
+        /// <summary>
+        /// Scans forward from the current position of <paramref name="bytes"/> for "startxref" and "%%EOF" markers.
+        /// </summary>
+        /// <param name="bytes">The input, already positioned at the start of the region to search.</param>
+        /// <param name="offsetFromEnd">The distance from the start of the search region to the end of the input.</param>
+        /// <returns>
+        /// The position of the last "startxref" preceding the last "%%EOF", or of the last "startxref"
+        /// when the region contains no "%%EOF".
+        /// </returns>
+        /// <exception cref="PdfDocumentFormatException">No such "startxref" exists in the region.</exception>
+        private static long GetStartXrefPosition(IInputBytes bytes, int offsetFromEnd)
+        {
+            var startXrefs = new List<int>();
+
+            // How many entries of startXrefs preceded the most recent "%%EOF"; -1 until an "%%EOF" is seen.
+            var startXrefsBeforeEndOfFile = -1;
+
+            var index = 0;
+            var eofIndex = 0;
+            var offset = 0;
+            
+            // Start scanning the last 1024 bytes.
+            while (bytes.MoveNext())
            {
-                if (isLenientParsing)
+                offset++;
+                if (bytes.CurrentByte == StartXRefBytes[index])
                {
-                    // in lenient mode the '%%EOF' isn't needed
-                    bufOff = buf.Length;
-                    //LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'");
+                    // We might be reading "startxref".
+                    eofIndex = 0;
+                    index++;
+                }
+                else if (bytes.CurrentByte == EndOfFileBytes[eofIndex])
+                {
+                    // We might be reading "%%EOF".
+                    eofIndex++;
+                    index = 0;
                }
                else
                {
-                    throw new InvalidOperationException("Missing end of file marker '%%EOF'");
+                    // Neither partial match continues, but the byte which broke it may itself begin a marker ("sstartxref")
+                    // or extend a run of '%' ("%%%EOF" still ends in "%%"), so resume from the longest prefix it completes.
+                    eofIndex = bytes.CurrentByte == EndOfFileBytes[0] ? (eofIndex == 2 ? 2 : 1) : 0;
+                    index = bytes.CurrentByte == StartXRefBytes[0] ? 1 : 0;
                }
-            }
-            // find last startxref preceding EOF marker
-            bufOff = LastIndexOf(startXRefBytes, buf, bufOff);
-            long startXRefOffset = skipBytes + bufOff;

-            if (bufOff < 0)
-            {
-                throw new NotImplementedException();
-                //if (isLenientParsing)
-                //{
-                //    //LOG.debug("Performing brute force search for last startxref entry");
-                //    long bfOffset = bfSearchForLastStartxrefEntry();
-                //    bool offsetIsValid = false;
-                //    if (bfOffset > -1)
-                //    {
-                //        reader.Seek(bfOffset);
-                //        long bfXref = ParseXrefStartPosition();
-                //        if (bfXref > -1)
-                //        {
-                //            offsetIsValid = checkXRefOffset(bfXref) == bfXref;
-                //        }
-                //    }
-
-                //    reader.ReturnToBeginning();
-
-                //    // use the new offset only if it is a valid pointer to a xref table
-                //    return offsetIsValid ? bfOffset : -1;
-                //}
-
-                throw new InvalidOperationException("Missing 'startxref' marker.");
-            }
-
-            return startXRefOffset;
-        }
-
-        private int LastIndexOf(byte[] pattern, byte[] bytes, int endOff)
-        {
-            int lastPatternByte = pattern.Length - 1;
-
-            int bufferOffset = endOff;
-            int patternByte = lastPatternByte;
-            byte targetByte = pattern[patternByte];
-
-            while (--bufferOffset >= 0)
-            {
-                if (bytes[bufferOffset] == targetByte)
+                if (index == StartXRefBytes.Length)
                {
-                    if (--patternByte < 0)
-                    {
-                        // whole pattern matched
-                        return bufferOffset;
-                    }
-                    // matched current byte, advance to preceding one
-                    targetByte = pattern[patternByte];
+                    // Add this "startxref" (position from the end of the document to the first 's').
+                    startXrefs.Add(offsetFromEnd - (offset - StartXRefBytes.Length));
+
+                    // Continue scanning in case there are further "startxref"s, for example from incremental updates.
+                    index = 0;
                }
-                else if (patternByte < lastPatternByte)
+                else if (eofIndex == EndOfFileBytes.Length)
                {
-                    // no byte match but already matched some chars; reset
-                    patternByte = lastPatternByte;
-                    targetByte = pattern[patternByte];
+                    // Only "startxref"s preceding an "%%EOF" are trusted. Keep scanning rather than stopping here:
+                    // the region can hold several "%%EOF" markers and the last one marks the current end of file.
+                    startXrefsBeforeEndOfFile = startXrefs.Count;
+                    eofIndex = 0;
                }
            }

-            return -1;
+            // Without any "%%EOF" every "startxref" found is a candidate.
+            var candidateCount = startXrefsBeforeEndOfFile >= 0 ? startXrefsBeforeEndOfFile : startXrefs.Count;
+
+            if (candidateCount == 0)
+            {
+                throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters.");
+            }
+
+            return bytes.Length - startXrefs[candidateCount - 1];
        }
    }
 }
--- a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
@@ -29,9 +29,11 @@

            var reader = new RandomAccessBuffer(fileBytes);

-            var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(fileBytes));
+            var inputBytes = new ByteArrayInputBytes(fileBytes);

-            var document = OpenDocument(reader,tokenScanner, container,  isLenientParsing);
+            var tokenScanner = new CoreTokenScanner(inputBytes);
+
+            var document = OpenDocument(reader, inputBytes, tokenScanner, container,  isLenientParsing);

            return document;
        }
@@ -46,13 +48,13 @@
            return Open(File.ReadAllBytes(filename), options);
        }

-        private static PdfDocument OpenDocument(IRandomAccessRead reader, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
+        private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
        {
            var log = container.Get<ILog>();

            var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
-
-            var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing);
+            
+            var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);

            var pool = new CosObjectPool();
            
--- a/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs
@@ -1,5 +1,6 @@
 namespace UglyToad.Pdf.Tokenization.Tokens
 {
+    using System;
    using System.Globalization;

    public class NumericToken : IDataToken<decimal>
@@ -10,14 +11,24 @@

        public int Int { get; }

+        public bool IsBiggerThanInt { get; }
+
        public long Long { get; }

        public NumericToken(decimal value)
        {
            Data = value;
            IsWhole = decimal.Floor(value) == value;
-            Int = (int) value;
            Long = (long) value; // NOTE(review): unlike the int conversion below this cast is not guarded, so a value outside the range of long throws from the constructor.
+
+            try
+            {
+                Int = (int) value; // Throws OverflowException when the value does not fit in an int.
+            }
+            catch (OverflowException)
+            {
+                IsBiggerThanInt = true; // Int is left unassigned here; the same path is taken for values below int.MinValue.
+            }
        }

        public override string ToString()