substitute the token scanner into the file trailer parsing and test

2026-03-10 00:23:29 +08:00 · 2018-01-03 22:29:09 +00:00
parent f09ef85e5a
commit 21be34a938
6 changed files with 287 additions and 126 deletions
--- a/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs
+++ b/src/UglyToad.Pdf.Tests/Parser/Parts/FileTrailerParserTests.cs
@@ -0,0 +1,161 @@
 namespace UglyToad.Pdf.Tests.Parser.Parts
 {
    using System;
    using Exceptions;
    using Pdf.Parser.Parts;
    using Pdf.Tokenization.Scanner;
    using Xunit;
    /// <summary>
    /// Tests for <see cref="FileTrailerParser.GetFirstCrossReferenceOffset"/>, which locates the 'startxref'
    /// keyword near the end of the input and returns the numeric offset that follows it.
    /// Each fixture is a verbatim string, so its whitespace and line breaks are part of the parser input.
    /// </summary>
    public class FileTrailerParserTests
    {
        // A single parser instance is shared by every test in this class.
        private readonly FileTrailerParser parser = new FileTrailerParser();
        /// <summary>
        /// A well-formed trailer ('startxref', a number, then '%%EOF') yields that number.
        /// The fixture also contains the near-miss fragments 'sta455%r' and '%eof' ahead of the real markers.
        /// </summary>
        [Fact]
        public void FindsCompliantStartXref()
        {
            var input = StringBytesTestConverter.Convert(@"sta455%r endstream
 endobj
 12 0 obj
 1234  %eof
 endobj
 startxref
    456
 %%EOF", false);
            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
            Assert.Equal(456, result);
        }
        /// <summary>
        /// A second 'startxref' (value 17) placed after the '%%EOF' marker is expected to be ignored;
        /// the value preceding '%%EOF' is the one returned.
        /// </summary>
        [Fact]
        public void IgnoresStartXrefFollowingEndOfFile()
        {
            var input = StringBytesTestConverter.Convert(@"11 0 obj
 << /Type/Something /W[12 0 5 6] >>
 endobj
 12 0 obj
 1234  %eof
 endobj
 startxref
    1384733
 %%EOF
 % I decided to put some nonsense here:
 % because I could hahaha
 startxref
 17", false);
            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
            Assert.Equal(1384733, result);
        }
        /// <summary>
        /// The fixture only contains the misspellings 'startref' and 'start_rexf', never 'startxref',
        /// so a <see cref="PdfDocumentFormatException"/> is expected.
        /// </summary>
        [Fact]
        public void MissingStartXrefThrows()
        {
            var input = StringBytesTestConverter.Convert(@"11 0 obj
 << /Type/Something /W[12 0 5 6] >>
 endobj
 12 0 obj
 1234  %eof
 endobj
 startref
    1384733
 %%EOF
 % I decided to put some nonsense here:
 % because I could hahaha
 start_rexf
 17", false);
            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
            Assert.Throws<PdfDocumentFormatException>(action);
        }
        /// <summary>
        /// Passing null for the input bytes is expected to throw <see cref="ArgumentNullException"/>.
        /// </summary>
        [Fact]
        public void NullInputBytesThrows()
        {
            // The converted input is only needed to build a valid scanner; the bytes argument itself is null.
            var input = StringBytesTestConverter.Convert("11 0 obj", false);
            Action action = () => parser.GetFirstCrossReferenceOffset(null, new CoreTokenScanner(input.Bytes), false);
            Assert.Throws<ArgumentNullException>(action);
        }
        /// <summary>
        /// Passing null for the token scanner is expected to throw <see cref="ArgumentNullException"/>.
        /// </summary>
        [Fact]
        public void NullScannerThrows()
        {
            var input = StringBytesTestConverter.Convert("11 0 obj", false);
            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, null, false);
            Assert.Throws<ArgumentNullException>(action);
        }
        /// <summary>
        /// A dictionary, rather than a number, directly follows 'startxref' in this fixture,
        /// so a <see cref="PdfDocumentFormatException"/> is expected even though a number (69) appears later.
        /// </summary>
        [Fact]
        public void InvalidTokensAfterStartXrefThrows()
        {
            var input = StringBytesTestConverter.Convert(@"11 0 obj
        << /Type/Font >>
 endobj
 startxref 
 << /Why (am i here?) >> 69
 %EOF", false);
            Action action = () => parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
            Assert.Throws<PdfDocumentFormatException>(action);
        }
        /// <summary>
        /// Two 'startxref' entries precede '%%EOF'; the value of the later one is expected.
        /// That value (1274665676543) is larger than <see cref="int.MaxValue"/>, so this test also
        /// covers offsets that only fit in a long.
        /// </summary>
        [Fact]
        public void TakesLastStartXrefPrecedingEndOfFile()
        {
            var input = StringBytesTestConverter.Convert(@"11 0 obj
 << /Type/Something /W[12 0 5 6] >>
 endobj
 12 0 obj
 1234  %eof
 endobj
 startxref
    1384733
 %actually I changed my mind
 startxref
         1274665676543
 %%EOF", false);
            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
            Assert.Equal(1274665676543, result);
        }
        /// <summary>
        /// A '%' comment sits between 'startxref' and its number; the number is still expected to be returned.
        /// </summary>
        [Fact]
        public void CanReadStartXrefIfCommentsPresent()
        {
            var input = StringBytesTestConverter.Convert(@"
 startxref %Commented here
    57695
 %%EOF", false);
            var result = parser.GetFirstCrossReferenceOffset(input.Bytes, new CoreTokenScanner(input.Bytes), false);
            Assert.Equal(57695, result);
        }
    }
 }
--- a/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs
+++ b/src/UglyToad.Pdf/IO/ByteArrayInputBytes.cs
@@ -29,6 +29,8 @@
        public byte CurrentByte { get; private set; }
        public long Length => bytes.Count;
        public byte? Peek()
        {
            if (currentOffset == bytes.Count - 1)
--- a/src/UglyToad.Pdf/IO/IInputBytes.cs
+++ b/src/UglyToad.Pdf/IO/IInputBytes.cs
@@ -8,6 +8,8 @@
        byte CurrentByte { get; }
        long Length { get; }
        byte? Peek();
        bool IsAtEnd();
--- a/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs
+++ b/src/UglyToad.Pdf/Parser/Parts/FileTrailerParser.cs
@@ -1,8 +1,11 @@
 namespace UglyToad.Pdf.Parser.Parts
 {
    using System;
-    using System.Linq;
+    using System.Collections.Generic;
    using Exceptions;
    using IO;
    using Tokenization.Scanner;
    using Tokenization.Tokens;
    /*
     * The trailer of a PDF file allows us to quickly find the cross-reference table and other special objects. 
@@ -19,156 +22,136 @@
    internal class FileTrailerParser
    {
-        private const int DefaultTrailerByteLength = 2048;
+        /// <summary>
        /// Acrobat viewers require the EOF to be in the last 1024 bytes instead of at the end.
        /// </summary>
        private const int EndOfFileSearchRange = 1024;
-        private readonly byte[] endOfFileBytes;
+        private static readonly byte[] EndOfFileBytes = 
        private readonly byte[] startXRefBytes;
        public FileTrailerParser()
        {
-            endOfFileBytes = "%%EOF".Select(x => (byte)x).ToArray();
+            (byte)'%',
-            startXRefBytes = "startxref".Select(x => (byte)x).ToArray();
+            (byte)'%',
-        }
+            (byte)'E',
            (byte)'O',
            (byte)'F'
        };
-        public long GetXrefOffset(IRandomAccessRead reader, bool isLenientParsing)
+        private static readonly byte[] StartXRefBytes =
        {
-            var startXrefOffset = GetByteOffsetForStartXref(reader, (int)reader.Length(), isLenientParsing);
+            (byte) 's',
-
+            (byte) 't',
-            reader.Seek(startXrefOffset);
+            (byte) 'a',
-
+            (byte) 'r',
-            long actualXrefOffset = Math.Max(0, ParseXrefStartPosition(reader));
+            (byte) 't',
-
+            (byte) 'x',
-            return actualXrefOffset;
+            (byte) 'r',
-        }
+            (byte) 'e',
-
+            (byte) 'f'
-        private long ParseXrefStartPosition(IRandomAccessRead reader)
+        };
        public long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
        {
-            long startXref = -1;
+            if (bytes == null)
            if (ReadHelper.IsString(reader, startXRefBytes))
            {
-                ReadHelper.ReadString(reader);
+                throw new ArgumentNullException(nameof(bytes));
                ReadHelper.SkipSpaces(reader);
                // This integer is the byte offset of the first object referenced by the xref or xref stream
                startXref = ReadHelper.ReadLong(reader);
            }
            return startXref;
        }
-        private long GetByteOffsetForStartXref(IRandomAccessRead reader, int fileLength, bool isLenientParsing)
+            if (scanner == null)
        {
            byte[] buf;
            long skipBytes;
            // read trailing bytes into buffer
            try
            {
-                var trailByteCount = fileLength < DefaultTrailerByteLength ? fileLength : DefaultTrailerByteLength;
+                throw new ArgumentNullException(nameof(scanner));
-                buf = new byte[trailByteCount];
+            }
-                skipBytes = fileLength - trailByteCount;
+            var fileLength = bytes.Length;
-                reader.Seek(skipBytes);
+            var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;
-                int off = 0;
+
-                while (off < trailByteCount)
+            var startPosition = fileLength - offsetFromEnd;
            bytes.Seek(startPosition);
            var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);
            scanner.Seek(startXrefPosition);
            if (!scanner.TryReadToken(out OperatorToken startXrefToken) || startXrefToken.Data != "startxref")
            {
                throw new InvalidOperationException($"The start xref position we found was not correct. Found {startXrefPosition} but it was occupied by token {scanner.CurrentToken}.");
            }
            NumericToken numeric = null;
            while (scanner.MoveNext())
            {
                if (scanner.CurrentToken is NumericToken token)
                {
-                    var readBytes = reader.Read(buf, off, trailByteCount - off);
+                    numeric = token;
                    break;
                }
-                    // in order to not get stuck in a loop we check readBytes (this should never happen)
+                if (!(scanner.CurrentToken is CommentToken))
-                    if (readBytes < 1)
+                {
-                    {
+                    throw new PdfDocumentFormatException($"Found an unexpected token following 'startxref': {scanner.CurrentToken}.");
                        throw new InvalidOperationException(
                                "No more bytes to read for trailing buffer, but expected: "
                                        + (trailByteCount - off));
                    }
                    off += readBytes;
                }
            }
-            finally
+
            if (numeric == null)
            {
-                reader.ReturnToBeginning();
+                throw new PdfDocumentFormatException($"Could not find the numeric value following 'startxref'. Searching from position {startXrefPosition}.");
            }
-            // find last '%%EOF'
+            return numeric.Long;
-            int bufOff = LastIndexOf(endOfFileBytes, buf, buf.Length);
+        }
-            if (bufOff < 0)
+
        private static long GetStartXrefPosition(IInputBytes bytes, int offsetFromEnd)
        {
            var startXrefs = new List<int>();
            var index = 0;
            var eofIndex = 0;
            var offset = 0;
            // Start scanning the last 1024 bytes.
            while (bytes.MoveNext())
            {
-                if (isLenientParsing)
+                offset++;
                if (bytes.CurrentByte == StartXRefBytes[index])
                {
-                    // in lenient mode the '%%EOF' isn't needed
+                    // We might be reading "startxref".
-                    bufOff = buf.Length;
+                    eofIndex = 0;
-                    //LOG.debug("Missing end of file marker '" + new String(EOF_MARKER) + "'");
+                    index++;
                }
                else if (bytes.CurrentByte == EndOfFileBytes[eofIndex])
                {
                    // We might be reading "%%EOF".
                    eofIndex++;
                    index = 0;
                }
                else
                {
-                    throw new InvalidOperationException("Missing end of file marker '%%EOF'");
+                    eofIndex = 0;
                    index = 0;
                }
            }
            // find last startxref preceding EOF marker
            bufOff = LastIndexOf(startXRefBytes, buf, bufOff);
            long startXRefOffset = skipBytes + bufOff;
-            if (bufOff < 0)
+                if (index == StartXRefBytes.Length)
            {
                throw new NotImplementedException();
                //if (isLenientParsing)
                //{
                //    //LOG.debug("Performing brute force search for last startxref entry");
                //    long bfOffset = bfSearchForLastStartxrefEntry();
                //    bool offsetIsValid = false;
                //    if (bfOffset > -1)
                //    {
                //        reader.Seek(bfOffset);
                //        long bfXref = ParseXrefStartPosition();
                //        if (bfXref > -1)
                //        {
                //            offsetIsValid = checkXRefOffset(bfXref) == bfXref;
                //        }
                //    }
                //    reader.ReturnToBeginning();
                //    // use the new offset only if it is a valid pointer to a xref table
                //    return offsetIsValid ? bfOffset : -1;
                //}
                throw new InvalidOperationException("Missing 'startxref' marker.");
            }
            return startXRefOffset;
        }
        private int LastIndexOf(byte[] pattern, byte[] bytes, int endOff)
        {
            int lastPatternByte = pattern.Length - 1;
            int bufferOffset = endOff;
            int patternByte = lastPatternByte;
            byte targetByte = pattern[patternByte];
            while (--bufferOffset >= 0)
            {
                if (bytes[bufferOffset] == targetByte)
                {
-                    if (--patternByte < 0)
+                    // Add this "startxref" (position from the end of the document to the first 's').
-                    {
+                    startXrefs.Add(offsetFromEnd - (offset - StartXRefBytes.Length));
-                        // whole pattern matched
+
-                        return bufferOffset;
+                    // Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
-                    }
+                    index = 0;
                    // matched current byte, advance to preceding one
                    targetByte = pattern[patternByte];
                }
-                else if (patternByte < lastPatternByte)
+                else if (eofIndex == EndOfFileBytes.Length)
                {
-                    // no byte match but already matched some chars; reset
+                    // Stop at the EOF if present.
-                    patternByte = lastPatternByte;
+                    break;
                    targetByte = pattern[patternByte];
                }
            }
-            return -1;
+            if (startXrefs.Count == 0)
            {
                throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters.");
            }
            return bytes.Length - startXrefs[startXrefs.Count - 1];
        }
    }
 }
--- a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
@@ -29,9 +29,11 @@
            var reader = new RandomAccessBuffer(fileBytes);
-            var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(fileBytes));
+            var inputBytes = new ByteArrayInputBytes(fileBytes);
-            var document = OpenDocument(reader,tokenScanner, container,  isLenientParsing);
+            var tokenScanner = new CoreTokenScanner(inputBytes);
            var document = OpenDocument(reader, inputBytes, tokenScanner, container,  isLenientParsing);
            return document;
        }
@@ -46,13 +48,13 @@
            return Open(File.ReadAllBytes(filename), options);
        }
-        private static PdfDocument OpenDocument(IRandomAccessRead reader, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
+        private static PdfDocument OpenDocument(IRandomAccessRead reader, IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
        {
            var log = container.Get<ILog>();
            var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
-
+            
-            var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing);
+            var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
            var pool = new CosObjectPool();
--- a/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs
+++ b/src/UglyToad.Pdf/Tokenization/Tokens/NumericToken.cs
@@ -1,5 +1,6 @@
 namespace UglyToad.Pdf.Tokenization.Tokens
 {
    using System;
    using System.Globalization;
    public class NumericToken : IDataToken<decimal>
@@ -10,14 +11,24 @@
        public int Int { get; }
        public bool IsBiggerThanInt { get; }
        public long Long { get; }
        /// <summary>
        /// Create a numeric token for the given value and pre-compute its integer representations.
        /// </summary>
        /// <param name="value">The numeric value this token holds.</param>
        /// <exception cref="OverflowException">The value is outside the range of a <see cref="long"/>.</exception>
        public NumericToken(decimal value)
        {
            Data = value;
            IsWhole = decimal.Floor(value) == value;

            // Explicit decimal-to-integer conversions throw OverflowException when the value is out of range.
            Long = (long) value;

            try
            {
                // This conversion must only happen inside the try block: an unguarded cast here would throw
                // for any value beyond the int range before IsBiggerThanInt could be set.
                Int = (int) value;
            }
            catch (OverflowException)
            {
                // Int keeps its default of 0 in this case; Long holds the usable value.
                IsBiggerThanInt = true;
            }
        }
        public override string ToString()