Special-case handling for faulty xref offsets where whitespace is missing between %%EOF and the object number

2025-09-21 04:17:57 +08:00 · 2019-06-14 20:40:24 +01:00
parent 4c716fcbd6
commit 98424b32aa
2 changed files with 51 additions and 29 deletions
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@@ -4,6 +4,7 @@
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.IO;
+    using System.Text.RegularExpressions;
    using Encryption;
    using Exceptions;
    using Filters;
@@ -26,6 +27,8 @@

    internal class PdfTokenScanner : IPdfTokenScanner
    {
+        private static readonly Regex EndsWithNumberRegex = new Regex(@"(?<=^[^\s\d]+)\d+$");
+
        private readonly IInputBytes inputBytes;
        private readonly IObjectLocationProvider objectLocationProvider;
        private readonly IFilterProvider filterProvider;
@@ -96,7 +99,26 @@

            if (objectNumber == null || generation == null)
            {
-                return false;
+                // Handle the case where the scanner reads most of an object header correctly but the first token includes
+                // too much: specifically "%%EOF1 0 obj", where scanning starts from 'F' and the object number is fused onto the preceding text.
+                if (generation != null && previousTokens[0] is OperatorToken op)
+                {
+                    var match = EndsWithNumberRegex.Match(op.Data);
+
+                    if (match.Success && int.TryParse(match.Value, out var number))
+                    {
+                        startPosition = previousTokenPositions[0] + match.Index;
+                        objectNumber = new NumericToken(number);
+                    }
+                    else
+                    {
+                        return false;
+                    }
+                }
+                else
+                {
+                    return false;
+                }
            }

            // Read all tokens between obj and endobj.