Handle the format header being preceded by junk

Some files seem to have the format header preceded by large amounts of junk, but such files appear to be treated as valid by Chrome and Acrobat Reader. This change raises the number of junk tokens that may be read prior to the version header to 25, and applies that tolerance whether or not lenient parsing is enabled. It also makes parsing of the version header culture-invariant, which may be related to #85.
2025-10-15 03:34:52 +08:00 · 2020-01-25 16:53:41 +00:00
parent d9492ab2f8
commit a561c8954e
2 changed files with 13 additions and 9 deletions
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/FileHeaderParserTests.cs
@@ -60,14 +60,14 @@
        }

        [Fact]
-        public void HeaderPrecededByJunkNonLenientThrows()
+        public void HeaderPrecededByJunkNonLenientDoesNotThrow()
        {
            var scanner = StringBytesTestConverter.Scanner(@"one    
    %PDF-1.2");

-            Action action = () => FileHeaderParser.Parse(scanner, false, log);
+            var result = FileHeaderParser.Parse(scanner, false, log);

-            Assert.Throws<PdfDocumentFormatException>(action);
+            Assert.Equal(1.2m, result.Version);
        }

        [Fact]
@@ -82,14 +82,14 @@
        }

        [Fact]
-        public void HeaderPrecededByTooMuchJunkThrows()
+        public void HeaderPrecededByJunkDoesNotThrow()
        {
            var scanner = StringBytesTestConverter.Scanner(@"one two
 three %PDF-1.6");

-            Action action = () => FileHeaderParser.Parse(scanner, true, log);
+            var result = FileHeaderParser.Parse(scanner, true, log);

-            Assert.Throws<PdfDocumentFormatException>(action);
+            Assert.Equal(1.6m, result.Version);
        }

        [Fact]
--- a/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs
@@ -1,6 +1,7 @@
 namespace UglyToad.PdfPig.Parser.FileStructure
 {
    using System;
+    using System.Globalization;
    using Content;
    using Core;
    using Logging;
@@ -43,11 +44,11 @@

            var comment = scanner.CurrentToken as CommentToken;

-            var junkSkip = isLenientParsing ? 2 : 0;
+            const int junkTokensTolerance = 25;
            var attempts = 0;
            while (comment == null)
            {
-                if (attempts == junkSkip)
+                if (attempts == junkTokensTolerance)
                {
                    throw new PdfDocumentFormatException("Could not find the version header comment at the start of the document.");
                }
@@ -69,7 +70,10 @@

            const int toDecimalStartLength = 4;

-            if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength), out var version))
+            if (!decimal.TryParse(comment.Data.Substring(toDecimalStartLength), 
+                NumberStyles.Number,
+                CultureInfo.InvariantCulture,
+                out var version))
            {
                return HandleMissingVersion(comment, isLenientParsing, log);
            }