Allow reading to continue when an invalid surrogate pair is encountered

While investigating the corpus at https://digitalcorpora.s3.amazonaws.com/s3_browser.html#corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/, the input file 0000000.pdf was found to contain a UTF-16 surrogate pair in an input defined as UCS-2. Parsers handle this differently: Adobe Acrobat seems to hard crash, PDF.js returns the same text we now parse, and Chrome parses the intended text (2 invalid characters and "ib exam"). We don't care too much about matching Chrome exactly, so doing the same as Firefox (whose built-in viewer is PDF.js) is fine here.
2025-08-20 04:48:53 +08:00 · 2025-07-15 18:59:18 -05:00 · 2025-07-15 18:59:18 -05:00 · 31658ca020
commit 31658ca020
parent 1021729727
1 changed files with 16 additions and 1 deletions
--- a/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs
+++ b/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs
@ -147,6 +147,8 @@

        public int ReadCode(IInputBytes bytes, bool useLenientParsing)
        {
+            var myPosition = bytes.CurrentOffset;
+
            if (hasEmptyCodespace)
            {
                var data = new byte[minCodeLength];
@ -184,7 +186,20 @@
                }
            }

-            throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}.");
+            // If we encounter invalid inputs we read min bytes and convert directly to an integer.
+            if (useLenientParsing)
+            {
+                bytes.Seek(myPosition);
+                for (var i = 0; i < minCodeLength; i++)
+                {
+                    result[i] = ReadByte(bytes, useLenientParsing);
+                }
+
+                // https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java#L207
+                return ByteArrayToInt(result.AsSpan(0, minCodeLength));
+            }
+
+            throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}. Bytes: {BitConverter.ToString(result)}.");
        }

        private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)