From 31658ca02088504156d960119a6e053d20204622 Mon Sep 17 00:00:00 2001 From: EliotJones Date: Tue, 15 Jul 2025 18:59:18 -0500 Subject: [PATCH] allow reading to continue if encountering an invalid surrogate pair While investigating the corpus at https://digitalcorpora.s3.amazonaws.com/s3_browser.html#corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/ we found that the input file 0000000.pdf contained a UTF-16 surrogate pair in an input defined as UCS-2. The approach of various parsers varies here: Adobe Acrobat seems to hard crash, PDF.js returns the same text we now parse, and Chrome parses the intended text (2 invalid characters and "ib exam"). We don't care too much about matching Chrome exactly, so doing the same as Firefox (whose built-in viewer is PDF.js) is fine here. --- src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs b/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs index ab551dc1..59a2b7e7 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs @@ -147,6 +147,8 @@ public int ReadCode(IInputBytes bytes, bool useLenientParsing) { + var myPosition = bytes.CurrentOffset; + if (hasEmptyCodespace) { var data = new byte[minCodeLength]; @@ -184,7 +186,20 @@ } } - throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}."); + // If we encounter invalid inputs we read min bytes and convert directly to an integer. + if (useLenientParsing) + { + bytes.Seek(myPosition); + for (var i = 0; i < minCodeLength; i++) + { + result[i] = ReadByte(bytes, useLenientParsing); + } + + // https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java#L207 + return ByteArrayToInt(result.AsSpan(0, minCodeLength)); + } + + throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}. 
Bytes: {BitConverter.ToString(result)}."); } private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)