Allow reading to continue when encountering an invalid surrogate pair

This was found while investigating the corpus at:
https://digitalcorpora.s3.amazonaws.com/s3_browser.html#corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/

The input file 0000000.pdf contained a UTF-16 surrogate pair in an input
defined as UCS-2. Parsers differ in how they handle this: Adobe
Acrobat seems to hard crash, PDF.js returns the same text we now
parse, and Chrome parses the intended text (2 invalid characters and
"ib exam"). We don't care too much about matching Chrome exactly,
so doing the same as Firefox is fine here.
This commit is contained in:
EliotJones 2025-07-15 18:59:18 -05:00 committed by BobLd
parent 1021729727
commit 31658ca020

View File

@ -147,6 +147,8 @@
public int ReadCode(IInputBytes bytes, bool useLenientParsing)
{
var myPosition = bytes.CurrentOffset;
if (hasEmptyCodespace)
{
var data = new byte[minCodeLength];
@ -184,7 +186,20 @@
}
}
throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}.");
// If we encounter invalid inputs we read min bytes and convert directly to an integer.
if (useLenientParsing)
{
bytes.Seek(myPosition);
for (var i = 0; i < minCodeLength; i++)
{
result[i] = ReadByte(bytes, useLenientParsing);
}
// https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java#L207
return ByteArrayToInt(result.AsSpan(0, minCodeLength));
}
throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}. Bytes: {BitConverter.ToString(result)}.");
}
private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)