mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 04:48:53 +08:00
allow reading to continue if encountering an invalid surrogate pair
While investigating the corpus at https://digitalcorpora.s3.amazonaws.com/s3_browser.html#corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/ we found that the input file 0000000.pdf contained a UTF-16 surrogate pair in an input defined as UCS-2. The approach taken by various parsers differs here: Adobe Acrobat seems to hard crash, PDF.js returns the same text we now parse, and Chrome parses the intended text (2 invalid characters and "ib exam"). We don't care too much about matching Chrome exactly, so doing the same as Firefox (which renders PDFs with PDF.js) is fine here.
This commit is contained in:
parent
1021729727
commit
31658ca020
@ -147,6 +147,8 @@
|
||||
|
||||
public int ReadCode(IInputBytes bytes, bool useLenientParsing)
|
||||
{
|
||||
var myPosition = bytes.CurrentOffset;
|
||||
|
||||
if (hasEmptyCodespace)
|
||||
{
|
||||
var data = new byte[minCodeLength];
|
||||
@ -184,7 +186,20 @@
|
||||
}
|
||||
}
|
||||
|
||||
throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}.");
|
||||
// If we encounter invalid inputs we read min bytes and convert directly to an integer.
|
||||
if (useLenientParsing)
|
||||
{
|
||||
bytes.Seek(myPosition);
|
||||
for (var i = 0; i < minCodeLength; i++)
|
||||
{
|
||||
result[i] = ReadByte(bytes, useLenientParsing);
|
||||
}
|
||||
|
||||
// https://github.com/apache/pdfbox/blob/f81c7c5a06126db68aa985a0e755cdbffed7d270/fontbox/src/main/java/org/apache/fontbox/cmap/CMap.java#L207
|
||||
return ByteArrayToInt(result.AsSpan(0, minCodeLength));
|
||||
}
|
||||
|
||||
throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}. Bytes: {BitConverter.ToString(result)}.");
|
||||
}
|
||||
|
||||
private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)
|
||||
|
Loading…
Reference in New Issue
Block a user