Use pdfScanner in ReadVerticalDisplacements to resolve indirect references (fixes #693); when useLenientParsing is true, return 0 from CMap.ReadByte() at end of input instead of throwing (fixes #692)
Some checks failed
Build and test / build (push) Has been cancelled
Run Integration Tests / build (push) Has been cancelled

This commit is contained in:
BobLd
2024-10-18 23:17:00 +01:00
parent ea95a7ae7a
commit e10609e4e1
11 changed files with 74 additions and 34 deletions

View File

@@ -4,6 +4,36 @@
public class GithubIssuesTests
{
[Fact]
public void Issue693()
{
    // Regression test for issue #693: the sample document must open with
    // lenient parsing (missing fonts skipped) and page 1 must yield 1269 letters.
    var path = IntegrationHelpers.GetDocumentPath("reference-2-numeric-error.pdf");
    var options = new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = true };

    using var document = PdfDocument.Open(path, options);

    var firstPage = document.GetPage(1);
    Assert.Equal(1269, firstPage.Letters.Count);
}
[Fact]
public void Issue692()
{
    // Regression test for issue #692, run against the same sample document twice.
    var path = IntegrationHelpers.GetDocumentPath("cmap-parsing-exception.pdf");

    // Lenient parsing: page 1 is read successfully and yields 796 letters.
    var lenientOptions = new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = true };
    using (var lenientDocument = PdfDocument.Open(path, lenientOptions))
    {
        var firstPage = lenientDocument.GetPage(1);
        Assert.Equal(796, firstPage.Letters.Count);
    }

    // Strict parsing: reading page 1 must fail with the end-of-input error.
    var strictOptions = new ParsingOptions() { UseLenientParsing = false, SkipMissingFonts = false };
    using (var strictDocument = PdfDocument.Open(path, strictOptions))
    {
        var error = Assert.Throws<InvalidOperationException>(() => strictDocument.GetPage(1));
        Assert.StartsWith("Read byte called on input bytes which was at end of byte set.", error.Message);
    }
}
[Fact]
public void Issue874()
{

View File

@@ -7,7 +7,8 @@
[
"issue_671.pdf",
"GHOSTSCRIPT-698363-0.pdf",
"ErcotFacts.pdf"
"ErcotFacts.pdf",
"cmap-parsing-exception.pdf"
];
[Theory]

View File

@@ -166,7 +166,7 @@
cidFontFactory,
filterProvider,
pdfScanner,
parsingOptions.Logger);
parsingOptions);
var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader);

View File

@@ -1,11 +1,10 @@
namespace UglyToad.PdfPig
{
using System;
using System.Collections.Generic;
using System;
using System.Diagnostics.CodeAnalysis;
using Core;
using Filters;
using Parser.Parts;
using Parser.Parts;
using Tokenization.Scanner;
using Tokens;

View File

@@ -10,7 +10,7 @@
/// The CMap (character code map) maps character codes to character identifiers (CIDs).
/// The set of characters which a CMap refers to is the "character set" (charset).
/// </summary>
internal class CMap
internal sealed class CMap
{
public CharacterIdentifierSystemInfo Info { get; }
@@ -140,13 +140,12 @@
return 0;
}
public override string ToString()
{
return Name;
}
public int ReadCode(IInputBytes bytes)
public int ReadCode(IInputBytes bytes, bool useLenientParsing)
{
if (hasEmptyCodespace)
{
@@ -166,7 +165,7 @@
break;
}
result[i] = ReadByte(bytes);
result[i] = ReadByte(bytes, useLenientParsing);
}
for (int i = minCodeLength - 1; i < maxCodeLength; i++)
@@ -181,17 +180,23 @@
}
if (byteCount < maxCodeLength)
{
result[byteCount] = ReadByte(bytes);
result[byteCount] = ReadByte(bytes, useLenientParsing);
}
}
throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}.");
}
private static byte ReadByte(IInputBytes bytes)
private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)
{
if (!bytes.MoveNext())
{
if (useLenientParsing)
{
// See issue #692
return 0;
}
throw new InvalidOperationException("Read byte called on input bytes which was at end of byte set. Current offset: " + bytes.CurrentOffset);
}
@@ -208,6 +213,5 @@
}
return code;
}
}
}

View File

@@ -9,7 +9,7 @@
/// Defines the information content (actual text) of the font
/// as opposed to the display format.
/// </summary>
internal class ToUnicodeCMap
internal sealed class ToUnicodeCMap
{
private readonly CMap? cMap;
@@ -45,9 +45,9 @@
return cMap.TryConvertToUnicode(code, out value);
}
public int ReadCode(IInputBytes inputBytes)
public int ReadCode(IInputBytes inputBytes, bool useLenientParsing)
{
return cMap!.ReadCode(inputBytes);
return cMap!.ReadCode(inputBytes, useLenientParsing);
}
}
}

View File

@@ -21,6 +21,8 @@
private readonly Dictionary<int, CharacterBoundingBox> boundingBoxCache
= new Dictionary<int, CharacterBoundingBox>();
private readonly bool useLenientParsing;
public NameToken Name => BaseFont;
public NameToken BaseFont { get; }
@@ -41,6 +43,7 @@
CMap cmap,
CMap? toUnicodeCMap,
CMap? ucs2CMap,
ParsingOptions parsingOptions,
bool isChineseJapaneseOrKorean)
{
this.ucs2CMap = ucs2CMap;
@@ -52,13 +55,15 @@
ToUnicode = new ToUnicodeCMap(toUnicodeCMap);
Details = cidFont.Details?.WithName(Name.Data)
?? FontDetails.GetDefault(Name.Data);
useLenientParsing = parsingOptions.UseLenientParsing;
}
public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
{
var current = bytes.CurrentOffset;
var code = CMap.ReadCode(bytes);
var code = CMap.ReadCode(bytes, useLenientParsing);
codeLength = (int)(bytes.CurrentOffset - current);

View File

@@ -15,23 +15,25 @@
using Tokens;
using Util;
internal class Type0FontHandler : IFontHandler
internal sealed class Type0FontHandler : IFontHandler
{
private readonly CidFontFactory cidFontFactory;
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner scanner;
private readonly ILog logger;
private readonly ParsingOptions parsingOptions;
public Type0FontHandler(
CidFontFactory cidFontFactory,
ILookupFilterProvider filterProvider,
IPdfTokenScanner scanner,
ILog logger)
ParsingOptions parsingOptions)
{
this.cidFontFactory = cidFontFactory;
this.filterProvider = filterProvider;
this.scanner = scanner;
this.logger = logger;
logger = parsingOptions.Logger;
this.parsingOptions = parsingOptions;
}
public IFont Generate(DictionaryToken dictionary)
@@ -91,7 +93,7 @@
}
}
var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, isChineseJapaneseOrKorean);
var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, parsingOptions, isChineseJapaneseOrKorean);
return font;
}

View File

@@ -17,7 +17,7 @@
using UglyToad.PdfPig.Logging;
using Util;
internal class CidFontFactory
internal sealed class CidFontFactory
{
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner pdfScanner;
@@ -46,7 +46,7 @@
defaultWidth = defaultWidthToken.Double;
}
var verticalWritingMetrics = ReadVerticalDisplacements(dictionary);
var verticalWritingMetrics = ReadVerticalDisplacements(dictionary, pdfScanner);
FontDescriptor? descriptor = null;
if (TryGetFontDescriptor(dictionary, out var descriptorDictionary))
@@ -190,7 +190,7 @@
return widths;
}
private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict)
private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict, IPdfTokenScanner pdfScanner)
{
var verticalDisplacements = new Dictionary<int, double>();
var positionVectors = new Dictionary<int, PdfVector>();
@@ -210,22 +210,21 @@
}
// vertical metrics for individual CIDs.
if (dict.TryGet(NameToken.W2, out var w2Token) && w2Token is ArrayToken w2)
if (dict.TryGet(NameToken.W2, pdfScanner, out ArrayToken? w2))
{
for (var i = 0; i < w2.Data.Count; i++)
{
var c = (NumericToken)w2.Data[i];
var c = DirectObjectFinder.Get<NumericToken>(w2.Data[i], pdfScanner);
var next = w2.Data[++i];
if (next is ArrayToken array)
if (DirectObjectFinder.TryGet(next, pdfScanner, out ArrayToken? array))
{
for (var j = 0; j < array.Data.Count; j++)
{
var cid = c.Int + j;
// ReSharper disable InconsistentNaming
var w1y = (NumericToken)array.Data[j];
var v1x = (NumericToken)array.Data[++j];
var v1y = (NumericToken)array.Data[++j];
var w1y = DirectObjectFinder.Get<NumericToken>(array.Data[j], pdfScanner);
var v1x = DirectObjectFinder.Get<NumericToken>(array.Data[++j], pdfScanner);
var v1y = DirectObjectFinder.Get<NumericToken>(array.Data[++j], pdfScanner);
verticalDisplacements[cid] = w1y.Double;
@@ -236,9 +235,9 @@
{
var first = c.Int;
var last = ((NumericToken)next).Int;
var w1y = (NumericToken)w2.Data[++i];
var v1x = (NumericToken)w2.Data[++i];
var v1y = (NumericToken)w2.Data[++i];
var w1y = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
var v1x = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
var v1y = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
// ReSharper restore InconsistentNaming
for (var cid = first; cid <= last; cid++)
@@ -250,7 +249,7 @@
}
}
}
return new VerticalWritingMetrics(dw2, verticalDisplacements, positionVectors);
}