bug fix for indirect page link, bug fix for array in base font range in cmap

This commit is contained in:
Eliot Jones
2018-01-07 11:51:18 +00:00
parent c75b9d10bd
commit a6c3dba25a
4 changed files with 115 additions and 23 deletions

View File

@@ -0,0 +1,61 @@
namespace UglyToad.Pdf.Tests.Fonts.Parser.Parts
{
    using Pdf.Fonts.Cmap;
    using Pdf.Fonts.Parser.Parts;
    using Pdf.Tokenization.Tokens;
    using Xunit;

    /// <summary>
    /// Exercises <see cref="BaseFontRangeParser"/> against bfrange rows whose destination is
    /// written as a hex string, an array of hex strings, or an array of names.
    /// </summary>
    public class BaseFontRangeParserTests
    {
        private readonly BaseFontRangeParser parser = new BaseFontRangeParser();

        /// <summary>
        /// A destination array of hex strings yields one mapping per source code in the range.
        /// </summary>
        [Fact]
        public void CanParseWithArray()
        {
            var result = RunParser("<0003> <0004> [<0020> <0041>]", new NumericToken(1));

            Assert.Equal(2, result.BaseFontCharacterMap.Count);

            Assert.Equal(" ", result.BaseFontCharacterMap[3]);
            Assert.Equal("A", result.BaseFontCharacterMap[4]);
        }

        /// <summary>
        /// A single hex destination is incremented alongside the source code across the range.
        /// </summary>
        [Fact]
        public void CanParseWithHex()
        {
            var result = RunParser("<8141> <8147> <8141>", new NumericToken(1));

            Assert.Equal(7, result.BaseFontCharacterMap.Count);

            // 0x8141 == 33089 is the first code of the range, 0x8142 == 33090 the second.
            Assert.Equal("腁", result.BaseFontCharacterMap[0x8141]);
            Assert.Equal(char.ConvertFromUtf32(0x8142), result.BaseFontCharacterMap[0x8142]);
        }

        /// <summary>
        /// Two consecutive rows may use different destination formats (hex, then a name array).
        /// </summary>
        [Fact]
        public void CanParseTwoRowsWithDifferentFormat()
        {
            const string rows = @"<0019> <001B> <3C>
<0001> <0003> [/happy /feet /penguin]";

            var result = RunParser(rows, new NumericToken(2));

            Assert.Equal(6, result.BaseFontCharacterMap.Count);

            Assert.Equal("happy", result.BaseFontCharacterMap[1]);
            Assert.Equal("feet", result.BaseFontCharacterMap[2]);
            Assert.Equal("penguin", result.BaseFontCharacterMap[3]);

            // 0x19 == 25 is the first source code of the hex-destination row.
            Assert.Equal("<", result.BaseFontCharacterMap[25]);
        }

        /// <summary>
        /// Tokenizes <paramref name="cmapText"/>, feeds it to the parser under test with
        /// non-lenient parsing, and returns the builder that collected the mappings.
        /// </summary>
        /// <param name="cmapText">The bfrange rows to parse.</param>
        /// <param name="rowCount">The number token passed to the parser as the row count.</param>
        private CharacterMapBuilder RunParser(string cmapText, NumericToken rowCount)
        {
            var scanner = StringBytesTestConverter.Scanner(cmapText);
            var collected = new CharacterMapBuilder();

            parser.Parse(rowCount, scanner, collected, false);

            return collected;
        }
    }
}

View File

@@ -61,5 +61,14 @@
Assert.Contains("Söderberg", page.Text); Assert.Contains("Söderberg", page.Text);
} }
} }
//[Fact]
//public void localFileTest()
//{
// using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\CV.pdf"))
// {
// var page = document.GetPage(1);
// }
//}
} }
} }

View File

@@ -9,6 +9,7 @@
using IO; using IO;
using Logging; using Logging;
using Parser; using Parser;
using Parser.Parts;
internal class Pages internal class Pages
{ {
@@ -111,7 +112,7 @@
foreach (var kid in kids.OfType<CosObject>()) foreach (var kid in kids.OfType<CosObject>())
{ {
// todo: exit early // todo: exit early
var child = pdfObjectParser.Parse(kid.ToIndirectReference(), reader, isLenientParsing) as PdfDictionary; var child = DirectObjectFinder.Find<PdfDictionary>(kid, pdfObjectParser, reader, isLenientParsing);
var thisPageMatches = FindPage(child, soughtPageNumber, pageNumbersObserved); var thisPageMatches = FindPage(child, soughtPageNumber, pageNumbersObserved);

View File

@@ -4,35 +4,39 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
using Cmap; using Cmap;
using Exceptions;
using Tokenization.Scanner; using Tokenization.Scanner;
using Tokenization.Tokens; using Tokenization.Tokens;
/// <summary>
/// The beginbfrange and endbfrange operators map a number of ranges of input codes to the corresponding output code ranges.
/// </summary>
internal class BaseFontRangeParser : ICidFontPartParser<NumericToken> internal class BaseFontRangeParser : ICidFontPartParser<NumericToken>
{ {
public void Parse(NumericToken numeric, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing) public void Parse(NumericToken numberOfOperations, ITokenScanner scanner, CharacterMapBuilder builder, bool isLenientParsing)
{ {
for (var i = 0; i < numeric.Int; i++) for (var i = 0; i < numberOfOperations.Int; i++)
{ {
// The start of the input code range.
if (!scanner.TryReadToken(out HexToken lowSourceCode)) if (!scanner.TryReadToken(out HexToken lowSourceCode))
{ {
// TODO: message throw new InvalidFontFormatException($"bfrange was missing the low source code: {scanner.CurrentToken}");
throw new InvalidOperationException();
} }
// The inclusive end of the input code range.
if (!scanner.TryReadToken(out HexToken highSourceCode)) if (!scanner.TryReadToken(out HexToken highSourceCode))
{ {
// TODO: message throw new InvalidFontFormatException($"bfrange was missing the high source code: {scanner.CurrentToken}");
throw new InvalidOperationException();
} }
if (!scanner.MoveNext()) if (!scanner.MoveNext())
{ {
// TODO: message throw new InvalidFontFormatException("bfrange ended unexpectedly after the high source code.");
throw new InvalidOperationException();
} }
List<byte> destinationBytes = null; List<byte> destinationBytes = null;
ArrayToken destinationArray = null; ArrayToken destinationArray = null;
switch (scanner.CurrentToken) switch (scanner.CurrentToken)
{ {
case ArrayToken arrayToken: case ArrayToken arrayToken:
@@ -51,7 +55,35 @@
var startCode = new List<byte>(lowSourceCode.Bytes); var startCode = new List<byte>(lowSourceCode.Bytes);
var endCode = highSourceCode.Bytes; var endCode = highSourceCode.Bytes;
if (destinationArray != null)
{
int arrayIndex = 0; int arrayIndex = 0;
while (!done)
{
if (Compare(startCode, endCode) >= 0)
{
done = true;
}
var destination = destinationArray.Data[arrayIndex];
if (destination is NameToken name)
{
builder.AddBaseFontCharacter(startCode, name.Data.Name);
}
else if (destination is HexToken hex)
{
builder.AddBaseFontCharacter(startCode, hex.Bytes);
}
Increment(startCode, startCode.Count - 1);
arrayIndex++;
}
continue;
}
while (!done) while (!done)
{ {
if (Compare(startCode, endCode) >= 0) if (Compare(startCode, endCode) >= 0)
@@ -63,19 +95,8 @@
Increment(startCode, startCode.Count - 1); Increment(startCode, startCode.Count - 1);
if (destinationArray == null)
{
Increment(destinationBytes, destinationBytes.Count - 1); Increment(destinationBytes, destinationBytes.Count - 1);
} }
else
{
arrayIndex++;
if (arrayIndex < destinationArray.Data.Count)
{
destinationBytes = ((HexToken)destinationArray.Data[arrayIndex]).Bytes.ToList();
}
}
}
} }
} }