Fix bugs revealed by the mortality metadata document. Resolve references using the direct object finder. Fix a bug in the string tokenizer.

This commit is contained in:
Eliot Jones
2018-01-05 23:08:20 +00:00
parent 2e7f9b8d76
commit bbcb5af2be
7 changed files with 74 additions and 13 deletions

View File

@@ -4,7 +4,7 @@
using System.IO;
using Xunit;
public class MultiplePageMortalityStatistics
public class MultiplePageMortalityStatisticsTests
{
private static string GetFilename()
{
@@ -32,5 +32,18 @@
Assert.Equal(1.7m, document.Version);
}
}
/// <summary>
/// Opens the test document, reads page 1 and checks that the page text
/// contains each of the expected phrases.
/// </summary>
[Fact]
public void GetsFirstPageContent()
{
    // Phrases that must all appear in the text of the first page.
    var expectedPhrases = new[]
    {
        "Mortality Statistics: Metadata",
        "Notification to the registrar by the coroner that he does not consider it necessary to hold an inquest no post-mortem held (Form 100A salmon pink)",
        "Presumption of death certificate"
    };

    using (var document = PdfDocument.Open(GetFilename()))
    {
        var firstPage = document.GetPage(1);

        foreach (var phrase in expectedPhrases)
        {
            Assert.Contains(phrase, firstPage.Text);
        }
    }
}
}
}

View File

@@ -114,6 +114,39 @@ endobj";
AssertCorrectToken<StringToken, string>(tokens[5], "Bob");
Assert.IsType<ArrayToken>(tokens[6]);
}
/// <summary>
/// Scans a content stream fragment whose TJ array contains strings with
/// escaped parentheses and checks the overall token count, the trailing
/// tokens and the final entries of the array.
/// </summary>
[Fact]
public void CorrectlyScansArrayWithEscapedStrings()
{
    // The array operand of TJ contains "\(" and "\)" escapes inside string literals.
    const string content = @"<0078>Tj
/TT0 1 Tf
0.463 0 Td
( )Tj
-0.002 Tc 0.007 Tw 11.04 -0 0 11.04 180 695.52 Tm
[(R)2.6(eg)-11.3(i)2.7(s)-2(t)4.2(r)-5.9(at)-6.6(i)2.6(on S)2(e)10.5(r)-6(v)8.9(i)2.6(c)-2(e S)1.9(o)10.6(f)-17.5(t)4.3(w)13.4(ar)-6(e \()-6(R)2.6(S)2(S)1.9(\))]TJ
0 Tc 0 Tw 16.12 0 Td";

    var input = StringBytesTestConverter.Convert(content, false);
    var scanner = scannerFactory(input.Bytes);

    // Drain the scanner, collecting every token it produces.
    var scanned = new List<IToken>();
    while (scanner.MoveNext())
    {
        scanned.Add(scanner.CurrentToken);
    }

    Assert.Equal(30, scanned.Count);

    // The tokens following the array, checked from the end backwards.
    AssertCorrectToken<OperatorToken, string>(scanned[29], "Td");
    AssertCorrectToken<NumericToken, decimal>(scanned[28], 0);
    AssertCorrectToken<NumericToken, decimal>(scanned[27], 16.12m);
    AssertCorrectToken<OperatorToken, string>(scanned[26], "Tw");

    // The array must end with the number 1.9 followed by the string ")".
    var array = Assert.IsType<ArrayToken>(scanned[21]);
    var lastIndex = array.Data.Count - 1;
    AssertCorrectToken<StringToken, string>(array.Data[lastIndex], ")");
    AssertCorrectToken<NumericToken, decimal>(array.Data[lastIndex - 1], 1.9m);
}
private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
{

View File

@@ -11,6 +11,7 @@
using Geometry;
using IO;
using Pdf.Parser;
using Pdf.Parser.Parts;
using TrueType;
using TrueType.Parser;
@@ -52,7 +53,7 @@
var baseFont = dictionary.GetName(CosName.BASE_FONT);
var systemInfo = GetSystemInfo(dictionary);
var systemInfo = GetSystemInfo(dictionary, reader, isLenientParsing);
var subType = dictionary.GetName(CosName.SUBTYPE);
if (CosName.CID_FONT_TYPE0.Equals(subType))
@@ -217,11 +218,25 @@
return new VerticalWritingMetrics(dw2, verticalDisplacements, positionVectors);
}
private static CharacterIdentifierSystemInfo GetSystemInfo(PdfDictionary dictionary)
private CharacterIdentifierSystemInfo GetSystemInfo(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{
if(!dictionary.TryGetItemOfType(CosName.CIDSYSTEMINFO, out PdfDictionary cidDictionary))
if(!dictionary.TryGetValue(CosName.CIDSYSTEMINFO, out var cidEntry))
{
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: " + dictionary);
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: {dictionary}");
}
if (cidEntry is PdfDictionary cidDictionary)
{
}
else if (cidEntry is CosObject cidObject)
{
cidDictionary =
DirectObjectFinder.Find<PdfDictionary>(cidObject, pdfObjectParser, reader, isLenientParsing);
}
else
{
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: {dictionary}");
}
var registry = (CosString) cidDictionary.GetItemOrDefault(CosName.REGISTRY);

View File

@@ -8,6 +8,7 @@
using Geometry;
using Graphics;
using IO;
using Parts;
using Util;
internal class PageFactory : IPageFactory
@@ -53,7 +54,7 @@
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
if (contentObject != null)
{
var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream;
var contentStream = DirectObjectFinder.Find<PdfRawStream>(contentObject, pdfObjectParser, reader, false);
if (contentStream == null)
{

View File

@@ -6,7 +6,7 @@
internal static class DirectObjectFinder
{
public static CosBase Find<T>(CosObject baseObject, IPdfObjectParser parser, IRandomAccessRead reader,
public static T Find<T>(CosObject baseObject, IPdfObjectParser parser, IRandomAccessRead reader,
bool isLenientParsing) where T : CosBase
{
var result = parser.Parse(baseObject.ToIndirectReference(), reader, isLenientParsing);

View File

@@ -1,6 +1,7 @@
namespace UglyToad.Pdf.Tokenization
{
using System.Collections.Generic;
using System.Diagnostics;
using IO;
using Scanner;
using Tokens;
@@ -26,6 +27,7 @@
while (!CurrentByteEndsCurrentArray(inputBytes, previousToken) && scanner.MoveNext())
{
previousToken = scanner.CurrentToken;
contents.Add(scanner.CurrentToken);
}

View File

@@ -80,29 +80,26 @@
numberOfBrackets--;
}
isEscapeActive = false;
if (numberOfBrackets > 0)
{
builder.Append(c);
break;
}
// TODO: Check for other ends of string where the string is improperly formatted. See commented method
// numberOfBrackets = CheckForEndOfString(inputBytes, numberOfBrackets);
isEscapeActive = false;
break;
case '(':
isLineBreaking = false;
if (!isEscapeActive)
{
numberOfBrackets++;
}
isEscapeActive = false;
builder.Append(c);
break;
// Escape