mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-01-18 19:51:24 +08:00
Fix bugs revealed by the mortality metadata document. Resolve references using the direct object finder. Fix a bug in the string tokenizer.
This commit is contained in:
@@ -4,7 +4,7 @@
|
||||
using System.IO;
|
||||
using Xunit;
|
||||
|
||||
public class MultiplePageMortalityStatistics
|
||||
public class MultiplePageMortalityStatisticsTests
|
||||
{
|
||||
private static string GetFilename()
|
||||
{
|
||||
@@ -32,5 +32,18 @@
|
||||
Assert.Equal(1.7m, document.Version);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
public void GetsFirstPageContent()
{
    // Text fragments that must all appear in the text of the page returned by GetPage(1).
    var expectedFragments = new[]
    {
        "Mortality Statistics: Metadata",
        "Notification to the registrar by the coroner that he does not consider it necessary to hold an inquest – no post-mortem held (Form 100A – salmon pink)",
        "Presumption of death certificate"
    };

    using (var document = PdfDocument.Open(GetFilename()))
    {
        var firstPage = document.GetPage(1);

        foreach (var fragment in expectedFragments)
        {
            Assert.Contains(fragment, firstPage.Text);
        }
    }
}
|
||||
}
|
||||
}
|
||||
@@ -114,6 +114,39 @@ endobj";
|
||||
AssertCorrectToken<StringToken, string>(tokens[5], "Bob");
|
||||
Assert.IsType<ArrayToken>(tokens[6]);
|
||||
}
|
||||
|
||||
[Fact]
public void CorrectlyScansArrayWithEscapedStrings()
{
    // Content-stream snippet whose TJ array contains strings with escaped
    // brackets: "\(" inside "(e \()" and "\)" as the whole of "(\))".
    const string input = @"<0078>Tj
/TT0 1 Tf
0.463 0 Td
( )Tj
-0.002 Tc 0.007 Tw 11.04 -0 0 11.04 180 695.52 Tm
[(R)2.6(eg)-11.3(i)2.7(s)-2(t)4.2(r)-5.9(at)-6.6(i)2.6(on S)2(e)10.5(r)-6(v)8.9(i)2.6(c)-2(e S)1.9(o)10.6(f)-17.5(t)4.3(w)13.4(ar)-6(e \()-6(R)2.6(S)2(S)1.9(\))]TJ
0 Tc 0 Tw 16.12 0 Td";

    var bytes = StringBytesTestConverter.Convert(input, false).Bytes;
    var scanner = scannerFactory(bytes);

    // Drain the scanner, collecting every token it produces.
    var tokens = new List<IToken>();
    while (scanner.MoveNext())
    {
        var current = scanner.CurrentToken;
        tokens.Add(current);
    }

    Assert.Equal(30, tokens.Count);

    // The tail of the stream ("0 Tw 16.12 0 Td") must follow the array intact.
    AssertCorrectToken<OperatorToken, string>(tokens[29], "Td");
    AssertCorrectToken<NumericToken, decimal>(tokens[28], 0);
    AssertCorrectToken<NumericToken, decimal>(tokens[27], 16.12m);
    AssertCorrectToken<OperatorToken, string>(tokens[26], "Tw");

    var array = Assert.IsType<ArrayToken>(tokens[21]);

    // The array must end with the number 1.9 followed by the string ")".
    var elements = array.Data;
    var lastIndex = elements.Count - 1;

    AssertCorrectToken<StringToken, string>(elements[lastIndex], ")");
    AssertCorrectToken<NumericToken, decimal>(elements[lastIndex - 1], 1.9m);
}
|
||||
|
||||
private static void AssertCorrectToken<T, TData>(IToken token, TData expected) where T : IDataToken<TData>
|
||||
{
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
using Geometry;
|
||||
using IO;
|
||||
using Pdf.Parser;
|
||||
using Pdf.Parser.Parts;
|
||||
using TrueType;
|
||||
using TrueType.Parser;
|
||||
|
||||
@@ -52,7 +53,7 @@
|
||||
|
||||
var baseFont = dictionary.GetName(CosName.BASE_FONT);
|
||||
|
||||
var systemInfo = GetSystemInfo(dictionary);
|
||||
var systemInfo = GetSystemInfo(dictionary, reader, isLenientParsing);
|
||||
|
||||
var subType = dictionary.GetName(CosName.SUBTYPE);
|
||||
if (CosName.CID_FONT_TYPE0.Equals(subType))
|
||||
@@ -217,11 +218,25 @@
|
||||
return new VerticalWritingMetrics(dw2, verticalDisplacements, positionVectors);
|
||||
}
|
||||
|
||||
private static CharacterIdentifierSystemInfo GetSystemInfo(PdfDictionary dictionary)
|
||||
private CharacterIdentifierSystemInfo GetSystemInfo(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
|
||||
{
|
||||
if(!dictionary.TryGetItemOfType(CosName.CIDSYSTEMINFO, out PdfDictionary cidDictionary))
|
||||
if(!dictionary.TryGetValue(CosName.CIDSYSTEMINFO, out var cidEntry))
|
||||
{
|
||||
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: " + dictionary);
|
||||
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: {dictionary}");
|
||||
}
|
||||
|
||||
if (cidEntry is PdfDictionary cidDictionary)
|
||||
{
|
||||
|
||||
}
|
||||
else if (cidEntry is CosObject cidObject)
|
||||
{
|
||||
cidDictionary =
|
||||
DirectObjectFinder.Find<PdfDictionary>(cidObject, pdfObjectParser, reader, isLenientParsing);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidFontFormatException($"No CID System Info was found in the CID Font dictionary: {dictionary}");
|
||||
}
|
||||
|
||||
var registry = (CosString) cidDictionary.GetItemOrDefault(CosName.REGISTRY);
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
using Geometry;
|
||||
using Graphics;
|
||||
using IO;
|
||||
using Parts;
|
||||
using Util;
|
||||
|
||||
internal class PageFactory : IPageFactory
|
||||
@@ -53,7 +54,7 @@
|
||||
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
|
||||
if (contentObject != null)
|
||||
{
|
||||
var contentStream = pdfObjectParser.Parse(contentObject.ToIndirectReference(), reader, false) as PdfRawStream;
|
||||
var contentStream = DirectObjectFinder.Find<PdfRawStream>(contentObject, pdfObjectParser, reader, false);
|
||||
|
||||
if (contentStream == null)
|
||||
{
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
internal static class DirectObjectFinder
|
||||
{
|
||||
public static CosBase Find<T>(CosObject baseObject, IPdfObjectParser parser, IRandomAccessRead reader,
|
||||
public static T Find<T>(CosObject baseObject, IPdfObjectParser parser, IRandomAccessRead reader,
|
||||
bool isLenientParsing) where T : CosBase
|
||||
{
|
||||
var result = parser.Parse(baseObject.ToIndirectReference(), reader, isLenientParsing);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
namespace UglyToad.Pdf.Tokenization
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using IO;
|
||||
using Scanner;
|
||||
using Tokens;
|
||||
@@ -26,6 +27,7 @@
|
||||
while (!CurrentByteEndsCurrentArray(inputBytes, previousToken) && scanner.MoveNext())
|
||||
{
|
||||
previousToken = scanner.CurrentToken;
|
||||
|
||||
contents.Add(scanner.CurrentToken);
|
||||
}
|
||||
|
||||
|
||||
@@ -80,29 +80,26 @@
|
||||
numberOfBrackets--;
|
||||
}
|
||||
|
||||
isEscapeActive = false;
|
||||
if (numberOfBrackets > 0)
|
||||
{
|
||||
builder.Append(c);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: Check for other ends of string where the string is improperly formatted. See commented method
|
||||
// numberOfBrackets = CheckForEndOfString(inputBytes, numberOfBrackets);
|
||||
|
||||
isEscapeActive = false;
|
||||
|
||||
break;
|
||||
case '(':
|
||||
isLineBreaking = false;
|
||||
|
||||
|
||||
|
||||
|
||||
if (!isEscapeActive)
|
||||
{
|
||||
numberOfBrackets++;
|
||||
}
|
||||
|
||||
isEscapeActive = false;
|
||||
builder.Append(c);
|
||||
break;
|
||||
// Escape
|
||||
|
||||
Reference in New Issue
Block a user