handle case where contents is an array of objects

This commit is contained in:
Eliot Jones
2018-01-06 18:25:47 +00:00
parent eb66611e55
commit 02845e8ebb
11 changed files with 193 additions and 15 deletions

View File

@@ -0,0 +1,57 @@
namespace UglyToad.Pdf.Tests.Integration
{
    using System;
    using System.IO;
    using System.Linq;
    using Content;
    using Xunit;

    /// <summary>
    /// Integration tests which open the "Judgement Document.pdf" sample and check
    /// its page count, some text on the first two pages and the size of every page.
    /// </summary>
    public class JudgementDocumentTests
    {
        /// <summary>
        /// The number of pages the sample document is expected to contain.
        /// </summary>
        private const int ExpectedPageCount = 13;

        /// <summary>
        /// Builds the full path to the sample document.
        /// </summary>
        private static string GetFilename()
        {
            // Go three directories up from the base directory, then into Integration/Documents.
            var relativeFolder = Path.Combine(
                AppDomain.CurrentDomain.BaseDirectory,
                "..",
                "..",
                "..",
                "Integration",
                "Documents");

            var documentFolder = Path.GetFullPath(relativeFolder);

            return Path.Combine(documentFolder, "Judgement Document.pdf");
        }

        [Fact]
        public void HasCorrectNumberOfPages()
        {
            // This test opens the document from its raw bytes rather than from the file path.
            var documentBytes = File.ReadAllBytes(GetFilename());

            using (var document = PdfDocument.Open(documentBytes))
            {
                Assert.Equal(ExpectedPageCount, document.NumberOfPages);
            }
        }

        [Fact]
        public void HasCorrectPageContents()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                var firstPage = document.GetPage(1);

                Assert.Contains("Royal Courts of Justice, Rolls Building Fetter Lane, London, EC4A 1NL", firstPage.Text);

                var secondPage = document.GetPage(2);

                Assert.Contains("The reference to BAR is to another trade organisation of which CMUK was", secondPage.Text);
            }
        }

        [Fact]
        public void HasCorrectPageSize()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                // Load every page up front so that all of them are checked by Assert.All.
                var allPages = Enumerable.Range(1, ExpectedPageCount)
                    .Select(pageNumber => document.GetPage(pageNumber))
                    .ToList();

                Assert.All(allPages, page => Assert.Equal(PageSize.A4, page.Size));
            }
        }
    }
}

View File

@@ -2,8 +2,6 @@
{
using System;
using System.IO;
using System.Linq;
using Content;
using Xunit;
public class SinglePageType1FontTests

View File

@@ -12,6 +12,7 @@
<None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
<None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
<None Remove="Integration\Documents\Judgement Document.pdf" />
<None Remove="Integration\Documents\Multiple Page - from Mortality Statistics.pdf" />
<None Remove="Integration\Documents\Single Page Form Content - from itext 1_1.pdf" />
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
@@ -34,6 +35,9 @@
<Content Include="Integration\Documents\Font Size Test - from google chrome print pdf.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Judgement Document.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Multiple Page - from Mortality Statistics.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>

View File

@@ -7,6 +7,7 @@
using Fonts;
using IO;
using Parser;
using Parser.Parts;
internal class ResourceContainer : IResourceStore
{
@@ -71,7 +72,7 @@
throw new InvalidOperationException($"The font with name {pair.Key} did not link to an object key. Value was: {pair.Value}.");
}
var fontObject = pdfObjectParser.Parse(objectKey.ToIndirectReference(), reader, false) as PdfDictionary;
var fontObject = DirectObjectFinder.Find<PdfDictionary>(objectKey, pdfObjectParser, reader, false);
if (fontObject == null)
{

View File

@@ -343,6 +343,74 @@
break;
case FamilyName:
builder.FamilyName = ReadLine(bytes);
break;
case Weight:
builder.Weight = ReadLine(bytes);
break;
case ItalicAngle:
builder.ItalicAngle = ReadDecimal(bytes);
break;
case IsFixedPitch:
builder.IsFixedPitch = ReadBool(bytes);
break;
case FontBbox:
builder.SetBoundingBox(ReadDecimal(bytes), ReadDecimal(bytes),
ReadDecimal(bytes), ReadDecimal(bytes));
break;
case UnderlinePosition:
builder.UnderlinePosition = ReadDecimal(bytes);
break;
case UnderlineThickness:
builder.UnderlineThickness = ReadDecimal(bytes);
break;
case Version:
builder.Version = ReadLine(bytes);
break;
case Notice:
builder.Notice = ReadLine(bytes);
break;
case EncodingScheme:
builder.EncodingScheme = ReadLine(bytes);
break;
case MappingScheme:
builder.MappingScheme = (int) ReadDecimal(bytes);
break;
case CharacterSet:
builder.CharacterSet = ReadLine(bytes);
break;
case IsBaseFont:
builder.IsBaseFont = ReadBool(bytes);
break;
case CapHeight:
builder.CapHeight = ReadDecimal(bytes);
break;
case XHeight:
builder.XHeight = ReadDecimal(bytes);
break;
case Ascender:
builder.Ascender = ReadDecimal(bytes);
break;
case Descender:
builder.Descender = ReadDecimal(bytes);
break;
case StdHw:
builder.StdHw = ReadDecimal(bytes);
break;
case StdVw:
builder.StdVw = ReadDecimal(bytes);
break;
case CharWidth:
builder.SetCharacterWidth(ReadDecimal(bytes), ReadDecimal(bytes));
break;
case StartCharMetrics:
var count = (int)ReadDecimal(bytes);
for (int i = 0; i < count; i++)
{
var metric = ReadCharacterMetric(bytes);
builder.CharacterMetrics.Add(metric);
}
var end = ReadString(bytes);
break;
}
}
@@ -414,6 +482,13 @@
return Builder.ToString();
}
/// <summary>
/// Placeholder reader for a single character metric entry.
/// Calls ReadLine on the input and returns a newly constructed
/// <see cref="IndividualCharacterMetric"/>; the content of the line is not used.
/// </summary>
/// <param name="bytes">The input positioned at the start of a character metric entry.</param>
/// <returns>A new <see cref="IndividualCharacterMetric"/> created with its default constructor.</returns>
private static IndividualCharacterMetric ReadCharacterMetric(IInputBytes bytes)
{
    // The returned text is deliberately ignored; the call is kept because it reads from the input.
    ReadLine(bytes);

    return new IndividualCharacterMetric();
}
}
internal interface IAdobeFontMetricsParser

View File

@@ -7,6 +7,7 @@
using IO;
using Parts;
using Pdf.Parser;
using Pdf.Parser.Parts;
internal static class FontDictionaryAccessHelper
{
@@ -32,11 +33,16 @@
return lastChar.AsInt();
}
public static decimal[] GetWidths(PdfDictionary dictionary)
public static decimal[] GetWidths(IPdfObjectParser pdfObjectParser, PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
{
if (!dictionary.TryGetItemOfType(CosName.WIDTHS, out COSArray widthArray))
{
throw new InvalidFontFormatException($"No widths array was found in the font dictionary for this TrueType font: {dictionary}.");
if (!dictionary.TryGetItemOfType(CosName.WIDTHS, out CosObject arr))
{
throw new InvalidFontFormatException($"No widths array was found in the font dictionary for this TrueType font: {dictionary}.");
}
widthArray = DirectObjectFinder.Find<COSArray>(arr, pdfObjectParser, reader, isLenientParsing);
}
return widthArray.Select(x => ((ICosNumber)x).AsDecimal()).ToArray();

View File

@@ -41,7 +41,7 @@
var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);
var widths = FontDictionaryAccessHelper.GetWidths(dictionary);
var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);

View File

@@ -33,7 +33,7 @@
var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);
var widths = FontDictionaryAccessHelper.GetWidths(dictionary);
var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);

View File

@@ -6,6 +6,7 @@
using Cos;
using Exceptions;
using IO;
using Parts;
internal class CatalogFactory
{
@@ -33,7 +34,7 @@
throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}.");
}
var pages = pdfObjectParser.Parse(value.ToIndirectReference(), reader, isLenientParsing);
var pages = DirectObjectFinder.Find<PdfDictionary>(value, pdfObjectParser, reader, isLenientParsing);
if (!(pages is PdfDictionary pagesDictionary))
{

View File

@@ -1,9 +1,13 @@
namespace UglyToad.Pdf.Parser
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Content;
using ContentStream;
using Cos;
using Exceptions;
using Filters;
using Geometry;
using Graphics;
@@ -51,8 +55,8 @@
PageContent content = default(PageContent);
var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
if (contentObject != null)
var contents = dictionary.GetItemOrDefault(CosName.CONTENTS);
if (contents is CosObject contentObject)
{
var contentStream = DirectObjectFinder.Find<PdfRawStream>(contentObject, pdfObjectParser, reader, false);
@@ -61,15 +65,33 @@
throw new InvalidOperationException("Failed to parse the content for the page: " + number);
}
var contents = contentStream.Decode(filterProvider);
var bytes = contentStream.Decode(filterProvider);
var txt = OtherEncodings.BytesAsLatin1String(contents);
content = GetContent(bytes, cropBox, userSpaceUnit);
}
else if (contents is COSArray arr)
{
var bytes = new List<byte>();
foreach (var item in arr)
{
var obj = item as CosObject;
if (obj == null)
{
throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
}
var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));
var contentStream = DirectObjectFinder.Find<PdfRawStream>(obj, pdfObjectParser, reader, isLenientParsing);
if (contentStream == null)
{
throw new InvalidOperationException($"Could not find the contents for object {obj}.");
}
var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit);
bytes.AddRange(contentStream.Decode(filterProvider));
}
content = context.Process(operations);
content = GetContent(bytes, cropBox, userSpaceUnit);
}
var page = new Page(number, mediaBox, cropBox, content);
@@ -77,6 +99,20 @@
return page;
}
/// <summary>
/// Parses the supplied content bytes into operations using the page content parser
/// and runs them through a new <see cref="ContentStreamProcessor"/> to produce the page content.
/// </summary>
/// <param name="contentBytes">The bytes of the page content to parse.</param>
/// <param name="cropBox">The crop box whose bounds are passed to the processor.</param>
/// <param name="userSpaceUnit">The user space unit passed to the processor.</param>
/// <returns>The <see cref="PageContent"/> returned by the processor.</returns>
private PageContent GetContent(IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit)
{
    if (Debugger.IsAttached)
    {
        // Debugging aid only: the string is never used by the code, it exists to be inspected in a debugger.
        var contentText = OtherEncodings.BytesAsLatin1String(contentBytes.ToArray());
    }

    var input = new ByteArrayInputBytes(contentBytes);
    var parsedOperations = pageContentParser.Parse(input);

    var processor = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit);

    return processor.Process(parsedOperations);
}
private static UserSpaceUnit GetUserSpaceUnits(PdfDictionary dictionary)
{
var spaceUnits = UserSpaceUnit.Default;