Handle the case where a page's Contents entry is an array of objects

2025-10-15 19:54:52 +08:00 · 2018-01-06 18:25:47 +00:00
parent eb66611e55
commit 02845e8ebb
11 changed files with 193 additions and 15 deletions
--- a/src/UglyToad.Pdf.Tests/Integration/Documents/Judgement
+++ b/src/UglyToad.Pdf.Tests/Integration/Documents/Judgement
--- a/src/UglyToad.Pdf.Tests/Integration/JudgementDocumentTests.cs
+++ b/src/UglyToad.Pdf.Tests/Integration/JudgementDocumentTests.cs
@@ -0,0 +1,57 @@
+namespace UglyToad.Pdf.Tests.Integration
+{
+    using System;
+    using System.IO;
+    using System.Linq;
+    using Content;
+    using Xunit;
+
+    /// <summary>
+    /// Integration tests which open the "Judgement Document.pdf" sample and check
+    /// its page count, some of its text and its page sizes.
+    /// </summary>
+    public class JudgementDocumentTests
+    {
+        /// <summary>
+        /// Builds the full path to the sample document, which is looked up in the
+        /// Integration/Documents folder three levels above the application base directory.
+        /// </summary>
+        private static string GetFilename()
+        {
+            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
+            var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
+            var documentFolder = Path.GetFullPath(relativeFolder);
+
+            return Path.Combine(documentFolder, "Judgement Document.pdf");
+        }
+
+        /// <summary>
+        /// Opens the document from an in-memory byte array and checks the page count.
+        /// </summary>
+        [Fact]
+        public void HasCorrectNumberOfPages()
+        {
+            var fileBytes = File.ReadAllBytes(GetFilename());
+
+            using (var document = PdfDocument.Open(fileBytes))
+            {
+                Assert.Equal(13, document.NumberOfPages);
+            }
+        }
+
+        /// <summary>
+        /// Opens the document from its path and checks that the first two pages contain known text.
+        /// </summary>
+        [Fact]
+        public void HasCorrectPageContents()
+        {
+            using (var document = PdfDocument.Open(GetFilename()))
+            {
+                var firstPage = document.GetPage(1);
+                Assert.Contains("Royal Courts of Justice, Rolls Building Fetter Lane, London, EC4A 1NL", firstPage.Text);
+
+                var secondPage = document.GetPage(2);
+                Assert.Contains("The reference to BAR is to another trade organisation of which CMUK was", secondPage.Text);
+            }
+        }
+
+        /// <summary>
+        /// Loads pages 1 to 13 and checks that every one of them reports an A4 page size.
+        /// </summary>
+        [Fact]
+        public void HasCorrectPageSize()
+        {
+            using (var document = PdfDocument.Open(GetFilename()))
+            {
+                // Materialise every page first so that Assert.All can report on all of them together.
+                var pages = (from pageNumber in Enumerable.Range(1, 13)
+                             select document.GetPage(pageNumber)).ToList();
+
+                Assert.All(pages, page => Assert.Equal(PageSize.A4, page.Size));
+            }
+        }
+    }
+}
--- a/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleOpenOfficeTests
+++ b/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleOpenOfficeTests
@@ -2,8 +2,6 @@
 {
    using System;
    using System.IO;
-    using System.Linq;
-    using Content;
    using Xunit;

    public class SinglePageType1FontTests
--- a/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj
+++ b/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj
@@ -12,6 +12,7 @@
    <None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
    <None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
    <None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
+    <None Remove="Integration\Documents\Judgement Document.pdf" />
    <None Remove="Integration\Documents\Multiple Page - from Mortality Statistics.pdf" />
    <None Remove="Integration\Documents\Single Page Form Content - from itext 1_1.pdf" />
    <None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
@@ -34,6 +35,9 @@
    <Content Include="Integration\Documents\Font Size Test - from google chrome print pdf.pdf">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
+    <Content Include="Integration\Documents\Judgement Document.pdf">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </Content>
    <Content Include="Integration\Documents\Multiple Page - from Mortality Statistics.pdf">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
    </Content>
--- a/src/UglyToad.Pdf/Content/ResourceContainer.cs
+++ b/src/UglyToad.Pdf/Content/ResourceContainer.cs
@@ -7,6 +7,7 @@
    using Fonts;
    using IO;
    using Parser;
+    using Parser.Parts;

    internal class ResourceContainer : IResourceStore
    {
@@ -71,7 +72,7 @@
                    throw new InvalidOperationException($"The font with name {pair.Key} did not link to an object key. Value was: {pair.Value}.");
                }
                
-                var fontObject = pdfObjectParser.Parse(objectKey.ToIndirectReference(), reader, false) as PdfDictionary;
+                var fontObject = DirectObjectFinder.Find<PdfDictionary>(objectKey, pdfObjectParser, reader, false);

                if (fontObject == null)
                {
--- a/src/UglyToad.Pdf/Fonts/Parser/AdobeFontMetricsParser.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/AdobeFontMetricsParser.cs
@@ -343,6 +343,74 @@
                        break;
                    case FamilyName:
                        builder.FamilyName = ReadLine(bytes);
+                        break;
+                    case Weight:
+                        builder.Weight = ReadLine(bytes);
+                        break;
+                    case ItalicAngle:
+                        builder.ItalicAngle = ReadDecimal(bytes);
+                        break;
+                    case IsFixedPitch:
+                        builder.IsFixedPitch = ReadBool(bytes);
+                        break;
+                    case FontBbox:
+                        builder.SetBoundingBox(ReadDecimal(bytes), ReadDecimal(bytes),
+                            ReadDecimal(bytes), ReadDecimal(bytes));
+                        break;
+                    case UnderlinePosition:
+                        builder.UnderlinePosition = ReadDecimal(bytes);
+                        break;
+                    case UnderlineThickness:
+                        builder.UnderlineThickness = ReadDecimal(bytes);
+                        break;
+                    case Version:
+                        builder.Version = ReadLine(bytes);
+                        break;
+                    case Notice:
+                        builder.Notice = ReadLine(bytes);
+                        break;
+                    case EncodingScheme:
+                        builder.EncodingScheme = ReadLine(bytes);
+                        break;
+                    case MappingScheme:
+                        builder.MappingScheme = (int) ReadDecimal(bytes);
+                        break;
+                    case CharacterSet:
+                        builder.CharacterSet = ReadLine(bytes);
+                        break;
+                    case IsBaseFont:
+                        builder.IsBaseFont = ReadBool(bytes);
+                        break;
+                    case CapHeight:
+                        builder.CapHeight = ReadDecimal(bytes);
+                        break;
+                    case XHeight:
+                        builder.XHeight = ReadDecimal(bytes);
+                        break;
+                    case Ascender:
+                        builder.Ascender = ReadDecimal(bytes);
+                        break;
+                    case Descender:
+                        builder.Descender = ReadDecimal(bytes);
+                        break;
+                    case StdHw:
+                        builder.StdHw = ReadDecimal(bytes);
+                        break;
+                    case StdVw:
+                        builder.StdVw = ReadDecimal(bytes);
+                        break;
+                    case CharWidth:
+                        builder.SetCharacterWidth(ReadDecimal(bytes), ReadDecimal(bytes));
+                        break;
+                    case StartCharMetrics:
+                        var count = (int)ReadDecimal(bytes);
+                        for (int i = 0; i < count; i++)
+                        {
+                            var metric = ReadCharacterMetric(bytes);
+                            builder.CharacterMetrics.Add(metric);
+                        }
+                        var end = ReadString(bytes);
+
                        break;
                }
            }
@@ -414,6 +482,13 @@

            return Builder.ToString();
        }
+
+        /// <summary>
+        /// Reads one line from the input and returns a character metric for it.
+        /// </summary>
+        /// <remarks>
+        /// The line's text is not interpreted yet, so the returned metric is always a
+        /// newly constructed instance with nothing assigned to it here.
+        /// </remarks>
+        /// <param name="bytes">The input from which the line is read.</param>
+        /// <returns>A new <see cref="IndividualCharacterMetric"/>.</returns>
+        private static IndividualCharacterMetric ReadCharacterMetric(IInputBytes bytes)
+        {
+            // The result of the read is deliberately discarded; presumably the call is kept
+            // so the input moves past this metric's line (TODO: parse the line contents).
+            ReadLine(bytes);
+
+            return new IndividualCharacterMetric();
+        }
    }

    internal interface IAdobeFontMetricsParser
--- a/src/UglyToad.Pdf/Fonts/Parser/FontDictionaryAccessHelper.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/FontDictionaryAccessHelper.cs
@@ -7,6 +7,7 @@
    using IO;
    using Parts;
    using Pdf.Parser;
+    using Pdf.Parser.Parts;

    internal static class FontDictionaryAccessHelper
    {
@@ -32,11 +33,16 @@
            return lastChar.AsInt();
        }

-        public static decimal[] GetWidths(PdfDictionary dictionary)
+        public static decimal[] GetWidths(IPdfObjectParser pdfObjectParser, PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
        {
            if (!dictionary.TryGetItemOfType(CosName.WIDTHS, out COSArray widthArray))
            {
-                throw new InvalidFontFormatException($"No widths array was found in the font dictionary for this TrueType font: {dictionary}.");
+                if (!dictionary.TryGetItemOfType(CosName.WIDTHS, out CosObject arr))
+                {
+                    throw new InvalidFontFormatException($"No widths array was found in the font dictionary for this TrueType font: {dictionary}.");
+                }
+
+                widthArray = DirectObjectFinder.Find<COSArray>(arr, pdfObjectParser, reader, isLenientParsing);
            }

            return widthArray.Select(x => ((ICosNumber)x).AsDecimal()).ToArray();
--- a/src/UglyToad.Pdf/Fonts/Parser/Handlers/TrueTypeFontHandler.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Handlers/TrueTypeFontHandler.cs
@@ -41,7 +41,7 @@

            var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);

-            var widths = FontDictionaryAccessHelper.GetWidths(dictionary);
+            var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);

            var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);

--- a/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type1FontHandler.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type1FontHandler.cs
@@ -33,7 +33,7 @@

            var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);

-            var widths = FontDictionaryAccessHelper.GetWidths(dictionary);
+            var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);

            var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
            
--- a/src/UglyToad.Pdf/Parser/CatalogFactory.cs
+++ b/src/UglyToad.Pdf/Parser/CatalogFactory.cs
@@ -6,6 +6,7 @@
    using Cos;
    using Exceptions;
    using IO;
+    using Parts;

    internal class CatalogFactory
    {
@@ -33,7 +34,7 @@
                throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}.");
            }

-            var pages = pdfObjectParser.Parse(value.ToIndirectReference(), reader, isLenientParsing);
+            var pages = DirectObjectFinder.Find<PdfDictionary>(value, pdfObjectParser, reader, isLenientParsing);

            if (!(pages is PdfDictionary pagesDictionary))
            {
--- a/src/UglyToad.Pdf/Parser/PageFactory.cs
+++ b/src/UglyToad.Pdf/Parser/PageFactory.cs
@@ -1,9 +1,13 @@
 namespace UglyToad.Pdf.Parser
 {
    using System;
+    using System.Collections.Generic;
+    using System.Diagnostics;
+    using System.Linq;
    using Content;
    using ContentStream;
    using Cos;
+    using Exceptions;
    using Filters;
    using Geometry;
    using Graphics;
@@ -51,8 +55,8 @@

            PageContent content = default(PageContent);

-            var contentObject = dictionary.GetItemOrDefault(CosName.CONTENTS) as CosObject;
-            if (contentObject != null)
+            var contents = dictionary.GetItemOrDefault(CosName.CONTENTS);
+            if (contents is CosObject contentObject)
            {
                var contentStream = DirectObjectFinder.Find<PdfRawStream>(contentObject, pdfObjectParser,  reader, false);

@@ -61,15 +65,33 @@
                    throw new InvalidOperationException("Failed to parse the content for the page: " + number);
                }

-                var contents = contentStream.Decode(filterProvider);
+                var bytes = contentStream.Decode(filterProvider);

-                var txt = OtherEncodings.BytesAsLatin1String(contents);
+                content = GetContent(bytes, cropBox, userSpaceUnit);
+            }
+            else if (contents is COSArray arr)
+            {
+                var bytes = new List<byte>();
+                
+                foreach (var item in arr)
+                {
+                    var obj = item as CosObject;
+                    if (obj == null)
+                    {
+                        throw new PdfDocumentFormatException($"The contents contained something which was not an indirect reference: {item}.");
+                    }

-                var operations = pageContentParser.Parse(new ByteArrayInputBytes(contents));
+                    var contentStream = DirectObjectFinder.Find<PdfRawStream>(obj, pdfObjectParser, reader, isLenientParsing);
+                    
+                    if (contentStream == null)
+                    {
+                        throw new InvalidOperationException($"Could not find the contents for object {obj}.");
+                    }

-                var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit);
+                    bytes.AddRange(contentStream.Decode(filterProvider));
+                }

-                content = context.Process(operations);
+                content = GetContent(bytes, cropBox, userSpaceUnit);
            }

            var page = new Page(number, mediaBox, cropBox, content);
@@ -77,6 +99,20 @@
            return page;
        }

+        /// <summary>
+        /// Parses the decoded content stream bytes into operations and runs them through a
+        /// <see cref="ContentStreamProcessor"/> to produce the page content.
+        /// </summary>
+        /// <param name="contentBytes">The decoded bytes of the page's content stream(s).</param>
+        /// <param name="cropBox">The crop box whose bounds are passed to the processor.</param>
+        /// <param name="userSpaceUnit">The user space unit passed to the processor.</param>
+        /// <returns>The content produced by processing the parsed operations.</returns>
+        private PageContent GetContent(IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit)
+        {
+            if (Debugger.IsAttached)
+            {
+                // Debugging aid only: a text view of the raw content to inspect in the debugger.
+                var contentText = OtherEncodings.BytesAsLatin1String(contentBytes.ToArray());
+            }
+
+            var inputBytes = new ByteArrayInputBytes(contentBytes);
+            var operations = pageContentParser.Parse(inputBytes);
+            var processor = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit);
+
+            return processor.Process(operations);
+        }
+
        private static UserSpaceUnit GetUserSpaceUnits(PdfDictionary dictionary)
        {
            var spaceUnits = UserSpaceUnit.Default;