encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme

2025-06-28 15:30:17 +08:00 · 2017-12-28 13:14:03 +00:00 · 2017-12-28 13:14:03 +00:00 · b1d28a5af8
commit b1d28a5af8
parent 940c51e2fb
8 changed files with 437 additions and 966 deletions
--- a/README.md
+++ b/README.md
@ -3,4 +3,40 @@
 [![Build status](https://ci.appveyor.com/api/projects/status/ni7et2j2ml60pdi3?svg=true)](https://ci.appveyor.com/project/EliotJones/pdf)
 [![codecov](https://codecov.io/gh/UglyToad/Pdf/branch/master/graph/badge.svg)](https://codecov.io/gh/UglyToad/Pdf)
-Convert the [PdfBox](https://github.com/apache/pdfbox) code to C#.
+The aim of this project is to convert the [PdfBox](https://github.com/apache/pdfbox) code to C# in order to provide a properly open source (i.e. no copyleft) solution for inspecting PDF documents. This uses the Apache 2.0 licence.
 ## Status ##
 There is a lot left to do for this project, the initial minimum viable project when released to Alpha will provide:
 + Page counts and sizes (in points) for a document.
 + Access to the text contents of each page. Note that since PDF has no concept of a "word" it will be up to the consumer of the text to work out where the words are within the text.
 + (Possible) The locations and bounds of each letter on the page.
 For the initial alpha release all files will be opened rather than streamed so this will not support large files.
 Eventually the library should support all existing PdfBox operations such as accessing graphical elements, form elements as well as creating PDF documents.
 ## Usage ##
 The initial public API will be as limited as possible to allow extensive refactoring to take place. The proposed usage is as follows:
    using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf"))
    {
        int pageCount = document.NumberOfPages;
        Page page = document.GetPage(1);
        decimal widthInPoints = page.Width;
        decimal heightInPoints = page.Height;
        string text = page.Text;
    }
 The ```PdfDocument``` will also support opening from byte arrays (as well as streams eventually):
    byte[] fileBytes = File.ReadAllBytes(@"C:\my-file.pdf");
    using (PdfDocument document = PdfDocument.Open(fileBytes))
    {
        int numberOfPages = document.NumberOfPages;
    }
--- a/src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs
+++ b/src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs
@ -0,0 +1,39 @@
 namespace UglyToad.Pdf.Tests.Integration
 {
    using System;
    /// <summary>
    /// One record of letter position data parsed from a tab separated line,
    /// used by the integration tests as expected values.
    /// </summary>
    public class AssertablePositionData
    {
        /// <summary>Number of tab separated columns each line must contain.</summary>
        private const int ExpectedPartCount = 6;
        /// <summary>Gets or sets the X value (column 1).</summary>
        public decimal X { get; set; }
        /// <summary>Gets or sets the Y value (column 2).</summary>
        public decimal Y { get; set; }
        /// <summary>Gets or sets the width (column 3).</summary>
        public decimal Width { get; set; }
        /// <summary>Gets or sets the text (column 4), stored exactly as it appears in the line.</summary>
        public string Text { get; set; }
        /// <summary>Gets or sets the font size (column 5).</summary>
        public decimal FontSize { get; set; }
        /// <summary>Gets or sets the font name (column 6).</summary>
        public string FontName { get; set; }
        /// <summary>
        /// Parses a single tab separated line of the form
        /// <c>X, Y, Width, Text, FontSize, FontName</c>.
        /// </summary>
        /// <param name="line">The line to parse, it must contain exactly 6 tab separated parts.</param>
        /// <returns>The parsed position data.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
        /// <exception cref="ArgumentException">The line does not contain exactly 6 parts.</exception>
        /// <exception cref="FormatException">A numeric column is not a valid number.</exception>
        public static AssertablePositionData Parse(string line)
        {
            if (line == null)
            {
                throw new ArgumentNullException(nameof(line));
            }
            var parts = line.Split('\t');
            if (parts.Length != ExpectedPartCount)
            {
                throw new ArgumentException($"Expected 6 parts to the line, instead got {parts.Length}");
            }
            // The fixture data always uses '.' as the decimal separator, so the numbers must be
            // parsed with the invariant culture rather than the culture of the machine running the tests.
            var culture = System.Globalization.CultureInfo.InvariantCulture;
            return new AssertablePositionData
            {
                X = decimal.Parse(parts[0], culture),
                Y = decimal.Parse(parts[1], culture),
                Width = decimal.Parse(parts[2], culture),
                Text = parts[3],
                FontSize = decimal.Parse(parts[4], culture),
                FontName = parts[5]
            };
        }
    }
 }
--- a/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs
+++ b/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs
@ -43,25 +43,6 @@
    public class PdfParserTests
    {
        /// <summary>
        /// Opens the sample document and checks the page count, the page number and the
        /// letters of page 1 with spaces and zero-width spaces removed.
        /// </summary>
        [Fact]
        public void CanParseSimpleGoogleDocsDocument()
        {
            // To see the text as shown in Visual Studio or Notepad++, use the OtherEncodings.BytesAsLatin1String()
            var bytes = File.ReadAllBytes(GetNthFilename());
            using (var document = PdfDocument.Open(bytes))
            {
                Assert.Equal(1, document.Pages.Count);
                var firstPage = document.Pages.GetPage(1);
                Assert.Equal(1, firstPage.Number);
                var letterValues = firstPage.Content.Letters.Select(letter => letter.Value);
                var actual = string.Join(string.Empty, letterValues).Replace("\u200B", string.Empty);
                var expected = "This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty);
                Assert.Equal(expected, actual.Replace(" ", string.Empty));
            }
        }
        [Fact]
        public void CanDecompressNormalObjectStream()
        {
--- a/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs
+++ b/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs
@ -0,0 +1,328 @@
 // ReSharper disable AccessToDisposedClosure
 namespace UglyToad.Pdf.Tests.Integration
 {
    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using Xunit;
    public class SinglePageSimpleTests
    {
        // Letter values which are filtered out before the extracted page text is compared
        // with the expected text; currently only the zero-width space (U+200B).
        private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
        {
            "\u200B"
        };
        /// <summary>
        /// Gets the full path of the single page test document, located in the
        /// "Integration/Documents" folder three levels above the application base directory.
        /// </summary>
        private static string GetFilename()
        {
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
            var absoluteFolder = Path.GetFullPath(relativeFolder);
            return Path.Combine(absoluteFolder, "Single Page Simple - from google drive.pdf");
        }
        /// <summary>
        /// The single page document reports exactly one page.
        /// </summary>
        [Fact]
        public void HasCorrectNumberOfPages()
        {
            var bytes = File.ReadAllBytes(GetFilename());
            using (var document = PdfDocument.Open(bytes))
            {
                var pageCount = document.NumberOfPages;
                Assert.Equal(1, pageCount);
            }
        }
        /// <summary>
        /// Page 1 can be retrieved and reports its own number as 1.
        /// </summary>
        [Fact]
        public void CanAccessPage()
        {
            var bytes = File.ReadAllBytes(GetFilename());
            using (var document = PdfDocument.Open(bytes))
            {
                var firstPage = document.GetPage(1);
                Assert.NotNull(firstPage);
                Assert.Equal(1, firstPage.Number);
            }
        }
        /// <summary>
        /// Requesting page 0 throws an <see cref="ArgumentOutOfRangeException"/>.
        /// </summary>
        [Fact]
        public void AccessPageLowerThanOneThrows()
        {
            var bytes = File.ReadAllBytes(GetFilename());
            using (var document = PdfDocument.Open(bytes))
            {
                Action getPageZero = () => document.GetPage(0);
                Assert.Throws<ArgumentOutOfRangeException>(getPageZero);
            }
        }
        /// <summary>
        /// Page 1 has a width of 612 and a height of 792.
        /// </summary>
        [Fact]
        public void PageHasCorrectDimensions()
        {
            var bytes = File.ReadAllBytes(GetFilename());
            using (var document = PdfDocument.Open(bytes))
            {
                var firstPage = document.GetPage(1);
                Assert.Equal(612, firstPage.Width);
                Assert.Equal(792, firstPage.Height);
            }
        }
        /// <summary>
        /// The letters of page 1, with the ignored hidden characters removed, join to the expected text.
        /// </summary>
        [Fact]
        public void PageHasCorrectTextIgnoringHiddenCharacters()
        {
            var bytes = File.ReadAllBytes(GetFilename());
            using (var document = PdfDocument.Open(bytes))
            {
                const string expected =
                    "This is the document title  There is some lede text here  And then another line of text. ";
                var firstPage = document.GetPage(1);
                var visibleValues = firstPage.Letters
                    .Select(letter => letter.Value)
                    .Where(value => !IgnoredHiddenCharacters.Contains(value));
                var actual = string.Concat(visibleValues);
                Assert.Equal(expected, actual);
            }
        }
        /// <summary>
        /// Gets the expected letter positions for the test document, one record per line of the
        /// embedded tab separated data (labelled as coming from PdfBox).
        /// </summary>
        /// <returns>The parsed position data in the order the lines appear.</returns>
        private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
        {
            // X    Y   Width   Letter  FontSize    Font
            const string fromPdfBox = @"72	105	9.771912	T	21	ArialMT
 81.77106	105	8.897049	h	21	ArialMT
 90.66733	105	3.554138	i	21	ArialMT
 94.22115	105	7.998741	s	21	ArialMT
 102.2192	105	0		21	Gautami
 106.6634	105	0		21	Gautami
 106.6634	105	3.554131	i	21	ArialMT
 110.2173	105	7.998749	s	21	ArialMT
 118.2153	105	0		21	Gautami
 122.6595	105	0		21	Gautami
 122.6595	105	4.444618	t	21	ArialMT
 127.1038	105	8.897049	h	21	ArialMT
 136	105	8.897049	e	21	ArialMT
 144.8963	105	0		21	Gautami
 149.3405	105	0		21	Gautami
 149.3405	105	8.897049	d	21	ArialMT
 158.2368	105	8.897049	o	21	ArialMT
 167.1331	105	7.998749	c	21	ArialMT
 175.1311	105	8.897049	u	21	ArialMT
 184.0274	105	13.32605	m	21	ArialMT
 197.3523	105	8.897049	e	21	ArialMT
 206.2485	105	8.897049	n	21	ArialMT
 215.1448	105	4.444611	t	21	ArialMT
 219.5891	105	0		21	Gautami
 224.0333	105	0		21	Gautami
 224.0333	105	4.444611	t	21	ArialMT
 228.4775	105	3.554138	i	21	ArialMT
 232.0313	105	4.444611	t	21	ArialMT
 236.4756	105	3.554123	l	21	ArialMT
 240.0294	105	8.897049	e	21	ArialMT
 72	143.25	6.716187	T	14	ArialMT
 78.71446	143.25	6.114899	h	14	ArialMT
 84.8278	143.25	6.114891	e	14	ArialMT
 90.94113	143.25	3.661423	r	14	ArialMT
 94.60161	143.25	6.114899	e	14	ArialMT
 100.7149	143.25	0		14	Gautami
 103.7689	143.25	0		14	Gautami
 103.7689	143.25	2.442749	i	14	ArialMT
 106.211	143.25	5.497505	s	14	ArialMT
 111.7071	143.25	0		14	Gautami
 114.7611	143.25	0		14	Gautami
 114.7611	143.25	5.497505	s	14	ArialMT
 120.2572	143.25	6.114899	o	14	ArialMT
 126.3705	143.25	9.158928	m	14	ArialMT
 135.5271	143.25	6.114899	e	14	ArialMT
 141.6404	143.25	0		14	Gautami
 144.6944	143.25	0		14	Gautami
 144.6944	143.25	2.442749	l	14	ArialMT
 147.1365	143.25	6.114899	e	14	ArialMT
 153.2499	143.25	6.114899	d	14	ArialMT
 159.3632	143.25	6.114899	e	14	ArialMT
 165.4765	143.25	0		14	Gautami
 168.5305	143.25	0		14	Gautami
 168.5305	143.25	3.054749	t	14	ArialMT
 171.5845	143.25	6.114899	e	14	ArialMT
 177.6978	143.25	5.497498	x	14	ArialMT
 183.1939	143.25	3.054764	t	14	ArialMT
 186.2479	143.25	0		14	Gautami
 189.3019	143.25	0		14	Gautami
 189.3019	143.25	6.114899	h	14	ArialMT
 195.4152	143.25	6.114899	e	14	ArialMT
 201.5285	143.25	3.661423	r	14	ArialMT
 205.189	143.25	6.114899	e	14	ArialMT
 72	173.25	7.33358	A	14	ArialMT
 79.3317	173.25	6.114891	n	14	ArialMT
 85.44504	173.25	6.114891	d	14	ArialMT
 91.55836	173.25	0		14	Gautami
 94.61235	173.25	0		14	Gautami
 94.61235	173.25	3.054756	t	14	ArialMT
 97.66633	173.25	6.114899	h	14	ArialMT
 103.7797	173.25	6.114899	e	14	ArialMT
 109.893	173.25	6.114899	n	14	ArialMT
 116.0063	173.25	0		14	Gautami
 119.0603	173.25	0		14	Gautami
 119.0603	173.25	6.114899	a	14	ArialMT
 125.1736	173.25	6.114899	n	14	ArialMT
 131.287	173.25	6.114899	o	14	ArialMT
 137.4003	173.25	3.054749	t	14	ArialMT
 140.4543	173.25	6.114899	h	14	ArialMT
 146.5676	173.25	6.114899	e	14	ArialMT
 152.6809	173.25	3.661423	r	14	ArialMT
 156.3414	173.25	0		14	Gautami
 159.3954	173.25	0		14	Gautami
 159.3954	173.25	2.442749	l	14	ArialMT
 161.8375	173.25	2.442734	i	14	ArialMT
 164.2796	173.25	6.114899	n	14	ArialMT
 170.393	173.25	6.114899	e	14	ArialMT
 176.5063	173.25	0		14	Gautami
 179.5603	173.25	0		14	Gautami
 179.5603	173.25	6.114899	o	14	ArialMT
 185.6736	173.25	3.054764	f	14	ArialMT
 188.7276	173.25	0		14	Gautami
 191.7816	173.25	0		14	Gautami
 191.7816	173.25	3.054764	t	14	ArialMT
 194.8355	173.25	6.114899	e	14	ArialMT
 200.9489	173.25	5.497482	x	14	ArialMT
 206.445	173.25	3.054764	t	14	ArialMT
 209.499	173.25	3.054764	.	14	ArialMT";
            // The line endings inside the verbatim string are whatever this source file was saved or
            // checked out with, so both Windows (CRLF) and Unix (LF) endings must be accepted.
            var lineEndings = new[] { "\r\n", "\n" };
            return fromPdfBox.Split(lineEndings, StringSplitOptions.RemoveEmptyEntries)
                .Select(AssertablePositionData.Parse)
                .ToList();
        }
        /// <summary>
        /// Gets a second set of expected letter positions for the test document, one record per
        /// line of the embedded tab separated data. The font size column is 0 on every line.
        /// </summary>
        /// <returns>The parsed position data in the order the lines appear.</returns>
        private static IReadOnlyList<AssertablePositionData> GetOtherPositionData1()
        {
            // These do not include the font information
            const string fromOther = @"72	105	9.758476	T	0	ArialMT
 81.77106	105	8.894608	h	0	ArialMT
 90.66733	105	3.551445	i	0	ArialMT
 94.22115	105	7.998749	s	0	ArialMT
 102.2192	105	4.431305	 	0	ArialMT
 102.2192	105	0		0	ArialMT
 106.6634	105	3.551445	i	0	ArialMT
 106.6634	105	0		0	ArialMT
 110.2173	105	7.998749	s	0	ArialMT
 118.2153	105	0		0	ArialMT
 118.2153	105	4.431305	 	0	ArialMT
 122.6595	105	4.431305	t	0	ArialMT
 122.6595	105	0		0	ArialMT
 127.1038	105	8.894608	h	0	ArialMT
 136	105	8.894608	e	0	ArialMT
 144.8963	105	4.431305	 	0	ArialMT
 144.8963	105	0		0	ArialMT
 149.3405	105	8.894608	d	0	ArialMT
 149.3405	105	0		0	ArialMT
 158.2368	105	8.894608	o	0	ArialMT
 167.1331	105	7.998749	c	0	ArialMT
 175.1311	105	8.894608	u	0	ArialMT
 184.0274	105	13.32591	m	0	ArialMT
 197.3523	105	8.894608	e	0	ArialMT
 206.2485	105	8.894608	n	0	ArialMT
 215.1448	105	4.431305	t	0	ArialMT
 219.5891	105	4.431305	 	0	ArialMT
 219.5891	105	0		0	ArialMT
 224.0333	105	4.431305	t	0	ArialMT
 224.0333	105	0		0	ArialMT
 228.4775	105	3.551453	i	0	ArialMT
 232.0313	105	4.431305	t	0	ArialMT
 236.4756	105	3.551453	l	0	ArialMT
 240.0294	105	8.894608	e	0	ArialMT
 248.918	105	4.431305	 	0	ArialMT
 72	128.25	3.045616	 	0	ArialMT
 72	143.25	6.706947	T	0	ArialMT
 78.71446	143.25	6.11322	h	0	ArialMT
 84.8278	143.25	6.11322	e	0	ArialMT
 90.94113	143.25	3.661331	r	0	ArialMT
 94.60161	143.25	6.11322	e	0	ArialMT
 100.7149	143.25	3.045616	 	0	ArialMT
 100.7149	143.25	0		0	ArialMT
 103.7689	143.25	2.440887	i	0	ArialMT
 103.7689	143.25	0		0	ArialMT
 106.211	143.25	5.497498	s	0	ArialMT
 111.7071	143.25	3.045616	 	0	ArialMT
 111.7071	143.25	0		0	ArialMT
 114.7611	143.25	0		0	ArialMT
 114.7611	143.25	5.497498	s	0	ArialMT
 120.2572	143.25	6.11322	o	0	ArialMT
 126.3705	143.25	9.158836	m	0	ArialMT
 135.5271	143.25	6.11322	e	0	ArialMT
 141.6404	143.25	0		0	ArialMT
 141.6404	143.25	3.045609	 	0	ArialMT
 144.6944	143.25	2.440887	l	0	ArialMT
 144.6944	143.25	0		0	ArialMT
 147.1365	143.25	6.11322	e	0	ArialMT
 153.2499	143.25	6.11322	d	0	ArialMT
 159.3632	143.25	6.11322	e	0	ArialMT
 165.4765	143.25	0		0	ArialMT
 165.4765	143.25	3.045609	 	0	ArialMT
 168.5305	143.25	3.045609	t	0	ArialMT
 168.5305	143.25	0		0	ArialMT
 171.5845	143.25	6.11322	e	0	ArialMT
 177.6978	143.25	5.497498	x	0	ArialMT
 183.1939	143.25	3.045609	t	0	ArialMT
 186.2479	143.25	0		0	ArialMT
 186.2479	143.25	3.045609	 	0	ArialMT
 189.3019	143.25	6.11322	h	0	ArialMT
 189.3019	143.25	0		0	ArialMT
 195.4152	143.25	6.11322	e	0	ArialMT
 201.5285	143.25	3.661331	r	0	ArialMT
 205.189	143.25	6.11322	e	0	ArialMT
 211.3008	143.25	3.045609	 	0	ArialMT
 72	158.25	3.045616	 	0	ArialMT
 72	173.25	7.32267	A	0	ArialMT
 79.3317	173.25	6.11322	n	0	ArialMT
 85.44504	173.25	6.11322	d	0	ArialMT
 91.55836	173.25	3.045616	 	0	ArialMT
 91.55836	173.25	0		0	ArialMT
 94.61235	173.25	0		0	ArialMT
 94.61235	173.25	3.045616	t	0	ArialMT
 97.66633	173.25	6.11322	h	0	ArialMT
 103.7797	173.25	6.11322	e	0	ArialMT
 109.893	173.25	6.11322	n	0	ArialMT
 116.0063	173.25	0		0	ArialMT
 116.0063	173.25	3.045616	 	0	ArialMT
 119.0603	173.25	6.11322	a	0	ArialMT
 119.0603	173.25	0		0	ArialMT
 125.1736	173.25	6.11322	n	0	ArialMT
 131.287	173.25	6.11322	o	0	ArialMT
 137.4003	173.25	3.045609	t	0	ArialMT
 140.4543	173.25	6.11322	h	0	ArialMT
 146.5676	173.25	6.11322	e	0	ArialMT
 152.6809	173.25	3.661331	r	0	ArialMT
 156.3414	173.25	3.045609	 	0	ArialMT
 156.3414	173.25	0		0	ArialMT
 159.3954	173.25	2.440887	l	0	ArialMT
 159.3954	173.25	0		0	ArialMT
 161.8375	173.25	2.440887	i	0	ArialMT
 164.2796	173.25	6.11322	n	0	ArialMT
 170.393	173.25	6.11322	e	0	ArialMT
 176.5063	173.25	3.045609	 	0	ArialMT
 176.5063	173.25	0		0	ArialMT
 179.5603	173.25	6.11322	o	0	ArialMT
 179.5603	173.25	0		0	ArialMT
 185.6736	173.25	3.045609	f	0	ArialMT
 188.7276	173.25	0		0	ArialMT
 188.7276	173.25	3.045609	 	0	ArialMT
 191.7816	173.25	3.045609	t	0	ArialMT
 191.7816	173.25	0		0	ArialMT
 194.8355	173.25	6.11322	e	0	ArialMT
 200.9489	173.25	5.497498	x	0	ArialMT
 206.445	173.25	3.045609	t	0	ArialMT
 209.499	173.25	3.045609	.	0	ArialMT
 212.543	173.25	3.045609	 	0	ArialMT";
            // The line endings inside the verbatim string are whatever this source file was saved or
            // checked out with, so both Windows (CRLF) and Unix (LF) endings must be accepted.
            var lineEndings = new[] { "\r\n", "\n" };
            return fromOther.Split(lineEndings, StringSplitOptions.RemoveEmptyEntries)
                .Select(AssertablePositionData.Parse)
                .ToList();
        }
    }
 }
--- a/src/UglyToad.Pdf/Content/Page.cs
+++ b/src/UglyToad.Pdf/Content/Page.cs
@ -10,11 +10,21 @@
        /// </summary>
        public int Number { get; }
-        public MediaBox MediaBox { get; }
+        internal MediaBox MediaBox { get; }
        internal PageContent Content { get; }
-        public IReadOnlyList<Letter> Text => Content?.Letters ?? new Letter[0];
+        public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
        /// <summary>
        /// Gets the width of the page in points.
        /// </summary>
        public decimal Width { get; }
        /// <summary>
        /// Gets the height of the page in points.
        /// </summary>
        public decimal Height { get; }
        internal Page(int number, MediaBox mediaBox, PageContent content)
        {
@ -26,6 +36,9 @@
            Number = number;
            MediaBox = mediaBox;
            Content = content;
            Width = mediaBox.Bounds.Width;
            Height = mediaBox.Bounds.Height;
        }
    }
 }
--- a/src/UglyToad.Pdf/Content/Pages.cs
+++ b/src/UglyToad.Pdf/Content/Pages.cs
@ -75,7 +75,7 @@
            if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary))
            {
-                throw new InvalidOperationException("Could not find the page with number: " + pageNumber);
+                throw new ArgumentOutOfRangeException("Could not find the page with number: " + pageNumber);
            }
            var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);
--- a/src/UglyToad.Pdf/Parser/BaseParser.cs
+++ b/src/UglyToad.Pdf/Parser/BaseParser.cs
@ -1,941 +0,0 @@
 namespace UglyToad.Pdf.Parser
 {
    using System;
    using System.IO;
    using System.Text;
    using Cos;
    using IO;
    using Util;
    /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
    /**
     * This class is used to contain parsing logic that will be used by both the
     * PDFParser and the COSStreamParser.
     *
     * @author Ben Litchfield
     */
    public abstract class BaseParser
    {
        // Thresholds for object and generation numbers.
        // NOTE(review): presumably upper bounds used to reject implausible values when reading
        // object references - the usages are not visible here, confirm.
        private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L;
        private static readonly long GENERATION_NUMBER_THRESHOLD = 65535;
        // The number of characters in the decimal representation of long.MaxValue.
        static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString().Length;
        /**
         * Integer character codes for the lowercase letters e, n, d, s, t, r, a, m, o, b and j.
         * NOTE(review): presumably used to match keywords such as "endstream" and "endobj"
         * one character at a time - confirm against the usages.
         */
        protected static readonly int E = 'e';
        protected static readonly int N = 'n';
        protected static readonly int D = 'd';
        protected static readonly int S = 's';
        protected static readonly int T = 't';
        protected static readonly int R = 'r';
        protected static readonly int A = 'a';
        protected static readonly int M = 'm';
        protected static readonly int O = 'o';
        protected static readonly int B = 'b';
        protected static readonly int J = 'j';
        /**
         * The "def" keyword, a string constant that will be used for comparisons.
         */
        public static readonly string DEF = "def";
        /**
         * The "endobj" keyword, a string constant that will be used for comparisons.
         */
        protected static readonly string ENDOBJ_string = "endobj";
        /**
         * The "endstream" keyword, a string constant that will be used for comparisons.
         */
        protected static readonly string ENDSTREAM_string = "endstream";
        /**
         * The "stream" keyword, a string constant that will be used for comparisons.
         */
        protected static readonly string STREAM_string = "stream";
        /**
         * The "true" keyword, a string constant that will be used for comparisons.
         */
        private static readonly string TRUE = "true";
        /**
         * The "false" keyword, a string constant that will be used for comparisons.
         */
        private static readonly string FALSE = "false";
        /**
         * The "null" keyword, a string constant that will be used for comparisons.
         */
        private static readonly string NULL = "null";
        /**
         * ASCII code for line feed.
         */
        protected static readonly byte ASCII_LF = 10;
        /**
         * ASCII code for carriage return.
         */
        protected static readonly byte ASCII_CR = 13;
        // ASCII codes for the characters '0', '9' and space.
        private static readonly byte ASCII_ZERO = 48;
        private static readonly byte ASCII_NINE = 57;
        private static readonly byte ASCII_SPACE = 32;
        /**
         * This is the stream that will be read from.
         */
        protected readonly SequentialSource seqSource;
        /**
         * This is the document that will be parsed.
         */
        protected COSDocument document;
        /**
         * Creates a parser which reads from the given source.
         *
         * @param pdfSource the sequential source this parser will read from.
         */
        public BaseParser(SequentialSource pdfSource)
        {
            seqSource = pdfSource;
        }
        /**
         * Determine if a character is a hexadecimal digit.
         *
         * @param ch The character
         * @return true for a digit (as reported by char.IsDigit) or a letter in a-f / A-F, otherwise false.
         */
        private static bool isHexDigit(char ch)
        {
            if (char.IsDigit(ch))
            {
                return true;
            }
            var isLowerHexLetter = 'a' <= ch && ch <= 'f';
            var isUpperHexLetter = 'A' <= ch && ch <= 'F';
            return isLowerHexLetter || isUpperHexLetter;
        }
        /**
         * Skips any spaces followed by a single end of line marker (CRLF, LF or a lone CR).
         * Any other value which was read is pushed back onto the source.
         */
        protected void skipWhiteSpaces()
        {
            //PDF Ref 3.2.7 A stream must be followed by either
            //a CRLF or LF but nothing else.
            //see brother_scan_cover.pdf, it adds whitespaces
            //after the stream but before the start of the
            //data, so just read those first
            int next;
            do
            {
                next = seqSource.read();
            }
            while (next == ASCII_SPACE);
            if (next == ASCII_LF)
            {
                return;
            }
            if (next != ASCII_CR)
            {
                //we are in an error.
                //but again we will do a lenient parsing and just assume that everything
                //is fine
                seqSource.unread(next);
                return;
            }
            next = seqSource.read();
            if (next != ASCII_LF)
            {
                //The spec says a lone CR is invalid but it happens in the real
                //world so we must support it.
                seqSource.unread(next);
            }
        }
        /**
         * This is really a bug in the Document creators code, but it caused a crash in PDFBox, the first bug was in this
         * format: /Title ( (5) /Creator which was patched in 1 place.
         *
         * However it missed the case where the number of opening and closing parenthesis isn't balanced
         *
         * The second bug was in this format /Title (c:\) /Producer
         *
         * This patch moves this code out of the parseCOSstring method, so it can be used twice.
         *
         * The next three bytes are inspected and then pushed back onto the source, so the read
         * position is unchanged when this method returns.
         *
         * @param bracesParameter the number of braces currently open.
         *
         * @return 0 if the following bytes indicate the end of the string, otherwise bracesParameter unchanged.
         * @throws IOException
         */
        private int checkForEndOfstring(int bracesParameter)
        {
            int braces = bracesParameter;
            var nextThreeBytes = new byte[3];
            int amountRead = seqSource.read(nextThreeBytes);
            // Check the next 3 bytes if available
            // The following cases are valid indicators for the end of the string
            // 1. Next line contains another COSObject: CR + LF + '/'
            // 2. CosDictionary ends in the next line: CR + LF + '>'
            // 3. Next line contains another COSObject: CR + '/'
            // 4. CosDictionary ends in the next line: CR + '>'
            if (amountRead == 3 && nextThreeBytes[0] == ASCII_CR)
            {
                byte second = nextThreeBytes[1];
                byte third = nextThreeBytes[2];
                // The third byte only counts when the second byte is LF (cases 1 and 2); previously a
                // misplaced parenthesis accepted CR + any byte + '>' as the end of the string.
                bool endsAfterCrLf = second == ASCII_LF && (third == '/' || third == '>');
                bool endsAfterCr = second == '/' || second == '>';
                if (endsAfterCrLf || endsAfterCr)
                {
                    braces = 0;
                }
            }
            if (amountRead > 0)
            {
                seqSource.unread(nextThreeBytes, 0, amountRead);
            }
            return braces;
        }
        /**
         * This will parse a PDF string.
         *
         * A hex string (starting with '&lt;') is delegated to parseCOSHexstring, otherwise a literal
         * string in parentheses is read. Escape sequences are decoded and every character is stored
         * as a single raw byte of the resulting string.
         *
         * @return The parsed PDF string.
         *
         * @throws IOException If there is an error reading from the stream or the string does not
         * start with '(' or '&lt;'.
         */
        protected CosString parseCOSstring()
        {
            char nextChar = (char)seqSource.read();
            if (nextChar == '<')
            {
                return parseCOSHexstring();
            }
            if (nextChar != '(')
            {
                throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
                nextChar + "' " + seqSource);
            }
            // The bytes are written straight to the memory stream. Writing through a text writer
            // would encode values above 127 as multiple bytes, would write integers as their decimal
            // text and would leave the stream empty unless the writer was flushed before ToArray().
            using (var output = new MemoryStream())
            {
                // This is the number of braces read
                int braces = 1;
                int c = seqSource.read();
                while (braces > 0 && c != -1)
                {
                    char ch = (char)c;
                    int nextc = -2; // not yet read
                    if (ch == ')')
                    {
                        braces--;
                        braces = checkForEndOfstring(braces);
                        if (braces != 0)
                        {
                            output.WriteByte(unchecked((byte)ch));
                        }
                    }
                    else if (ch == '(')
                    {
                        braces++;
                        output.WriteByte(unchecked((byte)ch));
                    }
                    else if (ch == '\\')
                    {
                        //patched by ram
                        char next = (char)seqSource.read();
                        switch (next)
                        {
                            case 'n':
                                output.WriteByte((byte)'\n');
                                break;
                            case 'r':
                                output.WriteByte((byte)'\r');
                                break;
                            case 't':
                                output.WriteByte((byte)'\t');
                                break;
                            case 'b':
                                output.WriteByte((byte)'\b');
                                break;
                            case 'f':
                                output.WriteByte((byte)'\f');
                                break;
                            case ')':
                                // PDFBox 276 /Title (c:\)
                                braces = checkForEndOfstring(braces);
                                if (braces != 0)
                                {
                                    output.WriteByte(unchecked((byte)next));
                                }
                                else
                                {
                                    output.WriteByte((byte)'\\');
                                }
                                break;
                            case '(':
                            case '\\':
                                output.WriteByte(unchecked((byte)next));
                                break;
                            case '\n':
                            case '\r':
                                //this is a break in the line so ignore it and the newline and continue
                                c = seqSource.read();
                                while (isEOL(c) && c != -1)
                                {
                                    c = seqSource.read();
                                }
                                nextc = c;
                                break;
                            case '0':
                            case '1':
                            case '2':
                            case '3':
                            case '4':
                            case '5':
                            case '6':
                            case '7':
                                {
                                    // An octal escape has between one and three digits.
                                    var octal = new StringBuilder();
                                    octal.Append(next);
                                    c = seqSource.read();
                                    char digit = (char)c;
                                    if (digit >= '0' && digit <= '7')
                                    {
                                        octal.Append(digit);
                                        c = seqSource.read();
                                        digit = (char)c;
                                        if (digit >= '0' && digit <= '7')
                                        {
                                            octal.Append(digit);
                                        }
                                        else
                                        {
                                            nextc = c;
                                        }
                                    }
                                    else
                                    {
                                        nextc = c;
                                    }
                                    int character = 0;
                                    try
                                    {
                                        character = Convert.ToInt32(octal.ToString(), 8);
                                    }
                                    catch (FormatException e)
                                    {
                                        throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
                                    }
                                    // Only the low order 8 bits are kept, values above 255 overflow.
                                    output.WriteByte(unchecked((byte)character));
                                    break;
                                }
                            default:
                                // dropping the backslash
                                // see 7.3.4.2 Literal strings for further information
                                output.WriteByte(unchecked((byte)next));
                                break;
                        }
                    }
                    else
                    {
                        output.WriteByte(unchecked((byte)ch));
                    }
                    if (nextc != -2)
                    {
                        c = nextc;
                    }
                    else
                    {
                        c = seqSource.read();
                    }
                }
                if (c != -1)
                {
                    seqSource.unread(c);
                }
                return new CosString(output.ToArray());
            }
        }
        /**
         * Parses a PDF hex string, failing fast when a character which is not allowed is found:
         * the digits read so far are kept (minus an unpaired trailing digit) and the rest of the
         * string up to the closing bracket is skipped. This makes it possible to detect malformed
         * input and continue at the next object start.
         *
         * We assume starting '&lt;' was already read.
         *
         * @return The parsed PDF string.
         *
         * @throws IOException If the end of the stream is reached before the closing bracket.
         */
        private CosString parseCOSHexstring()
        {
            var hexDigits = new StringBuilder();
            while (true)
            {
                int current = seqSource.read();
                if (isHexDigit((char)current))
                {
                    hexDigits.Append((char)current);
                    continue;
                }
                if (current == '>')
                {
                    break;
                }
                if (current < 0)
                {
                    throw new IOException("Missing closing bracket for hex string. Reached EOS.");
                }
                bool isSkippable = current == ' ' || current == '\n' || current == '\t'
                    || current == '\r' || current == '\b' || current == '\f';
                if (isSkippable)
                {
                    continue;
                }
                // an invalid character was found: discard the last
                // hex character if it is not part of a pair
                if (hexDigits.Length % 2 != 0)
                {
                    hexDigits.Length -= 1;
                }
                // skip everything up to the closing bracket
                do
                {
                    current = seqSource.read();
                }
                while (current != '>' && current >= 0);
                // the end of the stream might be reached while looking for the closing bracket,
                // this can happen for malformed PDFs only. Make sure that there is
                // no endless loop.
                if (current < 0)
                {
                    throw new IOException("Missing closing bracket for hex string. Reached EOS.");
                }
                break;
            }
            return CosString.ParseHex(hexDigits.ToString());
        }
        /**
         * Determine if a character terminates a PDF name.
         *
         * @param ch The character
         * @return true if the character terminates a PDF name, otherwise false.
         */
        protected bool isEndOfName(int ch)
        {
            // Blank characters (space, CR, LF, tab, NUL, form feed) end a name.
            if (ch == ASCII_SPACE || ch == ASCII_CR || ch == ASCII_LF || ch == 9 || ch == 0 || ch == '\f')
            {
                return true;
            }
            // So do the delimiter characters listed below.
            switch (ch)
            {
                case '>':
                case '<':
                case '[':
                case '/':
                case ']':
                case ')':
                case '(':
                    return true;
                default:
                    return false;
            }
        }
        /**
         * Returns true if a byte sequence is valid UTF-8.
         */
        private bool isValidUTF8(byte[] input)
        {
            // A missing buffer cannot be valid UTF-8 (the previous implementation also reported false here).
            if (input == null)
            {
                return false;
            }
            // Encoding.UTF8 silently substitutes U+FFFD for malformed sequences and never throws,
            // which made the old check return true for every input. An encoding created with
            // throwOnInvalidBytes = true raises DecoderFallbackException on the first bad sequence,
            // including a multi-byte sequence truncated at the end of the buffer.
            var strictUtf8 = new UTF8Encoding(false, true);
            try
            {
                strictUtf8.GetCharCount(input);
                return true;
            }
            catch (DecoderFallbackException)
            {
                return false;
            }
        }
        /**
         * This will parse a bool object from the stream.
         *
         * @return The parsed bool object.
         *
         * @throws IOException If an IO error occurs during parsing.
         */
        protected CosBoolean parsebool()
        {
            // Look at the next character without consuming it to decide which keyword to expect.
            char c = (char)seqSource.peek();
            if (c == 't')
            {
                // Consume four bytes and require them to match the TRUE keyword.
                string truestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4));
                if (!truestring.Equals(TRUE))
                {
                    throw new IOException("Error parsing bool: expected='true' actual='" + truestring
                    + "' at offset " + seqSource.getPosition());
                }
                return CosBoolean.True;
            }
            if (c == 'f')
            {
                // Consume five bytes and require them to match the FALSE keyword.
                string falsestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5));
                if (!falsestring.Equals(FALSE))
                {
                    // The message previously claimed expected='true', which was a copy-paste slip.
                    throw new IOException("Error parsing bool: expected='false' actual='" + falsestring
                    + "' at offset " + seqSource.getPosition());
                }
                return CosBoolean.False;
            }
            // Anything other than 't' or 'f' cannot start a boolean.
            throw new IOException("Error parsing bool expected='t or f' actual='" + c
            + "' at offset " + seqSource.getPosition());
        }
        /**
         * This will read the next string from the stream.
         *
         * @return The string that was read from the stream, never null.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        protected string readstring()
        {
            SkipSpaces();
            var token = new StringBuilder();
            int current;
            // Gather characters until a name terminator or the end of the stream shows up.
            for (current = seqSource.read();
                !isEndOfName((char)current) && current != -1;
                current = seqSource.read())
            {
                token.Append((char)current);
            }
            // The terminator is not part of the token, so hand it back to the source.
            if (current != -1)
            {
                seqSource.unread(current);
            }
            return token.ToString();
        }
        /**
         * Read one string and throw an exception if it is not the expected value.
         *
         * @param expectedstring the string value that is expected.
         * @throws IOException if the string char is not the expected value or if an
         * I/O error occurs.
         */
        // Delegates to the two-argument overload, passing false for skipSpaces.
        protected void readExpectedstring(string expectedstring) => readExpectedstring(expectedstring, false);
        /**
         * Reads given pattern from {@link #seqSource}. Skipping whitespace at start and end if wanted.
         * 
         * @param expectedstring pattern to be skipped
         * @param skipSpaces if set to true spaces before and after the string will be skipped
         * @throws IOException if pattern could not be read
         */
        protected void readExpectedstring(string expectedstring, bool skipSpaces)
        {
            // NOTE(review): the skipSpaces argument is never read here; spaces are skipped
            // before and after the pattern regardless of its value.
            SkipSpaces();
            for (var index = 0; index < expectedstring.Length; index++)
            {
                char expected = expectedstring[index];
                int actual = seqSource.read();
                if (actual == expected)
                {
                    continue;
                }
                throw new IOException("Expected string '" + expectedstring
                + "' but missed at character '" + expected + "' at offset "
                + seqSource.getPosition());
            }
            SkipSpaces();
        }
        /**
         * Read one char and throw an exception if it is not the expected value.
         *
         * @param ec the char value that is expected.
         * @throws IOException if the read char is not the expected value or if an
         * I/O error occurs.
         */
        protected void readExpectedChar(char ec)
        {
            // Consume exactly one character and compare it with the expected one.
            char actual = (char)seqSource.read();
            if (actual == ec)
            {
                return;
            }
            throw new IOException("expected='" + ec + "' actual='" + actual + "' at offset " + seqSource.getPosition());
        }
        /**
         * This will read the next string from the stream up to a certain length.
         *
         * @param length The length to stop reading at.
         *
         * @return The string that was read from the stream of length 0 to length.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        protected string readstring(int length)
        {
            SkipSpaces();
            int current = seqSource.read();
            // Pre-size the builder to the requested maximum length.
            var token = new StringBuilder(length);
            while (true)
            {
                // Stop at whitespace, ']', end of stream, the length limit or an opening delimiter.
                bool stop = isWhitespace(current) || isClosing(current) || current == -1
                            || token.Length >= length
                            || current == '[' || current == '<' || current == '(' || current == '/';
                if (stop)
                {
                    break;
                }
                token.Append((char)current);
                current = seqSource.read();
            }
            // Whatever ended the token is handed back to the source.
            if (current != -1)
            {
                seqSource.unread(current);
            }
            return token.ToString();
        }
        /**
         * This will tell if the next character is a closing brace( close of PDF array ).
         *
         * @return true if the next byte is ']', false otherwise.
         *
         * @throws IOException If an IO error occurs.
         */
        // Peeks at the next byte (without consuming it) and tests it with isClosing(int).
        protected bool isClosing() => isClosing(seqSource.peek());
        /**
         * This will tell if the next character is a closing brace( close of PDF array ).
         *
         * @param c The character to check against end of line
         * @return true if the next byte is ']', false otherwise.
         */
        protected bool isClosing(int c)
        {
            // ']' is the character that closes a PDF array.
            const int arrayEnd = ']';
            return arrayEnd == c;
        }
        /**
         * This will read bytes until the first end of line marker occurs.
         * NOTE: The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes
         * which is an important detail if one wants to unread the line.
         *
         * @return The characters between the current position and the end of the line.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        protected string readLine()
        {
            if (seqSource.isEOF())
            {
                throw new IOException("Error: End-of-File, expected line");
            }
            var line = new StringBuilder(11);
            int current = seqSource.read();
            // Copy characters until a CR, an LF or the end of the stream is reached.
            while (current != -1 && !isEOL(current))
            {
                line.Append((char)current);
                current = seqSource.read();
            }
            // A CR directly followed by an LF counts as a single line ending, so swallow the LF too.
            if (isCR(current) && isLF(seqSource.peek()))
            {
                seqSource.read();
            }
            return line.ToString();
        }
        /**
         * This will tell if the next byte to be read is an end of line byte.
         *
         * @return true if the next byte is 0x0A or 0x0D.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        // Peeks at the next byte (without consuming it) and tests it with isEOL(int).
        protected bool isEOL() => isEOL(seqSource.peek());
        /**
         * This will tell if the next byte to be read is an end of line byte.
         *
         * @param c The character to check against end of line
         * @return true if the next byte is 0x0A or 0x0D.
         */
        protected bool isEOL(int c)
        {
            // A line feed or a carriage return each count as an end-of-line byte.
            if (isLF(c))
            {
                return true;
            }
            return isCR(c);
        }
        // True when the value equals the ASCII_LF constant.
        private bool isLF(int c) => c == ASCII_LF;
        // True when the value equals the ASCII_CR constant.
        private bool isCR(int c) => c == ASCII_CR;
        /**
         * This will tell if the next byte is whitespace or not.
         *
         * @return true if the next byte in the stream is a whitespace character.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        // Peeks at the next byte (without consuming it) and tests it with isWhitespace(int).
        protected bool isWhitespace() => isWhitespace(seqSource.peek());
        /**
         * This will tell if a character is whitespace or not.  These values are
         * specified in table 1 (page 12) of ISO 32000-1:2008.
         * @param c The character to check against whitespace
         * @return true if the character is a whitespace character.
         */
        protected bool isWhitespace(int c)
        {
            // Space and the two line-ending bytes.
            if (c == ASCII_SPACE || c == ASCII_CR || c == ASCII_LF)
            {
                return true;
            }
            // NUL (0), horizontal tab (9) and form feed (12).
            return c == 0 || c == 9 || c == 12;
        }
        /**
         * This will tell if the next byte is a space or not.
         *
         * @return true if the next byte in the stream is a space character.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        // Peeks at the next byte (without consuming it) and tests it with isSpace(int).
        protected bool isSpace() => isSpace(seqSource.peek());
        /**
         * This will tell if the given value is a space or not.
         * 
         * @param c The character to check against space
         * @return true if the next byte in the stream is a space character.
         */
        // True when the value equals the ASCII_SPACE constant.
        protected bool isSpace(int c) => c == ASCII_SPACE;
        /**
         * This will tell if the next byte is a digit or not.
         *
         * @return true if the next byte in the stream is a digit.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        // Peeks at the next byte (without consuming it) and tests it with isDigit(int).
        protected bool isDigit() => isDigit(seqSource.peek());
        /**
         * This will tell if the given value is a digit or not.
         * 
         * @param c The character to be checked
         * @return true if the next byte in the stream is a digit.
         */
        protected static bool isDigit(int c)
        {
            // A digit lies in the inclusive range ASCII_ZERO..ASCII_NINE.
            bool outsideRange = c < ASCII_ZERO || c > ASCII_NINE;
            return !outsideRange;
        }
        /**
         * This will skip all spaces and comments that are present.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        protected void SkipSpaces()
        {
            int current = seqSource.read();
            while (true)
            {
                if (current == 37)
                {
                    // 37 is '%', which starts a comment: discard everything up to the
                    // end of the line (or the end of the stream).
                    do
                    {
                        current = seqSource.read();
                    }
                    while (!isEOL(current) && current != -1);
                }
                else if (isWhitespace(current))
                {
                    current = seqSource.read();
                }
                else
                {
                    break;
                }
            }
            // The first significant byte belongs to the caller, so put it back.
            if (current != -1)
            {
                seqSource.unread(current);
            }
        }
        /**
         * This will read a long from the Stream and throw an {@link IOException} if
         * the long value is negative or has more than 10 digits (i.e. : bigger than
         * {@link #OBJECT_NUMBER_THRESHOLD})
         *
         * @return the object number being read.
         * @throws IOException if an I/O error occurs
         */
        protected long readObjectNumber()
        {
            long objectNumber = readLong();
            // Valid object numbers are non-negative and strictly below OBJECT_NUMBER_THRESHOLD.
            bool inRange = objectNumber >= 0 && objectNumber < OBJECT_NUMBER_THRESHOLD;
            if (!inRange)
            {
                throw new IOException("Object Number '" + objectNumber + "' has more than 10 digits or is negative");
            }
            return objectNumber;
        }
        /**
         * This will read an integer from the Stream and throw an {@link IOException} if the integer value is negative
         * or has more than the maximum object revision (i.e. : bigger than {@link #GENERATION_NUMBER_THRESHOLD})
         * @return the generation number being read.
         * @throws IOException if an I/O error occurs
         */
        protected int readGenerationNumber()
        {
            int generation = readInt();
            // Valid generation numbers lie between 0 and GENERATION_NUMBER_THRESHOLD inclusive.
            bool inRange = generation >= 0 && generation <= GENERATION_NUMBER_THRESHOLD;
            if (!inRange)
            {
                throw new IOException("Generation Number '" + generation + "' has more than 5 digits");
            }
            return generation;
        }
        /**
         * This will read an integer from the stream.
         *
         * @return The integer that was read from the stream.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        protected int readInt()
        {
            SkipSpaces();
            StringBuilder intBuffer = readstringNumber();
            string token = intBuffer.ToString();
            try
            {
                // PDF numbers are machine-readable, so parse them independently of the current culture.
                return int.Parse(token, System.Globalization.CultureInfo.InvariantCulture);
            }
            catch (Exception e) when (e is FormatException || e is OverflowException)
            {
                // int.Parse reports out-of-range values with OverflowException rather than
                // FormatException; both mean "not a usable integer" here, so both restore the
                // consumed bytes and surface as IOException like every other parse failure.
                seqSource.unread(OtherEncodings.StringAsLatin1Bytes(token));
                throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e);
            }
        }
        /**
         * This will read a long from the stream.
         *
         * @return The long that was read from the stream.
         *
         * @throws IOException If there is an error reading from the stream.
         */
        protected long readLong()
        {
            SkipSpaces();
            StringBuilder longBuffer = readstringNumber();
            try
            {
                // PDF numbers are machine-readable, so parse them independently of the current culture.
                return long.Parse(longBuffer.ToString(), System.Globalization.CultureInfo.InvariantCulture);
            }
            catch (Exception e) when (e is FormatException || e is OverflowException)
            {
                // long.Parse reports out-of-range values with OverflowException rather than
                // FormatException; both mean "not a usable long" here, so both restore the
                // consumed bytes and surface as IOException like every other parse failure.
                seqSource.unread(OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString()));
                throw new IOException(
                    $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{longBuffer}\'", e);
            }
        }
        /**
         * This method is used to read a token by the {@linkplain #readInt()} method
         * and the {@linkplain #readLong()} method.
         *
         * @return the token to parse as integer or long by the calling method.
         * @throws IOException throws by the {@link #seqSource} methods.
         */
        protected StringBuilder readstringNumber()
        {
            var token = new StringBuilder();
            int current;
            while (true)
            {
                current = seqSource.read();
                // Space, LF, CR, '<' (60), '[', '(', NUL and end of stream all end the number token.
                bool terminator = current == ASCII_SPACE
                                  || current == ASCII_LF
                                  || current == ASCII_CR
                                  || current == 60
                                  || current == '['
                                  || current == '('
                                  || current == 0
                                  || current == -1;
                if (terminator)
                {
                    break;
                }
                token.Append((char)current);
                // Give up once the token grows past MAX_LENGTH_LONG characters.
                if (token.Length > MAX_LENGTH_LONG)
                {
                    throw new IOException("Number '" + token +
                    "' is getting too long, stop reading at offset " + seqSource.getPosition());
                }
            }
            // The terminating byte is not part of the number, so hand it back.
            if (current != -1)
            {
                seqSource.unread(current);
            }
            return token;
        }
    }
 }
--- a/src/UglyToad.Pdf/PdfDocument.cs
+++ b/src/UglyToad.Pdf/PdfDocument.cs
@ -25,10 +25,15 @@
        private readonly ParsingCachingProviders cachingProviders;
        [NotNull]
-        public Catalog Catalog { get; }
+        internal Catalog Catalog { get; }
        [NotNull]
-        public Pages Pages { get; }
+        internal Pages Pages { get; }
        /// <summary>
        /// Get the number of pages in this document.
        /// </summary>
        public int NumberOfPages
        {
            // Forwards to the Count of the Pages property.
            get { return Pages.Count; }
        }
        internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
            bool isLenientParsing, 
@ -50,6 +55,16 @@
        public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null)
        {
            // Construction is delegated to the factory; options may be left null.
            return PdfDocumentFactory.Open(fileBytes, options);
        }
        public static PdfDocument Open(string filename, ParsingOptions options = null)
        {
            // Construction is delegated to the factory; options may be left null.
            return PdfDocumentFactory.Open(filename, options);
        }
        /// <summary>
        /// Get the page with the specified page number.
        /// </summary>
        /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
        /// <returns>The page.</returns>
        // Forwards the lookup to the Pages property.
        public Page GetPage(int pageNumber) => Pages.GetPage(pageNumber);
        public void Dispose()
        {
            try