Encapsulate the internals better and improve the API for PdfDocument; delete old code and tidy tests. Expand the README.

2025-06-28 15:30:17 +08:00 · 2017-12-28 13:14:03 +00:00 · 2017-12-28 13:14:03 +00:00 · b1d28a5af8
commit b1d28a5af8
parent 940c51e2fb
8 changed files with 437 additions and 966 deletions
--- a/README.md
+++ b/README.md
@ -3,4 +3,40 @@
 [![Build status](https://ci.appveyor.com/api/projects/status/ni7et2j2ml60pdi3?svg=true)](https://ci.appveyor.com/project/EliotJones/pdf)
 [![codecov](https://codecov.io/gh/UglyToad/Pdf/branch/master/graph/badge.svg)](https://codecov.io/gh/UglyToad/Pdf)

-Convert the [PdfBox](https://github.com/apache/pdfbox) code to C#.
+The aim of this project is to convert the [PdfBox](https://github.com/apache/pdfbox) code to C# in order to provide a properly open source (i.e. no copyleft) solution for inspecting PDF documents. This uses the Apache 2.0 licence.
+
+## Status ##
+
+There is a lot left to do for this project. The initial minimum viable product, when released as an alpha, will provide:
+
+ Page counts and sizes (in points) for a document.
+ Access to the text contents of each page. Note that since PDF has no concept of a "word" it will be up to the consumer of the text to work out where the words are within the text.
+ (Possible) The locations and bounds of each letter on the page.
+
+For the initial alpha release, all files will be loaded in full rather than streamed, so large files will not be supported.
+
+Eventually the library should support all existing PdfBox operations such as accessing graphical elements, form elements as well as creating PDF documents.
+
+## Usage ##
+
+The initial public API will be as limited as possible to allow extensive refactoring to take place. The proposed usage is as follows:
+
+    using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf"))
+    {
+        int pageCount = document.NumberOfPages;
+
+        Page page = document.GetPage(1);
+
+        decimal widthInPoints = page.Width;
+        decimal heightInPoints = page.Height;
+
+        string text = page.Text;
+    }
+
+`PdfDocument` will also support opening from byte arrays (and, eventually, from streams):
+
+    byte[] fileBytes = File.ReadAllBytes(@"C:\my-file.pdf");
+    using (PdfDocument document = PdfDocument.Open(fileBytes))
+    {
+        int numberOfPages = document.NumberOfPages;
+    }
--- a/src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs
+++ b/src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs
@ -0,0 +1,39 @@
+namespace UglyToad.Pdf.Tests.Integration
+{
+    using System;
+    using System.Globalization;
+
+    /// <summary>
+    /// A single row of expected letter position data, parsed from a tab separated line of the form
+    /// <c>X, Y, Width, Text, FontSize, FontName</c>.
+    /// </summary>
+    public class AssertablePositionData
+    {
+        /// <summary>
+        /// The X value from the first column.
+        /// </summary>
+        public decimal X { get; set; }
+
+        /// <summary>
+        /// The Y value from the second column.
+        /// </summary>
+        public decimal Y { get; set; }
+
+        /// <summary>
+        /// The width value from the third column.
+        /// </summary>
+        public decimal Width { get; set; }
+
+        /// <summary>
+        /// The raw text from the fourth column; it is not trimmed so it may be empty or whitespace.
+        /// </summary>
+        public string Text { get; set; }
+
+        /// <summary>
+        /// The font size value from the fifth column.
+        /// </summary>
+        public decimal FontSize { get; set; }
+
+        /// <summary>
+        /// The font name from the sixth column.
+        /// </summary>
+        public string FontName { get; set; }
+
+        /// <summary>
+        /// Parses one tab separated line into an <see cref="AssertablePositionData"/>.
+        /// </summary>
+        /// <param name="line">The line containing exactly 6 tab separated parts.</param>
+        /// <returns>The parsed position data.</returns>
+        /// <exception cref="ArgumentNullException">The <paramref name="line"/> is null.</exception>
+        /// <exception cref="ArgumentException">The line does not contain exactly 6 parts.</exception>
+        /// <exception cref="FormatException">A numeric column is not a valid decimal.</exception>
+        public static AssertablePositionData Parse(string line)
+        {
+            if (line == null)
+            {
+                throw new ArgumentNullException(nameof(line));
+            }
+
+            var parts = line.Split('\t');
+
+            if (parts.Length != 6)
+            {
+                throw new ArgumentException($"Expected 6 parts to the line, instead got {parts.Length}");
+            }
+
+            // The fixture data always uses '.' as the decimal separator, so parsing must not
+            // depend on the culture of the machine running the tests.
+            var culture = CultureInfo.InvariantCulture;
+
+            return new AssertablePositionData
+            {
+                X = decimal.Parse(parts[0], culture),
+                Y = decimal.Parse(parts[1], culture),
+                Width = decimal.Parse(parts[2], culture),
+                Text = parts[3],
+                FontSize = decimal.Parse(parts[4], culture),
+                FontName = parts[5]
+            };
+        }
+    }
+}
--- a/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs
+++ b/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs
@ -43,25 +43,6 @@

    public class PdfParserTests
    {
-        [Fact]
-        public void CanParseSimpleGoogleDocsDocument()
-        {
-            // To see the text as shown in Visual Studio or Notepad++, use the OtherEncodings.BytesAsLatin1String()
-            var file = GetNthFilename();
-            
-            using (var document = PdfDocument.Open(File.ReadAllBytes(file)))
-            {
-                Assert.Equal(1, document.Pages.Count);
-
-                var page = document.Pages.GetPage(1);
-                Assert.Equal(1, page.Number);
-
-                var text = string.Join(string.Empty, page.Content.Letters.Select(x => x.Value)).Replace("\u200B", string.Empty);
-
-                Assert.Equal("This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty), text.Replace(" ", string.Empty));
-            }
-        }
-
        [Fact]
        public void CanDecompressNormalObjectStream()
        {
--- a/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs
+++ b/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs
@ -0,0 +1,328 @@
+// ReSharper disable AccessToDisposedClosure
+namespace UglyToad.Pdf.Tests.Integration
+{
+    using System;
+    using System.Collections.Generic;
+    using System.IO;
+    using System.Linq;
+    using Xunit;
+
+    public class SinglePageSimpleTests
+    {
+        private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
+        {
+            "\u200B"
+        };
+
+        /// <summary>
+        /// Builds the absolute path to the single page sample document, resolved from the
+        /// Integration/Documents folder three levels above the application base directory.
+        /// </summary>
+        private static string GetFilename()
+        {
+            string baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
+            string relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
+            string absoluteFolder = Path.GetFullPath(relativeFolder);
+
+            return Path.Combine(absoluteFolder, "Single Page Simple - from google drive.pdf");
+        }
+
+        // The sample document is expected to contain exactly one page.
+        [Fact]
+        public void HasCorrectNumberOfPages()
+        {
+            byte[] bytes = File.ReadAllBytes(GetFilename());
+
+            using (var document = PdfDocument.Open(bytes))
+            {
+                var pageCount = document.NumberOfPages;
+
+                Assert.Equal(1, pageCount);
+            }
+        }
+
+        // Requesting page 1 returns a non-null page which reports its own number as 1.
+        [Fact]
+        public void CanAccessPage()
+        {
+            byte[] bytes = File.ReadAllBytes(GetFilename());
+
+            using (var document = PdfDocument.Open(bytes))
+            {
+                var firstPage = document.GetPage(1);
+
+                Assert.NotNull(firstPage);
+
+                var reportedNumber = firstPage.Number;
+                Assert.Equal(1, reportedNumber);
+            }
+        }
+
+        // Requesting page 0 must be rejected with an ArgumentOutOfRangeException.
+        [Fact]
+        public void AccessPageLowerThanOneThrows()
+        {
+            byte[] bytes = File.ReadAllBytes(GetFilename());
+
+            using (var document = PdfDocument.Open(bytes))
+            {
+                Action requestPageZero = () => document.GetPage(0);
+
+                Assert.Throws<ArgumentOutOfRangeException>(requestPageZero);
+            }
+        }
+
+        // The first page of the sample document is expected to measure 612 by 792.
+        [Fact]
+        public void PageHasCorrectDimensions()
+        {
+            byte[] bytes = File.ReadAllBytes(GetFilename());
+
+            using (var document = PdfDocument.Open(bytes))
+            {
+                var firstPage = document.GetPage(1);
+
+                var actualWidth = firstPage.Width;
+                var actualHeight = firstPage.Height;
+
+                Assert.Equal(612, actualWidth);
+                Assert.Equal(792, actualHeight);
+            }
+        }
+
+        // Joins the value of every letter on the page, skipping the ignored hidden characters,
+        // and compares the result with the known text of the sample document.
+        [Fact]
+        public void PageHasCorrectTextIgnoringHiddenCharacters()
+        {
+            const string expected =
+                "This is the document title  There is some lede text here  And then another line of text. ";
+
+            byte[] bytes = File.ReadAllBytes(GetFilename());
+
+            using (var document = PdfDocument.Open(bytes))
+            {
+                var firstPage = document.GetPage(1);
+
+                var visibleValues = firstPage.Letters
+                    .Select(letter => letter.Value)
+                    .Where(value => !IgnoredHiddenCharacters.Contains(value));
+
+                var text = string.Join(string.Empty, visibleValues);
+
+                Assert.Equal(expected, text);
+            }
+        }
+
+        private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
+        {
+            // X    Y   Width   Letter  FontSize    Font
+            const string fromPdfBox = @"72	105	9.771912	T	21	ArialMT
+81.77106	105	8.897049	h	21	ArialMT
+90.66733	105	3.554138	i	21	ArialMT
+94.22115	105	7.998741	s	21	ArialMT
+102.2192	105	0		21	Gautami
+106.6634	105	0		21	Gautami
+106.6634	105	3.554131	i	21	ArialMT
+110.2173	105	7.998749	s	21	ArialMT
+118.2153	105	0		21	Gautami
+122.6595	105	0		21	Gautami
+122.6595	105	4.444618	t	21	ArialMT
+127.1038	105	8.897049	h	21	ArialMT
+136	105	8.897049	e	21	ArialMT
+144.8963	105	0		21	Gautami
+149.3405	105	0		21	Gautami
+149.3405	105	8.897049	d	21	ArialMT
+158.2368	105	8.897049	o	21	ArialMT
+167.1331	105	7.998749	c	21	ArialMT
+175.1311	105	8.897049	u	21	ArialMT
+184.0274	105	13.32605	m	21	ArialMT
+197.3523	105	8.897049	e	21	ArialMT
+206.2485	105	8.897049	n	21	ArialMT
+215.1448	105	4.444611	t	21	ArialMT
+219.5891	105	0		21	Gautami
+224.0333	105	0		21	Gautami
+224.0333	105	4.444611	t	21	ArialMT
+228.4775	105	3.554138	i	21	ArialMT
+232.0313	105	4.444611	t	21	ArialMT
+236.4756	105	3.554123	l	21	ArialMT
+240.0294	105	8.897049	e	21	ArialMT
+72	143.25	6.716187	T	14	ArialMT
+78.71446	143.25	6.114899	h	14	ArialMT
+84.8278	143.25	6.114891	e	14	ArialMT
+90.94113	143.25	3.661423	r	14	ArialMT
+94.60161	143.25	6.114899	e	14	ArialMT
+100.7149	143.25	0		14	Gautami
+103.7689	143.25	0		14	Gautami
+103.7689	143.25	2.442749	i	14	ArialMT
+106.211	143.25	5.497505	s	14	ArialMT
+111.7071	143.25	0		14	Gautami
+114.7611	143.25	0		14	Gautami
+114.7611	143.25	5.497505	s	14	ArialMT
+120.2572	143.25	6.114899	o	14	ArialMT
+126.3705	143.25	9.158928	m	14	ArialMT
+135.5271	143.25	6.114899	e	14	ArialMT
+141.6404	143.25	0		14	Gautami
+144.6944	143.25	0		14	Gautami
+144.6944	143.25	2.442749	l	14	ArialMT
+147.1365	143.25	6.114899	e	14	ArialMT
+153.2499	143.25	6.114899	d	14	ArialMT
+159.3632	143.25	6.114899	e	14	ArialMT
+165.4765	143.25	0		14	Gautami
+168.5305	143.25	0		14	Gautami
+168.5305	143.25	3.054749	t	14	ArialMT
+171.5845	143.25	6.114899	e	14	ArialMT
+177.6978	143.25	5.497498	x	14	ArialMT
+183.1939	143.25	3.054764	t	14	ArialMT
+186.2479	143.25	0		14	Gautami
+189.3019	143.25	0		14	Gautami
+189.3019	143.25	6.114899	h	14	ArialMT
+195.4152	143.25	6.114899	e	14	ArialMT
+201.5285	143.25	3.661423	r	14	ArialMT
+205.189	143.25	6.114899	e	14	ArialMT
+72	173.25	7.33358	A	14	ArialMT
+79.3317	173.25	6.114891	n	14	ArialMT
+85.44504	173.25	6.114891	d	14	ArialMT
+91.55836	173.25	0		14	Gautami
+94.61235	173.25	0		14	Gautami
+94.61235	173.25	3.054756	t	14	ArialMT
+97.66633	173.25	6.114899	h	14	ArialMT
+103.7797	173.25	6.114899	e	14	ArialMT
+109.893	173.25	6.114899	n	14	ArialMT
+116.0063	173.25	0		14	Gautami
+119.0603	173.25	0		14	Gautami
+119.0603	173.25	6.114899	a	14	ArialMT
+125.1736	173.25	6.114899	n	14	ArialMT
+131.287	173.25	6.114899	o	14	ArialMT
+137.4003	173.25	3.054749	t	14	ArialMT
+140.4543	173.25	6.114899	h	14	ArialMT
+146.5676	173.25	6.114899	e	14	ArialMT
+152.6809	173.25	3.661423	r	14	ArialMT
+156.3414	173.25	0		14	Gautami
+159.3954	173.25	0		14	Gautami
+159.3954	173.25	2.442749	l	14	ArialMT
+161.8375	173.25	2.442734	i	14	ArialMT
+164.2796	173.25	6.114899	n	14	ArialMT
+170.393	173.25	6.114899	e	14	ArialMT
+176.5063	173.25	0		14	Gautami
+179.5603	173.25	0		14	Gautami
+179.5603	173.25	6.114899	o	14	ArialMT
+185.6736	173.25	3.054764	f	14	ArialMT
+188.7276	173.25	0		14	Gautami
+191.7816	173.25	0		14	Gautami
+191.7816	173.25	3.054764	t	14	ArialMT
+194.8355	173.25	6.114899	e	14	ArialMT
+200.9489	173.25	5.497482	x	14	ArialMT
+206.445	173.25	3.054764	t	14	ArialMT
+209.499	173.25	3.054764	.	14	ArialMT";
+
+            // The verbatim literal inherits the line endings of the checked-out source file,
+            // so accept every newline convention rather than only CRLF.
+            return fromPdfBox.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.RemoveEmptyEntries)
+                .Select(AssertablePositionData.Parse)
+                .ToList();
+        }
+
+        private static IReadOnlyList<AssertablePositionData> GetOtherPositionData1()
+        {
+            // These do not include the font information
+            const string fromOther = @"72	105	9.758476	T	0	ArialMT
+81.77106	105	8.894608	h	0	ArialMT
+90.66733	105	3.551445	i	0	ArialMT
+94.22115	105	7.998749	s	0	ArialMT
+102.2192	105	4.431305	 	0	ArialMT
+102.2192	105	0		0	ArialMT
+106.6634	105	3.551445	i	0	ArialMT
+106.6634	105	0		0	ArialMT
+110.2173	105	7.998749	s	0	ArialMT
+118.2153	105	0		0	ArialMT
+118.2153	105	4.431305	 	0	ArialMT
+122.6595	105	4.431305	t	0	ArialMT
+122.6595	105	0		0	ArialMT
+127.1038	105	8.894608	h	0	ArialMT
+136	105	8.894608	e	0	ArialMT
+144.8963	105	4.431305	 	0	ArialMT
+144.8963	105	0		0	ArialMT
+149.3405	105	8.894608	d	0	ArialMT
+149.3405	105	0		0	ArialMT
+158.2368	105	8.894608	o	0	ArialMT
+167.1331	105	7.998749	c	0	ArialMT
+175.1311	105	8.894608	u	0	ArialMT
+184.0274	105	13.32591	m	0	ArialMT
+197.3523	105	8.894608	e	0	ArialMT
+206.2485	105	8.894608	n	0	ArialMT
+215.1448	105	4.431305	t	0	ArialMT
+219.5891	105	4.431305	 	0	ArialMT
+219.5891	105	0		0	ArialMT
+224.0333	105	4.431305	t	0	ArialMT
+224.0333	105	0		0	ArialMT
+228.4775	105	3.551453	i	0	ArialMT
+232.0313	105	4.431305	t	0	ArialMT
+236.4756	105	3.551453	l	0	ArialMT
+240.0294	105	8.894608	e	0	ArialMT
+248.918	105	4.431305	 	0	ArialMT
+72	128.25	3.045616	 	0	ArialMT
+72	143.25	6.706947	T	0	ArialMT
+78.71446	143.25	6.11322	h	0	ArialMT
+84.8278	143.25	6.11322	e	0	ArialMT
+90.94113	143.25	3.661331	r	0	ArialMT
+94.60161	143.25	6.11322	e	0	ArialMT
+100.7149	143.25	3.045616	 	0	ArialMT
+100.7149	143.25	0		0	ArialMT
+103.7689	143.25	2.440887	i	0	ArialMT
+103.7689	143.25	0		0	ArialMT
+106.211	143.25	5.497498	s	0	ArialMT
+111.7071	143.25	3.045616	 	0	ArialMT
+111.7071	143.25	0		0	ArialMT
+114.7611	143.25	0		0	ArialMT
+114.7611	143.25	5.497498	s	0	ArialMT
+120.2572	143.25	6.11322	o	0	ArialMT
+126.3705	143.25	9.158836	m	0	ArialMT
+135.5271	143.25	6.11322	e	0	ArialMT
+141.6404	143.25	0		0	ArialMT
+141.6404	143.25	3.045609	 	0	ArialMT
+144.6944	143.25	2.440887	l	0	ArialMT
+144.6944	143.25	0		0	ArialMT
+147.1365	143.25	6.11322	e	0	ArialMT
+153.2499	143.25	6.11322	d	0	ArialMT
+159.3632	143.25	6.11322	e	0	ArialMT
+165.4765	143.25	0		0	ArialMT
+165.4765	143.25	3.045609	 	0	ArialMT
+168.5305	143.25	3.045609	t	0	ArialMT
+168.5305	143.25	0		0	ArialMT
+171.5845	143.25	6.11322	e	0	ArialMT
+177.6978	143.25	5.497498	x	0	ArialMT
+183.1939	143.25	3.045609	t	0	ArialMT
+186.2479	143.25	0		0	ArialMT
+186.2479	143.25	3.045609	 	0	ArialMT
+189.3019	143.25	6.11322	h	0	ArialMT
+189.3019	143.25	0		0	ArialMT
+195.4152	143.25	6.11322	e	0	ArialMT
+201.5285	143.25	3.661331	r	0	ArialMT
+205.189	143.25	6.11322	e	0	ArialMT
+211.3008	143.25	3.045609	 	0	ArialMT
+72	158.25	3.045616	 	0	ArialMT
+72	173.25	7.32267	A	0	ArialMT
+79.3317	173.25	6.11322	n	0	ArialMT
+85.44504	173.25	6.11322	d	0	ArialMT
+91.55836	173.25	3.045616	 	0	ArialMT
+91.55836	173.25	0		0	ArialMT
+94.61235	173.25	0		0	ArialMT
+94.61235	173.25	3.045616	t	0	ArialMT
+97.66633	173.25	6.11322	h	0	ArialMT
+103.7797	173.25	6.11322	e	0	ArialMT
+109.893	173.25	6.11322	n	0	ArialMT
+116.0063	173.25	0		0	ArialMT
+116.0063	173.25	3.045616	 	0	ArialMT
+119.0603	173.25	6.11322	a	0	ArialMT
+119.0603	173.25	0		0	ArialMT
+125.1736	173.25	6.11322	n	0	ArialMT
+131.287	173.25	6.11322	o	0	ArialMT
+137.4003	173.25	3.045609	t	0	ArialMT
+140.4543	173.25	6.11322	h	0	ArialMT
+146.5676	173.25	6.11322	e	0	ArialMT
+152.6809	173.25	3.661331	r	0	ArialMT
+156.3414	173.25	3.045609	 	0	ArialMT
+156.3414	173.25	0		0	ArialMT
+159.3954	173.25	2.440887	l	0	ArialMT
+159.3954	173.25	0		0	ArialMT
+161.8375	173.25	2.440887	i	0	ArialMT
+164.2796	173.25	6.11322	n	0	ArialMT
+170.393	173.25	6.11322	e	0	ArialMT
+176.5063	173.25	3.045609	 	0	ArialMT
+176.5063	173.25	0		0	ArialMT
+179.5603	173.25	6.11322	o	0	ArialMT
+179.5603	173.25	0		0	ArialMT
+185.6736	173.25	3.045609	f	0	ArialMT
+188.7276	173.25	0		0	ArialMT
+188.7276	173.25	3.045609	 	0	ArialMT
+191.7816	173.25	3.045609	t	0	ArialMT
+191.7816	173.25	0		0	ArialMT
+194.8355	173.25	6.11322	e	0	ArialMT
+200.9489	173.25	5.497498	x	0	ArialMT
+206.445	173.25	3.045609	t	0	ArialMT
+209.499	173.25	3.045609	.	0	ArialMT
+212.543	173.25	3.045609	 	0	ArialMT";
+
+            // The verbatim literal inherits the line endings of the checked-out source file,
+            // so accept every newline convention rather than only CRLF.
+            return fromOther.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.RemoveEmptyEntries)
+                .Select(AssertablePositionData.Parse)
+                .ToList();
+        }
+    }
+}
--- a/src/UglyToad.Pdf/Content/Page.cs
+++ b/src/UglyToad.Pdf/Content/Page.cs
@ -10,11 +10,21 @@
        /// </summary>
        public int Number { get; }

-        public MediaBox MediaBox { get; }
+        internal MediaBox MediaBox { get; }

        internal PageContent Content { get; }

-        public IReadOnlyList<Letter> Text => Content?.Letters ?? new Letter[0];
+        public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
+
+        /// <summary>
+        /// Gets the width of the page in points.
+        /// </summary>
+        public decimal Width { get; }
+
+        /// <summary>
+        /// Gets the height of the page in points.
+        /// </summary>
+        public decimal Height { get; }

        internal Page(int number, MediaBox mediaBox, PageContent content)
        {
@ -26,6 +36,9 @@
            Number = number;
            MediaBox = mediaBox;
            Content = content;
+
+            Width = mediaBox.Bounds.Width;
+            Height = mediaBox.Bounds.Height;
        }
    }
 }
--- a/src/UglyToad.Pdf/Content/Pages.cs
+++ b/src/UglyToad.Pdf/Content/Pages.cs
@ -75,7 +75,7 @@

            if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary))
            {
-                throw new InvalidOperationException("Could not find the page with number: " + pageNumber);
+                // The single-string constructor treats its argument as the parameter name, so pass the name and the message separately.
+                throw new ArgumentOutOfRangeException(nameof(pageNumber), "Could not find the page with number: " + pageNumber);
            }

            var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);
--- a/src/UglyToad.Pdf/Parser/BaseParser.cs
+++ b/src/UglyToad.Pdf/Parser/BaseParser.cs
@ -1,941 +0,0 @@
-namespace UglyToad.Pdf.Parser
-{
-    using System;
-    using System.IO;
-    using System.Text;
-    using Cos;
-    using IO;
-    using Util;
-
-    /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-    /**
-     * This class is used to contain parsing logic that will be used by both the
-     * PDFParser and the COSStreamParser.
-     *
-     * @author Ben Litchfield
-     */
-    public abstract class BaseParser
-    {
-        private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L;
-
-        private static readonly long GENERATION_NUMBER_THRESHOLD = 65535;
-
-        static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString().Length;
-
-        /**
-         * Log instance.
-         */
-        protected static readonly int E = 'e';
-        protected static readonly int N = 'n';
-        protected static readonly int D = 'd';
-
-        protected static readonly int S = 's';
-        protected static readonly int T = 't';
-        protected static readonly int R = 'r';
-        protected static readonly int A = 'a';
-        protected static readonly int M = 'm';
-
-        protected static readonly int O = 'o';
-        protected static readonly int B = 'b';
-        protected static readonly int J = 'j';
-
-        /**
-         * This is a string constant that will be used for comparisons.
-         */
-        public static readonly string DEF = "def";
-        /**
-         * This is a string constant that will be used for comparisons.
-         */
-        protected static readonly string ENDOBJ_string = "endobj";
-        /**
-         * This is a string constant that will be used for comparisons.
-         */
-        protected static readonly string ENDSTREAM_string = "endstream";
-        /**
-         * This is a string constant that will be used for comparisons.
-         */
-        protected static readonly string STREAM_string = "stream";
-        /**
-         * This is a string constant that will be used for comparisons.
-         */
-        private static readonly string TRUE = "true";
-        /**
-         * This is a string constant that will be used for comparisons.
-         */
-        private static readonly string FALSE = "false";
-        /**
-         * This is a string constant that will be used for comparisons.
-         */
-        private static readonly string NULL = "null";
-
-        /**
-         * ASCII code for line feed.
-         */
-        protected static readonly byte ASCII_LF = 10;
-        /**
-         * ASCII code for carriage return.
-         */
-        protected static readonly byte ASCII_CR = 13;
-        private static readonly byte ASCII_ZERO = 48;
-        private static readonly byte ASCII_NINE = 57;
-        private static readonly byte ASCII_SPACE = 32;
-
-        /**
-         * This is the stream that will be read from.
-         */
-        protected readonly SequentialSource seqSource;
-
-        /**
-         * This is the document that will be parsed.
-         */
-        protected COSDocument document;
-
-        /**
-         * Default constructor.
-         */
-        public BaseParser(SequentialSource pdfSource)
-        {
-            this.seqSource = pdfSource;
-        }
-
-        private static bool isHexDigit(char ch)
-        {
-            return char.IsDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
-        }
-        
-        protected void skipWhiteSpaces()
-        {
-            //PDF Ref 3.2.7 A stream must be followed by either
-            //a CRLF or LF but nothing else.
-
-            int whitespace = seqSource.read();
-
-            //see brother_scan_cover.pdf, it adds whitespaces
-            //after the stream but before the start of the
-            //data, so just read those first
-            while (ASCII_SPACE == whitespace)
-            {
-                whitespace = seqSource.read();
-            }
-
-            if (ASCII_CR == whitespace)
-            {
-                whitespace = seqSource.read();
-                if (ASCII_LF != whitespace)
-                {
-                    seqSource.unread(whitespace);
-                    //The spec says this is invalid but it happens in the real
-                    //world so we must support it.
-                }
-            }
-            else if (ASCII_LF != whitespace)
-            {
-                //we are in an error.
-                //but again we will do a lenient parsing and just assume that everything
-                //is fine
-                seqSource.unread(whitespace);
-            }
-        }
-
-        /**
-         * This is really a bug in the Document creators code, but it caused a crash in PDFBox, the first bug was in this
-         * format: /Title ( (5) /Creator which was patched in 1 place.
-         *
-         * However it missed the case where the number of opening and closing parenthesis isn't balanced
-         *
-         * The second bug was in this format /Title (c:\) /Producer
-         *
-         * This patch moves this code out of the parseCOSstring method, so it can be used twice.
-         *
-         * @param bracesParameter the number of braces currently open.
-         *
-         * @return the corrected value of the brace counter
-         * @throws IOException
-         */
-        private int checkForEndOfstring(int bracesParameter)
-        {
-            int braces = bracesParameter;
-            byte[]
-            nextThreeBytes = new byte[3];
-            int amountRead = seqSource.read(nextThreeBytes);
-
-            // Check the next 3 bytes if available
-            // The following cases are valid indicators for the end of the string
-            // 1. Next line contains another COSObject: CR + LF + '/'
-            // 2. CosDictionary ends in the next line: CR + LF + '>'
-            // 3. Next line contains another COSObject: CR + '/'
-            // 4. CosDictionary ends in the next line: CR + '>'
-            if (amountRead == 3 && nextThreeBytes[0] == ASCII_CR)
-            {
-                if ((nextThreeBytes[1] == ASCII_LF && (nextThreeBytes[2] == '/') || nextThreeBytes[2] == '>')
-                || nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')
-                {
-                    braces = 0;
-                }
-            }
-            if (amountRead > 0)
-            {
-                seqSource.unread(nextThreeBytes, 0, amountRead);
-            }
-            return braces;
-        }
-
-        /**
-         * This will parse a PDF string.
-         *
-         * @return The parsed PDF string.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected CosString parseCOSstring()
-        {
-            char nextChar = (char)seqSource.read();
-            if (nextChar == '<')
-            {
-                return parseCOSHexstring();
-            }
-            else if (nextChar != '(')
-            {
-                throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
-                nextChar + "' " + seqSource);
-            }
-
-            var charLf = (char)ASCII_LF;
-
-            using (var memoryStream = new MemoryStream())
-            using (var writer = new StreamWriter(memoryStream))
-            {
-                // This is the number of braces read
-                int braces = 1;
-                int c = seqSource.read();
-                while (braces > 0 && c != -1)
-                {
-                    char ch = (char)c;
-                    int nextc = -2; // not yet read
-
-                    if (ch == ')')
-                    {
-
-                        braces--;
-                        braces = checkForEndOfstring(braces);
-                        if (braces != 0)
-                        {
-                            writer.Write(ch);
-                        }
-                    }
-                    else if (ch == '(')
-                    {
-                        braces++;
-                        writer.Write(ch);
-                    }
-                    else if (ch == '\\')
-                    {
-                        //patched by ram
-                        char next = (char)seqSource.read();
-                        switch (next)
-                        {
-                            case 'n':
-                                writer.Write('\n');
-                                break;
-                            case 'r':
-                                writer.Write('\r');
-                                break;
-                            case 't':
-                                writer.Write('\t');
-                                break;
-                            case 'b':
-                                writer.Write('\b');
-                                break;
-                            case 'f':
-                                writer.Write('\f');
-                                break;
-                            case ')':
-                                // PDFBox 276 /Title (c:\)
-                                braces = checkForEndOfstring(braces);
-                                if (braces != 0)
-                                {
-                                    writer.Write(next);
-                                }
-                                else
-                                {
-                                    writer.Write('\\');
-                                }
-                                break;
-                            case '(':
-                            case '\\':
-                                writer.Write(next);
-                                break;
-                                //case charLf:
-                                // case ASCII_CR:
-                                //this is a break in the line so ignore it and the newline and continue
-                                c = seqSource.read();
-                                while (isEOL(c) && c != -1)
-                                {
-                                    c = seqSource.read();
-                                }
-                                nextc = c;
-                                break;
-                            case '0':
-                            case '1':
-                            case '2':
-                            case '3':
-                            case '4':
-                            case '5':
-                            case '6':
-                            case '7':
-                                {
-                                    var octal = new StringBuilder();
-                                    octal.Append(next);
-                                    c = seqSource.read();
-                                    char digit = (char)c;
-                                    if (digit >= '0' && digit <= '7')
-                                    {
-                                        octal.Append(digit);
-                                        c = seqSource.read();
-                                        digit = (char)c;
-                                        if (digit >= '0' && digit <= '7')
-                                        {
-                                            octal.Append(digit);
-                                        }
-                                        else
-                                        {
-                                            nextc = c;
-                                        }
-                                    }
-                                    else
-                                    {
-                                        nextc = c;
-                                    }
-
-                                    int character = 0;
-                                    try
-                                    {
-                                        character = Convert.ToInt32(octal.ToString(), 8);
-                                    }
-                                    catch (FormatException e)
-                                    {
-                                        throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
-                                    }
-                                    writer.Write(character);
-                                    break;
-                                }
-                            default:
-
-                                // dropping the backslash
-                                // see 7.3.4.2 Literal strings for further information
-                                writer.Write(next);
-                                break;
-
-                        }
-                    }
-                    else
-                    {
-                        writer.Write(ch);
-                    }
-                    if (nextc != -2)
-                    {
-                        c = nextc;
-                    }
-                    else
-                    {
-                        c = seqSource.read();
-                    }
-                }
-                if (c != -1)
-                {
-                    seqSource.unread(c);
-                }
-                return new CosString(memoryStream.ToArray());
-            }
-
-        }
-
-        /**
-         * This will parse a PDF HEX string with fail fast semantic
-         * meaning that we stop if a not allowed character is found.
-         * This is necessary in order to detect malformed input and
-         * be able to skip to next object start.
-         *
-         * We assume the opening '<' has already been read.
-         * 
-         * @return The parsed PDF string.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        private CosString parseCOSHexstring()
-        {
-            // Only the hex digits found before the closing '>' are collected here.
-            var hexDigits = new StringBuilder();
-
-            for (;;)
-            {
-                int current = seqSource.read();
-
-                if (isHexDigit((char)current))
-                {
-                    hexDigits.Append((char)current);
-                    continue;
-                }
-
-                if (current == '>')
-                {
-                    break;
-                }
-
-                if (current < 0)
-                {
-                    throw new IOException("Missing closing bracket for hex string. Reached EOS.");
-                }
-
-                // Blanks (and backspace) between digits are ignored.
-                bool isIgnorable = current == ' ' || current == '\n' || current == '\t'
-                    || current == '\r' || current == '\b' || current == '\f';
-                if (isIgnorable)
-                {
-                    continue;
-                }
-
-                // Any other character is invalid: drop a trailing digit that is
-                // not part of a complete pair before giving up on the rest.
-                if (hexDigits.Length % 2 != 0)
-                {
-                    hexDigits.Length--;
-                }
-
-                // Consume input up to and including the closing bracket. Hitting the
-                // end of the stream first is an error, so malformed files cannot
-                // cause an endless loop.
-                while (true)
-                {
-                    current = seqSource.read();
-                    if (current == '>')
-                    {
-                        break;
-                    }
-
-                    if (current < 0)
-                    {
-                        throw new IOException("Missing closing bracket for hex string. Reached EOS.");
-                    }
-                }
-
-                break;
-            }
-
-            return CosString.ParseHex(hexDigits.ToString());
-        }
-        
-
-        /**
-         * Determine if a character terminates a PDF name.
-         *
-         * @param ch The character
-         * @return true if the character terminates a PDF name, otherwise false.
-         */
-        protected bool isEndOfName(int ch)
-        {
-            // Blank-like terminators: space, CR, LF, tab (9), NUL and form feed.
-            bool isBlank = ch == ASCII_SPACE || ch == ASCII_CR || ch == ASCII_LF
-                || ch == 9 || ch == 0 || ch == '\f';
-
-            // Characters that open or close another token.
-            bool isDelimiter = ch == '<' || ch == '>' || ch == '[' || ch == ']'
-                || ch == '(' || ch == ')' || ch == '/';
-
-            return isBlank || isDelimiter;
-        }
-        
-        /**
-         * Returns true if a byte sequence is valid UTF-8.
-         */
-        private bool isValidUTF8(byte[] input)
-        {
-            // A null buffer previously surfaced as a swallowed exception and a
-            // false result; keep that outcome but make it explicit.
-            if (input == null)
-            {
-                return false;
-            }
-
-            try
-            {
-                // The default UTF-8 decoder replaces malformed bytes with U+FFFD
-                // instead of failing, so it can never report invalid input. An
-                // encoding created with throwOnInvalidBytes = true raises a
-                // DecoderFallbackException for malformed or truncated sequences.
-                var strictUtf8 = new UTF8Encoding(false, true);
-                strictUtf8.GetCharCount(input, 0, input.Length);
-                return true;
-            }
-            catch (DecoderFallbackException)
-            {
-                return false;
-            }
-        }
-
-
-        /**
-         * This will parse a bool object from the stream.
-         *
-         * @return The parsed bool object.
-         *
-         * @throws IOException If an IO error occurs during parsing.
-         */
-        protected CosBoolean parsebool()
-        {
-            // Peek first so the keyword is consumed only once we know which
-            // literal ('true' or 'false') to expect.
-            char first = (char)seqSource.peek();
-
-            if (first == 't')
-            {
-                string literal = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4));
-                if (!literal.Equals(TRUE))
-                {
-                    throw new IOException("Error parsing bool: expected='true' actual='" + literal
-                    + "' at offset " + seqSource.getPosition());
-                }
-
-                return CosBoolean.True;
-            }
-
-            if (first == 'f')
-            {
-                string literal = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5));
-                if (!literal.Equals(FALSE))
-                {
-                    // This branch expects the 'false' keyword; the message used to say 'true'.
-                    throw new IOException("Error parsing bool: expected='false' actual='" + literal
-                    + "' at offset " + seqSource.getPosition());
-                }
-
-                return CosBoolean.False;
-            }
-
-            throw new IOException("Error parsing bool expected='t or f' actual='" + first
-            + "' at offset " + seqSource.getPosition());
-        }
-
-        /**
-         * This will read the next string from the stream.
-         *
-         * @return The string that was read from the stream, never null.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected string readstring()
-        {
-            SkipSpaces();
-
-            var token = new StringBuilder();
-            int current;
-            while (true)
-            {
-                current = seqSource.read();
-
-                // Stop at a name terminator or at the end of the stream.
-                if (isEndOfName((char)current) || current == -1)
-                {
-                    break;
-                }
-
-                token.Append((char)current);
-            }
-
-            // Push the terminator back so the caller sees it again.
-            if (current != -1)
-            {
-                seqSource.unread(current);
-            }
-
-            return token.ToString();
-        }
-
-        /**
-         * Read one string and throw an exception if it is not the expected value.
-         *
-         * @param expectedstring the string value that is expected.
-         * @throws IOException if the string char is not the expected value or if an
-         * I/O error occurs.
-         */
-        protected void readExpectedstring(string expectedstring)
-            => readExpectedstring(expectedstring, false); // delegates to the two-argument overload
-
-        /**
-         * Reads given pattern from {@link #seqSource}. Skipping whitespace at start and end if wanted.
-         * 
-         * @param expectedstring pattern to be skipped
-         * @param skipSpaces if set to true spaces before and after the string will be skipped
-         * @throws IOException if pattern could not be read
-         */
-        protected void readExpectedstring(string expectedstring, bool skipSpaces)
-        {
-            // NOTE(review): the skipSpaces flag is not consulted; whitespace is
-            // skipped before and after the pattern regardless of its value.
-            SkipSpaces();
-
-            for (int i = 0; i < expectedstring.Length; i++)
-            {
-                char expected = expectedstring[i];
-                if (seqSource.read() == expected)
-                {
-                    continue;
-                }
-
-                throw new IOException(
-                    $"Expected string '{expectedstring}' but missed at character '{expected}' at offset {seqSource.getPosition()}");
-            }
-
-            SkipSpaces();
-        }
-
-        /**
-         * Read one char and throw an exception if it is not the expected value.
-         *
-         * @param ec the char value that is expected.
-         * @throws IOException if the read char is not the expected value or if an
-         * I/O error occurs.
-         */
-        protected void readExpectedChar(char ec)
-        {
-            char actual = (char)seqSource.read();
-            if (actual == ec)
-            {
-                return;
-            }
-
-            throw new IOException($"expected='{ec}' actual='{actual}' at offset {seqSource.getPosition()}");
-        }
-
-        /**
-         * This will read the next string from the stream up to a certain length.
-         *
-         * @param length The length to stop reading at.
-         *
-         * @return The string that was read from the stream of length 0 to length.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected string readstring(int length)
-        {
-            SkipSpaces();
-
-            int current = seqSource.read();
-
-            // Size the builder to the requested cap rather than the default capacity.
-            var token = new StringBuilder(length);
-            while (true)
-            {
-                bool atBoundary = isWhitespace(current) || isClosing(current) || current == -1;
-                bool atCapacity = token.Length >= length;
-                bool atDelimiter = current == '[' || current == '<' || current == '(' || current == '/';
-                if (atBoundary || atCapacity || atDelimiter)
-                {
-                    break;
-                }
-
-                token.Append((char)current);
-                current = seqSource.read();
-            }
-
-            // Return the character that ended the token to the stream.
-            if (current != -1)
-            {
-                seqSource.unread(current);
-            }
-
-            return token.ToString();
-        }
-
-        /**
-         * This will tell if the next character is a closing bracket (the end of a PDF array).
-         *
-         * @return true if the next byte is ']', false otherwise.
-         *
-         * @throws IOException If an IO error occurs.
-         */
-        protected bool isClosing()
-            => isClosing(seqSource.peek()); // inspects the next byte without consuming it
-
-        /**
-         * This will tell if the given character is a closing bracket (the end of a PDF array).
-         *
-         * @param c The character to check against end of line
-         * @return true if the next byte is ']', false otherwise.
-         */
-        protected bool isClosing(int c)
-            => c == ']';
-
-        /**
-         * This will read bytes until the first end of line marker occurs.
-         * NOTE: The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes
-         * which is an important detail if one wants to unread the line.
-         *
-         * @return The characters between the current position and the end of the line.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected string readLine()
-        {
-            if (seqSource.isEOF())
-            {
-                throw new IOException("Error: End-of-File, expected line");
-            }
-
-            var line = new StringBuilder(11);
-
-            // Accumulate characters until an EOL byte (CR or LF) or the end of the stream.
-            int current = seqSource.read();
-            while (current != -1 && !isEOL(current))
-            {
-                line.Append((char)current);
-                current = seqSource.read();
-            }
-
-            // A CR immediately followed by LF counts as a single line ending,
-            // so swallow the LF as well.
-            if (isCR(current) && isLF(seqSource.peek()))
-            {
-                seqSource.read();
-            }
-
-            return line.ToString();
-        }
-
-        /**
-         * This will tell if the next byte to be read is an end of line byte.
-         *
-         * @return true if the next byte is 0x0A or 0x0D.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected bool isEOL()
-            => isEOL(seqSource.peek()); // peek only; nothing is consumed
-
-        /**
-         * This will tell if the next byte to be read is an end of line byte.
-         *
-         * @param c The character to check against end of line
-         * @return true if the next byte is 0x0A or 0x0D.
-         */
-        protected bool isEOL(int c)
-            => isLF(c) || isCR(c);
-
-        // True when the value equals the line feed constant.
-        private bool isLF(int c) => c == ASCII_LF;
-
-        // True when the value equals the carriage return constant.
-        private bool isCR(int c) => c == ASCII_CR;
-
-        /**
-         * This will tell if the next byte is whitespace or not.
-         *
-         * @return true if the next byte in the stream is a whitespace character.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected bool isWhitespace()
-            => isWhitespace(seqSource.peek()); // peek only; nothing is consumed
-
-        /**
-         * This will tell if a character is whitespace or not.  These values are
-         * specified in table 1 (page 12) of ISO 32000-1:2008.
-         * @param c The character to check against whitespace
-         * @return true if the character is a whitespace character.
-         */
-        protected bool isWhitespace(int c)
-        {
-            // NUL (0), horizontal tab (9) and form feed (12), written as character literals.
-            bool isControlBlank = c == '\0' || c == '\t' || c == '\f';
-
-            return isControlBlank || c == ASCII_LF || c == ASCII_CR || c == ASCII_SPACE;
-        }
-
-        /**
-         * This will tell if the next byte is a space or not.
-         *
-         * @return true if the next byte in the stream is a space character.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected bool isSpace()
-            => isSpace(seqSource.peek()); // peek only; nothing is consumed
-
-        /**
-         * This will tell if the given value is a space or not.
-         * 
-         * @param c The character to check against space
-         * @return true if the next byte in the stream is a space character.
-         */
-        protected bool isSpace(int c)
-            => c == ASCII_SPACE;
-
-        /**
-         * This will tell if the next byte is a digit or not.
-         *
-         * @return true if the next byte in the stream is a digit.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected bool isDigit()
-            => isDigit(seqSource.peek()); // peek only; nothing is consumed
-
-        /**
-         * This will tell if the given value is a digit or not.
-         * 
-         * @param c The character to be checked
-         * @return true if the next byte in the stream is a digit.
-         */
-        protected static bool isDigit(int c)
-            => ASCII_ZERO <= c && c <= ASCII_NINE; // inclusive range check
-
-        /**
-         * This will skip all spaces and comments that are present.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected void SkipSpaces()
-        {
-            // '%' (37) starts a comment that runs to the end of the line.
-            const int commentStart = '%';
-
-            int current = seqSource.read();
-            while (true)
-            {
-                if (current == commentStart)
-                {
-                    // Discard the comment body up to an EOL byte or the end of the stream.
-                    do
-                    {
-                        current = seqSource.read();
-                    }
-                    while (!isEOL(current) && current != -1);
-                }
-                else if (isWhitespace(current))
-                {
-                    current = seqSource.read();
-                }
-                else
-                {
-                    break;
-                }
-            }
-
-            // The first significant byte belongs to the caller; put it back.
-            if (current != -1)
-            {
-                seqSource.unread(current);
-            }
-        }
-
-        /**
-         * This will read a long from the Stream and throw an {@link IOException} if
-         * the long value is negative or has more than 10 digits (i.e. : bigger than
-         * {@link #OBJECT_NUMBER_THRESHOLD})
-         *
-         * @return the object number being read.
-         * @throws IOException if an I/O error occurs
-         */
-        protected long readObjectNumber()
-        {
-            long objectNumber = readLong();
-
-            bool isInRange = objectNumber >= 0 && objectNumber < OBJECT_NUMBER_THRESHOLD;
-            if (isInRange)
-            {
-                return objectNumber;
-            }
-
-            throw new IOException($"Object Number '{objectNumber}' has more than 10 digits or is negative");
-        }
-
-        /**
-         * This will read an integer from the Stream and throw an {@link IOException} if the integer value is negative
-         * or exceeds the maximum object revision (i.e. : bigger than {@link #GENERATION_NUMBER_THRESHOLD})
-         * @return the generation number being read.
-         * @throws IOException if an I/O error occurs
-         */
-        protected int readGenerationNumber()
-        {
-            int generation = readInt();
-
-            bool isInRange = generation >= 0 && generation <= GENERATION_NUMBER_THRESHOLD;
-            if (isInRange)
-            {
-                return generation;
-            }
-
-            throw new IOException($"Generation Number '{generation}' has more than 5 digits");
-        }
-
-        /**
-         * This will read an integer from the stream.
-         *
-         * @return The integer that was read from the stream.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected int readInt()
-        {
-            SkipSpaces();
-
-            string token = readstringNumber().ToString();
-
-            try
-            {
-                // PDF numbers are machine-readable, so parse them independently of the current culture.
-                return int.Parse(token, System.Globalization.CultureInfo.InvariantCulture);
-            }
-            catch (Exception e) when (e is FormatException || e is OverflowException)
-            {
-                // int.Parse reports out-of-range values with OverflowException rather than
-                // FormatException; both mean the token is not a usable integer. Push the
-                // token back before reporting the failure as an IOException.
-                seqSource.unread(OtherEncodings.StringAsLatin1Bytes(token));
-                throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e);
-            }
-        }
-
-
-        /**
-         * This will read a long from the stream.
-         *
-         * @return The long that was read from the stream.
-         *
-         * @throws IOException If there is an error reading from the stream.
-         */
-        protected long readLong()
-        {
-            SkipSpaces();
-
-            string token = readstringNumber().ToString();
-
-            try
-            {
-                // PDF numbers are machine-readable, so parse them independently of the current culture.
-                return long.Parse(token, System.Globalization.CultureInfo.InvariantCulture);
-            }
-            catch (Exception e) when (e is FormatException || e is OverflowException)
-            {
-                // long.Parse reports out-of-range values with OverflowException rather than
-                // FormatException; both mean the token is not a usable long. Push the
-                // token back before reporting the failure as an IOException.
-                seqSource.unread(OtherEncodings.StringAsLatin1Bytes(token));
-
-                throw new IOException(
-                    $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{token}\'", e);
-            }
-        }
-
-        /**
-         * This method is used to read a token by the {@linkplain #readInt()} method
-         * and the {@linkplain #readLong()} method.
-         *
-         * @return the token to parse as integer or long by the calling method.
-         * @throws IOException throws by the {@link #seqSource} methods.
-         */
-        protected StringBuilder readstringNumber()
-        {
-            var digits = new StringBuilder();
-            int current;
-
-            while (true)
-            {
-                current = seqSource.read();
-
-                // The token ends at a space, LF, CR, '<' (60), '[', '(', NUL or the end of the stream.
-                bool isTerminator = current == ASCII_SPACE
-                    || current == ASCII_LF
-                    || current == ASCII_CR
-                    || current == 60
-                    || current == '['
-                    || current == '('
-                    || current == 0
-                    || current == -1;
-                if (isTerminator)
-                {
-                    break;
-                }
-
-                digits.Append((char)current);
-
-                // Abort once the token grows past the configured maximum length.
-                if (digits.Length > MAX_LENGTH_LONG)
-                {
-                    throw new IOException("Number '" + digits +
-                    "' is getting too long, stop reading at offset " + seqSource.getPosition());
-                }
-            }
-
-            // Return the terminator to the stream for the next reader.
-            if (current != -1)
-            {
-                seqSource.unread(current);
-            }
-
-            return digits;
-        }
-    }
-
-}
--- a/src/UglyToad.Pdf/PdfDocument.cs
+++ b/src/UglyToad.Pdf/PdfDocument.cs
@ -25,10 +25,15 @@
        private readonly ParsingCachingProviders cachingProviders;

        [NotNull]
-        public Catalog Catalog { get; }
+        internal Catalog Catalog { get; }

        [NotNull]
-        public Pages Pages { get; }
+        internal Pages Pages { get; }
+
+        /// <summary>
+        /// Get the number of pages in this document.
+        /// </summary>
+        public int NumberOfPages
+        {
+            get { return Pages.Count; }
+        }

        internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
            bool isLenientParsing, 
@ -50,6 +55,16 @@
        public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);
        public static PdfDocument Open(string filename, ParsingOptions options = null) => PdfDocumentFactory.Open(filename, options);

+        /// <summary>
+        /// Get the page with the specified page number.
+        /// </summary>
+        /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
+        /// <returns>The page.</returns>
+        public Page GetPage(int pageNumber)
+            => Pages.GetPage(pageNumber); // lookup is delegated to the Pages object
+
        public void Dispose()
        {
            try