diff --git a/README.md b/README.md index a16009bd..ae12cf3e 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,40 @@ [![Build status](https://ci.appveyor.com/api/projects/status/ni7et2j2ml60pdi3?svg=true)](https://ci.appveyor.com/project/EliotJones/pdf) [![codecov](https://codecov.io/gh/UglyToad/Pdf/branch/master/graph/badge.svg)](https://codecov.io/gh/UglyToad/Pdf) -Convert the [PdfBox](https://github.com/apache/pdfbox) code to C#. +The aim of this project is to convert the [PdfBox](https://github.com/apache/pdfbox) code to C# in order to provide a properly open source (i.e. no copyleft) solution for inspecting PDF documents. The project uses the Apache 2.0 licence. + +## Status ## + +There is a lot left to do for this project. The initial minimum viable product, when released as an alpha, will provide: + ++ Page counts and sizes (in points) for a document. ++ Access to the text contents of each page. Note that since PDF has no concept of a "word", it will be up to the consumer of the text to work out where the words are within the text. ++ (Possible) The locations and bounds of each letter on the page. + +For the initial alpha release all files will be opened in full rather than streamed, so large files will not be supported. + +Eventually the library should support all existing PdfBox operations, such as accessing graphical elements and form elements, as well as creating PDF documents. + +## Usage ## + +The initial public API will be as limited as possible to allow extensive refactoring to take place. 
The proposed usage is as follows: + + using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf")) + { + int pageCount = document.NumberOfPages; + + Page page = document.GetPage(1); + + decimal widthInPoints = page.Width; + decimal heightInPoints = page.Height; + + string text = page.Text; + } + +The ```PdfDocument``` will also support opening from byte arrays (as well as streams eventually): + + byte[] fileBytes = File.ReadAllBytes(@"C:\my-file.pdf"); + using (PdfDocument document = PdfDocument.Open(fileBytes)) + { + int numberOfPages = document.NumberOfPages; + } diff --git a/src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs b/src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs new file mode 100644 index 00000000..ca767943 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs @@ -0,0 +1,43 @@ +namespace UglyToad.Pdf.Tests.Integration +{ + using System; + + public class AssertablePositionData + { + public decimal X { get; set; } + + public decimal Y { get; set; } + + public decimal Width { get; set; } + + public string Text { get; set; } + + public decimal FontSize { get; set; } + + public string FontName { get; set; } + + /// <summary> + /// Parses one tab-separated line (X, Y, Width, Text, FontSize, FontName) using the invariant culture so that '.' is always read as the decimal separator. + /// </summary> + /// <exception cref="ArgumentException">The line does not contain exactly 6 tab-separated parts.</exception> + public static AssertablePositionData Parse(string line) + { + var parts = line.Split('\t', StringSplitOptions.None); + + if (parts.Length != 6) + { + throw new ArgumentException($"Expected 6 parts to the line, instead got {parts.Length}"); + } + + return new AssertablePositionData + { + X = decimal.Parse(parts[0], System.Globalization.CultureInfo.InvariantCulture), + Y = decimal.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture), + Width = decimal.Parse(parts[2], System.Globalization.CultureInfo.InvariantCulture), + Text = parts[3], + FontSize = decimal.Parse(parts[4], System.Globalization.CultureInfo.InvariantCulture), + FontName = parts[5] + }; + } + } +} \ No newline at end of file diff --git a/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs b/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs index 45a06ae4..d8d3c189 100644 --- a/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs +++ b/src/UglyToad.Pdf.Tests/Integration/PdfParserTests.cs @@ -43,25 +43,6 @@ public 
class PdfParserTests { - [Fact] - public void CanParseSimpleGoogleDocsDocument() - { - // To see the text as shown in Visual Studio or Notepad++, use the OtherEncodings.BytesAsLatin1String() - var file = GetNthFilename(); - - using (var document = PdfDocument.Open(File.ReadAllBytes(file))) - { - Assert.Equal(1, document.Pages.Count); - - var page = document.Pages.GetPage(1); - Assert.Equal(1, page.Number); - - var text = string.Join(string.Empty, page.Content.Letters.Select(x => x.Value)).Replace("\u200B", string.Empty); - - Assert.Equal("This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty), text.Replace(" ", string.Empty)); - } - } - [Fact] public void CanDecompressNormalObjectStream() { diff --git a/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs b/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs new file mode 100644 index 00000000..a2e30c16 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs @@ -0,0 +1,328 @@ +// ReSharper disable AccessToDisposedClosure +namespace UglyToad.Pdf.Tests.Integration +{ + using System; + using System.Collections.Generic; + using System.IO; + using System.Linq; + using Xunit; + + public class SinglePageSimpleTests + { + private static readonly HashSet IgnoredHiddenCharacters = new HashSet + { + "\u200B" + }; + + private static string GetFilename() + { + var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")); + + return Path.Combine(documentFolder, "Single Page Simple - from google drive.pdf"); + } + + [Fact] + public void HasCorrectNumberOfPages() + { + var file = GetFilename(); + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { + Assert.Equal(1, document.NumberOfPages); + } + } + + [Fact] + public void CanAccessPage() + { + var file = GetFilename(); + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { 
+ var page = document.GetPage(1); + + Assert.NotNull(page); + + Assert.Equal(1, page.Number); + } + } + + [Fact] + public void AccessPageLowerThanOneThrows() + { + var file = GetFilename(); + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { + Action action = () => document.GetPage(0); + + Assert.Throws(action); + } + } + + [Fact] + public void PageHasCorrectDimensions() + { + var file = GetFilename(); + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { + var page = document.GetPage(1); + + Assert.Equal(612, page.Width); + Assert.Equal(792, page.Height); + } + } + + [Fact] + public void PageHasCorrectTextIgnoringHiddenCharacters() + { + var file = GetFilename(); + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { + var page = document.GetPage(1); + + var text = string.Join(string.Empty, page.Letters.Select(x => x.Value).Where(x => !IgnoredHiddenCharacters.Contains(x))); + + const string expected = + "This is the document title There is some lede text here And then another line of text. 
"; + + Assert.Equal(expected, text); + } + } + + private static IReadOnlyList GetPdfBoxPositionData() + { + // X Y Width Letter FontSize Font + const string fromPdfBox = @"72 105 9.771912 T 21 ArialMT +81.77106 105 8.897049 h 21 ArialMT +90.66733 105 3.554138 i 21 ArialMT +94.22115 105 7.998741 s 21 ArialMT +102.2192 105 0 ​ 21 Gautami +106.6634 105 0 ​ 21 Gautami +106.6634 105 3.554131 i 21 ArialMT +110.2173 105 7.998749 s 21 ArialMT +118.2153 105 0 ​ 21 Gautami +122.6595 105 0 ​ 21 Gautami +122.6595 105 4.444618 t 21 ArialMT +127.1038 105 8.897049 h 21 ArialMT +136 105 8.897049 e 21 ArialMT +144.8963 105 0 ​ 21 Gautami +149.3405 105 0 ​ 21 Gautami +149.3405 105 8.897049 d 21 ArialMT +158.2368 105 8.897049 o 21 ArialMT +167.1331 105 7.998749 c 21 ArialMT +175.1311 105 8.897049 u 21 ArialMT +184.0274 105 13.32605 m 21 ArialMT +197.3523 105 8.897049 e 21 ArialMT +206.2485 105 8.897049 n 21 ArialMT +215.1448 105 4.444611 t 21 ArialMT +219.5891 105 0 ​ 21 Gautami +224.0333 105 0 ​ 21 Gautami +224.0333 105 4.444611 t 21 ArialMT +228.4775 105 3.554138 i 21 ArialMT +232.0313 105 4.444611 t 21 ArialMT +236.4756 105 3.554123 l 21 ArialMT +240.0294 105 8.897049 e 21 ArialMT +72 143.25 6.716187 T 14 ArialMT +78.71446 143.25 6.114899 h 14 ArialMT +84.8278 143.25 6.114891 e 14 ArialMT +90.94113 143.25 3.661423 r 14 ArialMT +94.60161 143.25 6.114899 e 14 ArialMT +100.7149 143.25 0 ​ 14 Gautami +103.7689 143.25 0 ​ 14 Gautami +103.7689 143.25 2.442749 i 14 ArialMT +106.211 143.25 5.497505 s 14 ArialMT +111.7071 143.25 0 ​ 14 Gautami +114.7611 143.25 0 ​ 14 Gautami +114.7611 143.25 5.497505 s 14 ArialMT +120.2572 143.25 6.114899 o 14 ArialMT +126.3705 143.25 9.158928 m 14 ArialMT +135.5271 143.25 6.114899 e 14 ArialMT +141.6404 143.25 0 ​ 14 Gautami +144.6944 143.25 0 ​ 14 Gautami +144.6944 143.25 2.442749 l 14 ArialMT +147.1365 143.25 6.114899 e 14 ArialMT +153.2499 143.25 6.114899 d 14 ArialMT +159.3632 143.25 6.114899 e 14 ArialMT +165.4765 143.25 0 ​ 14 Gautami +168.5305 
143.25 0 ​ 14 Gautami +168.5305 143.25 3.054749 t 14 ArialMT +171.5845 143.25 6.114899 e 14 ArialMT +177.6978 143.25 5.497498 x 14 ArialMT +183.1939 143.25 3.054764 t 14 ArialMT +186.2479 143.25 0 ​ 14 Gautami +189.3019 143.25 0 ​ 14 Gautami +189.3019 143.25 6.114899 h 14 ArialMT +195.4152 143.25 6.114899 e 14 ArialMT +201.5285 143.25 3.661423 r 14 ArialMT +205.189 143.25 6.114899 e 14 ArialMT +72 173.25 7.33358 A 14 ArialMT +79.3317 173.25 6.114891 n 14 ArialMT +85.44504 173.25 6.114891 d 14 ArialMT +91.55836 173.25 0 ​ 14 Gautami +94.61235 173.25 0 ​ 14 Gautami +94.61235 173.25 3.054756 t 14 ArialMT +97.66633 173.25 6.114899 h 14 ArialMT +103.7797 173.25 6.114899 e 14 ArialMT +109.893 173.25 6.114899 n 14 ArialMT +116.0063 173.25 0 ​ 14 Gautami +119.0603 173.25 0 ​ 14 Gautami +119.0603 173.25 6.114899 a 14 ArialMT +125.1736 173.25 6.114899 n 14 ArialMT +131.287 173.25 6.114899 o 14 ArialMT +137.4003 173.25 3.054749 t 14 ArialMT +140.4543 173.25 6.114899 h 14 ArialMT +146.5676 173.25 6.114899 e 14 ArialMT +152.6809 173.25 3.661423 r 14 ArialMT +156.3414 173.25 0 ​ 14 Gautami +159.3954 173.25 0 ​ 14 Gautami +159.3954 173.25 2.442749 l 14 ArialMT +161.8375 173.25 2.442734 i 14 ArialMT +164.2796 173.25 6.114899 n 14 ArialMT +170.393 173.25 6.114899 e 14 ArialMT +176.5063 173.25 0 ​ 14 Gautami +179.5603 173.25 0 ​ 14 Gautami +179.5603 173.25 6.114899 o 14 ArialMT +185.6736 173.25 3.054764 f 14 ArialMT +188.7276 173.25 0 ​ 14 Gautami +191.7816 173.25 0 ​ 14 Gautami +191.7816 173.25 3.054764 t 14 ArialMT +194.8355 173.25 6.114899 e 14 ArialMT +200.9489 173.25 5.497482 x 14 ArialMT +206.445 173.25 3.054764 t 14 ArialMT +209.499 173.25 3.054764 . 
14 ArialMT"; + + return fromPdfBox.Split("\r\n", StringSplitOptions.RemoveEmptyEntries) + .Select(AssertablePositionData.Parse) + .ToList(); + } + + private static IReadOnlyList GetOtherPositionData1() + { + // These do not include the font information + const string fromOther = @"72 105 9.758476 T 0 ArialMT +81.77106 105 8.894608 h 0 ArialMT +90.66733 105 3.551445 i 0 ArialMT +94.22115 105 7.998749 s 0 ArialMT +102.2192 105 4.431305 0 ArialMT +102.2192 105 0 ​ 0 ArialMT +106.6634 105 3.551445 i 0 ArialMT +106.6634 105 0 ​ 0 ArialMT +110.2173 105 7.998749 s 0 ArialMT +118.2153 105 0 ​ 0 ArialMT +118.2153 105 4.431305 0 ArialMT +122.6595 105 4.431305 t 0 ArialMT +122.6595 105 0 ​ 0 ArialMT +127.1038 105 8.894608 h 0 ArialMT +136 105 8.894608 e 0 ArialMT +144.8963 105 4.431305 0 ArialMT +144.8963 105 0 ​ 0 ArialMT +149.3405 105 8.894608 d 0 ArialMT +149.3405 105 0 ​ 0 ArialMT +158.2368 105 8.894608 o 0 ArialMT +167.1331 105 7.998749 c 0 ArialMT +175.1311 105 8.894608 u 0 ArialMT +184.0274 105 13.32591 m 0 ArialMT +197.3523 105 8.894608 e 0 ArialMT +206.2485 105 8.894608 n 0 ArialMT +215.1448 105 4.431305 t 0 ArialMT +219.5891 105 4.431305 0 ArialMT +219.5891 105 0 ​ 0 ArialMT +224.0333 105 4.431305 t 0 ArialMT +224.0333 105 0 ​ 0 ArialMT +228.4775 105 3.551453 i 0 ArialMT +232.0313 105 4.431305 t 0 ArialMT +236.4756 105 3.551453 l 0 ArialMT +240.0294 105 8.894608 e 0 ArialMT +248.918 105 4.431305 0 ArialMT +72 128.25 3.045616 0 ArialMT +72 143.25 6.706947 T 0 ArialMT +78.71446 143.25 6.11322 h 0 ArialMT +84.8278 143.25 6.11322 e 0 ArialMT +90.94113 143.25 3.661331 r 0 ArialMT +94.60161 143.25 6.11322 e 0 ArialMT +100.7149 143.25 3.045616 0 ArialMT +100.7149 143.25 0 ​ 0 ArialMT +103.7689 143.25 2.440887 i 0 ArialMT +103.7689 143.25 0 ​ 0 ArialMT +106.211 143.25 5.497498 s 0 ArialMT +111.7071 143.25 3.045616 0 ArialMT +111.7071 143.25 0 ​ 0 ArialMT +114.7611 143.25 0 ​ 0 ArialMT +114.7611 143.25 5.497498 s 0 ArialMT +120.2572 143.25 6.11322 o 0 ArialMT +126.3705 
143.25 9.158836 m 0 ArialMT +135.5271 143.25 6.11322 e 0 ArialMT +141.6404 143.25 0 ​ 0 ArialMT +141.6404 143.25 3.045609 0 ArialMT +144.6944 143.25 2.440887 l 0 ArialMT +144.6944 143.25 0 ​ 0 ArialMT +147.1365 143.25 6.11322 e 0 ArialMT +153.2499 143.25 6.11322 d 0 ArialMT +159.3632 143.25 6.11322 e 0 ArialMT +165.4765 143.25 0 ​ 0 ArialMT +165.4765 143.25 3.045609 0 ArialMT +168.5305 143.25 3.045609 t 0 ArialMT +168.5305 143.25 0 ​ 0 ArialMT +171.5845 143.25 6.11322 e 0 ArialMT +177.6978 143.25 5.497498 x 0 ArialMT +183.1939 143.25 3.045609 t 0 ArialMT +186.2479 143.25 0 ​ 0 ArialMT +186.2479 143.25 3.045609 0 ArialMT +189.3019 143.25 6.11322 h 0 ArialMT +189.3019 143.25 0 ​ 0 ArialMT +195.4152 143.25 6.11322 e 0 ArialMT +201.5285 143.25 3.661331 r 0 ArialMT +205.189 143.25 6.11322 e 0 ArialMT +211.3008 143.25 3.045609 0 ArialMT +72 158.25 3.045616 0 ArialMT +72 173.25 7.32267 A 0 ArialMT +79.3317 173.25 6.11322 n 0 ArialMT +85.44504 173.25 6.11322 d 0 ArialMT +91.55836 173.25 3.045616 0 ArialMT +91.55836 173.25 0 ​ 0 ArialMT +94.61235 173.25 0 ​ 0 ArialMT +94.61235 173.25 3.045616 t 0 ArialMT +97.66633 173.25 6.11322 h 0 ArialMT +103.7797 173.25 6.11322 e 0 ArialMT +109.893 173.25 6.11322 n 0 ArialMT +116.0063 173.25 0 ​ 0 ArialMT +116.0063 173.25 3.045616 0 ArialMT +119.0603 173.25 6.11322 a 0 ArialMT +119.0603 173.25 0 ​ 0 ArialMT +125.1736 173.25 6.11322 n 0 ArialMT +131.287 173.25 6.11322 o 0 ArialMT +137.4003 173.25 3.045609 t 0 ArialMT +140.4543 173.25 6.11322 h 0 ArialMT +146.5676 173.25 6.11322 e 0 ArialMT +152.6809 173.25 3.661331 r 0 ArialMT +156.3414 173.25 3.045609 0 ArialMT +156.3414 173.25 0 ​ 0 ArialMT +159.3954 173.25 2.440887 l 0 ArialMT +159.3954 173.25 0 ​ 0 ArialMT +161.8375 173.25 2.440887 i 0 ArialMT +164.2796 173.25 6.11322 n 0 ArialMT +170.393 173.25 6.11322 e 0 ArialMT +176.5063 173.25 3.045609 0 ArialMT +176.5063 173.25 0 ​ 0 ArialMT +179.5603 173.25 6.11322 o 0 ArialMT +179.5603 173.25 0 ​ 0 ArialMT +185.6736 173.25 3.045609 f 0 
ArialMT +188.7276 173.25 0 ​ 0 ArialMT +188.7276 173.25 3.045609 0 ArialMT +191.7816 173.25 3.045609 t 0 ArialMT +191.7816 173.25 0 ​ 0 ArialMT +194.8355 173.25 6.11322 e 0 ArialMT +200.9489 173.25 5.497498 x 0 ArialMT +206.445 173.25 3.045609 t 0 ArialMT +209.499 173.25 3.045609 . 0 ArialMT +212.543 173.25 3.045609 0 ArialMT"; + + return fromOther.Split("\r\n", StringSplitOptions.RemoveEmptyEntries) + .Select(AssertablePositionData.Parse) + .ToList(); + } + } +} diff --git a/src/UglyToad.Pdf/Content/Page.cs b/src/UglyToad.Pdf/Content/Page.cs index 7efc848b..bc098d89 100644 --- a/src/UglyToad.Pdf/Content/Page.cs +++ b/src/UglyToad.Pdf/Content/Page.cs @@ -10,11 +10,21 @@ /// public int Number { get; } - public MediaBox MediaBox { get; } + internal MediaBox MediaBox { get; } internal PageContent Content { get; } - public IReadOnlyList Text => Content?.Letters ?? new Letter[0]; + public IReadOnlyList Letters => Content?.Letters ?? new Letter[0]; + + /// + /// Gets the width of the page in points. + /// + public decimal Width { get; } + + /// + /// Gets the height of the page in points. 
+ /// + public decimal Height { get; } internal Page(int number, MediaBox mediaBox, PageContent content) { @@ -26,6 +36,9 @@ Number = number; MediaBox = mediaBox; Content = content; + + Width = mediaBox.Bounds.Width; + Height = mediaBox.Bounds.Height; } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Content/Pages.cs b/src/UglyToad.Pdf/Content/Pages.cs index f61939a2..6d71566b 100644 --- a/src/UglyToad.Pdf/Content/Pages.cs +++ b/src/UglyToad.Pdf/Content/Pages.cs @@ -75,7 +75,8 @@ if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary)) { - throw new InvalidOperationException("Could not find the page with number: " + pageNumber); + // Pass the parameter name explicitly: the single-string overload would treat the message as the parameter name. + throw new ArgumentOutOfRangeException(nameof(pageNumber), "Could not find the page with number: " + pageNumber); } var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing); diff --git a/src/UglyToad.Pdf/Parser/BaseParser.cs b/src/UglyToad.Pdf/Parser/BaseParser.cs deleted file mode 100644 index ccc88311..00000000 --- a/src/UglyToad.Pdf/Parser/BaseParser.cs +++ /dev/null @@ -1,941 +0,0 @@ -namespace UglyToad.Pdf.Parser -{ - using System; - using System.IO; - using System.Text; - using Cos; - using IO; - using Util; - - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /** - * This class is used to contain parsing logic that will be used by both the - * PDFParser and the COSStreamParser. - * - * @author Ben Litchfield - */ - public abstract class BaseParser - { - private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L; - - private static readonly long GENERATION_NUMBER_THRESHOLD = 65535; - - static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString().Length; - - /** - * Log instance. - */ - protected static readonly int E = 'e'; - protected static readonly int N = 'n'; - protected static readonly int D = 'd'; - - protected static readonly int S = 's'; - protected static readonly int T = 't'; - protected static readonly int R = 'r'; - protected static readonly int A = 'a'; - protected static readonly int M = 'm'; - - protected static readonly int O = 'o'; - protected static readonly int B = 'b'; - protected static readonly int J = 'j'; - - /** - * This is a string constant that will be used for comparisons. - */ - public static readonly string DEF = "def"; - /** - * This is a string constant that will be used for comparisons. - */ - protected static readonly string ENDOBJ_string = "endobj"; - /** - * This is a string constant that will be used for comparisons. - */ - protected static readonly string ENDSTREAM_string = "endstream"; - /** - * This is a string constant that will be used for comparisons. - */ - protected static readonly string STREAM_string = "stream"; - /** - * This is a string constant that will be used for comparisons. - */ - private static readonly string TRUE = "true"; - /** - * This is a string constant that will be used for comparisons. - */ - private static readonly string FALSE = "false"; - /** - * This is a string constant that will be used for comparisons. - */ - private static readonly string NULL = "null"; - - /** - * ASCII code for line feed. 
- */ - protected static readonly byte ASCII_LF = 10; - /** - * ASCII code for carriage return. - */ - protected static readonly byte ASCII_CR = 13; - private static readonly byte ASCII_ZERO = 48; - private static readonly byte ASCII_NINE = 57; - private static readonly byte ASCII_SPACE = 32; - - /** - * This is the stream that will be read from. - */ - protected readonly SequentialSource seqSource; - - /** - * This is the document that will be parsed. - */ - protected COSDocument document; - - /** - * Default constructor. - */ - public BaseParser(SequentialSource pdfSource) - { - this.seqSource = pdfSource; - } - - private static bool isHexDigit(char ch) - { - return char.IsDigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); - } - - protected void skipWhiteSpaces() - { - //PDF Ref 3.2.7 A stream must be followed by either - //a CRLF or LF but nothing else. - - int whitespace = seqSource.read(); - - //see brother_scan_cover.pdf, it adds whitespaces - //after the stream but before the start of the - //data, so just read those first - while (ASCII_SPACE == whitespace) - { - whitespace = seqSource.read(); - } - - if (ASCII_CR == whitespace) - { - whitespace = seqSource.read(); - if (ASCII_LF != whitespace) - { - seqSource.unread(whitespace); - //The spec says this is invalid but it happens in the real - //world so we must support it. - } - } - else if (ASCII_LF != whitespace) - { - //we are in an error. - //but again we will do a lenient parsing and just assume that everything - //is fine - seqSource.unread(whitespace); - } - } - - /** - * This is really a bug in the Document creators code, but it caused a crash in PDFBox, the first bug was in this - * format: /Title ( (5) /Creator which was patched in 1 place. 
- * - * However it missed the case where the number of opening and closing parenthesis isn't balanced - * - * The second bug was in this format /Title (c:\) /Producer - * - * This patch moves this code out of the parseCOSstring method, so it can be used twice. - * - * @param bracesParameter the number of braces currently open. - * - * @return the corrected value of the brace counter - * @throws IOException - */ - private int checkForEndOfstring(int bracesParameter) - { - int braces = bracesParameter; - byte[] - nextThreeBytes = new byte[3]; - int amountRead = seqSource.read(nextThreeBytes); - - // Check the next 3 bytes if available - // The following cases are valid indicators for the end of the string - // 1. Next line contains another COSObject: CR + LF + '/' - // 2. CosDictionary ends in the next line: CR + LF + '>' - // 3. Next line contains another COSObject: CR + '/' - // 4. CosDictionary ends in the next line: CR + '>' - if (amountRead == 3 && nextThreeBytes[0] == ASCII_CR) - { - if ((nextThreeBytes[1] == ASCII_LF && (nextThreeBytes[2] == '/') || nextThreeBytes[2] == '>') - || nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>') - { - braces = 0; - } - } - if (amountRead > 0) - { - seqSource.unread(nextThreeBytes, 0, amountRead); - } - return braces; - } - - /** - * This will parse a PDF string. - * - * @return The parsed PDF string. - * - * @throws IOException If there is an error reading from the stream. 
- */ - protected CosString parseCOSstring() - { - char nextChar = (char)seqSource.read(); - if (nextChar == '<') - { - return parseCOSHexstring(); - } - else if (nextChar != '(') - { - throw new IOException("parseCOSstring string should start with '(' or '<' and not '" + - nextChar + "' " + seqSource); - } - - var charLf = (char)ASCII_LF; - - using (var memoryStream = new MemoryStream()) - using (var writer = new StreamWriter(memoryStream)) - { - // This is the number of braces read - int braces = 1; - int c = seqSource.read(); - while (braces > 0 && c != -1) - { - char ch = (char)c; - int nextc = -2; // not yet read - - if (ch == ')') - { - - braces--; - braces = checkForEndOfstring(braces); - if (braces != 0) - { - writer.Write(ch); - } - } - else if (ch == '(') - { - braces++; - writer.Write(ch); - } - else if (ch == '\\') - { - //patched by ram - char next = (char)seqSource.read(); - switch (next) - { - case 'n': - writer.Write('\n'); - break; - case 'r': - writer.Write('\r'); - break; - case 't': - writer.Write('\t'); - break; - case 'b': - writer.Write('\b'); - break; - case 'f': - writer.Write('\f'); - break; - case ')': - // PDFBox 276 /Title (c:\) - braces = checkForEndOfstring(braces); - if (braces != 0) - { - writer.Write(next); - } - else - { - writer.Write('\\'); - } - break; - case '(': - case '\\': - writer.Write(next); - break; - //case charLf: - // case ASCII_CR: - //this is a break in the line so ignore it and the newline and continue - c = seqSource.read(); - while (isEOL(c) && c != -1) - { - c = seqSource.read(); - } - nextc = c; - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - { - var octal = new StringBuilder(); - octal.Append(next); - c = seqSource.read(); - char digit = (char)c; - if (digit >= '0' && digit <= '7') - { - octal.Append(digit); - c = seqSource.read(); - digit = (char)c; - if (digit >= '0' && digit <= '7') - { - octal.Append(digit); - } - else - { - nextc = c; - } - } - 
else - { - nextc = c; - } - - int character = 0; - try - { - character = Convert.ToInt32(octal.ToString(), 8); - } - catch (FormatException e) - { - throw new IOException("Error: Expected octal character, actual='" + octal + "'", e); - } - writer.Write(character); - break; - } - default: - - // dropping the backslash - // see 7.3.4.2 Literal strings for further information - writer.Write(next); - break; - - } - } - else - { - writer.Write(ch); - } - if (nextc != -2) - { - c = nextc; - } - else - { - c = seqSource.read(); - } - } - if (c != -1) - { - seqSource.unread(c); - } - return new CosString(memoryStream.ToArray()); - } - - } - - /** - * This will parse a PDF HEX string with fail fast semantic - * meaning that we stop if a not allowed character is found. - * This is necessary in order to detect malformed input and - * be able to skip to next object start. - * - * We assume starting '<' was already read. - * - * @return The parsed PDF string. - * - * @throws IOException If there is an error reading from the stream. - */ - private CosString parseCOSHexstring() - { - var sBuf = new StringBuilder(); - while (true) - { - int c = seqSource.read(); - if (isHexDigit((char)c)) - { - sBuf.Append((char)c); - } - else if (c == '>') - { - break; - } - else if (c < 0) - { - throw new IOException("Missing closing bracket for hex string. Reached EOS."); - } - else if ((c == ' ') || (c == '\n') || - (c == '\t') || (c == '\r') || - (c == '\b') || (c == '\f')) - { - continue; - } - else - { - // if invalid chars was found: discard last - // hex character if it is not part of a pair - if (sBuf.Length % 2 != 0) - { - sBuf.Remove(sBuf.Length - 1, 1); - } - - // read till the closing bracket was found - do - { - c = seqSource.read(); - } - while (c != '>' && c >= 0); - - // might have reached EOF while looking for the closing bracket - // this can happen for malformed PDFs only. Make sure that there is - // no endless loop. 
- if (c < 0) - { - throw new IOException("Missing closing bracket for hex string. Reached EOS."); - } - - // exit loop - break; - } - } - return CosString.ParseHex(sBuf.ToString()); - } - - - /** - * Determine if a character terminates a PDF name. - * - * @param ch The character - * @return true if the character terminates a PDF name, otherwise false. - */ - protected bool isEndOfName(int ch) - { - return ch == ASCII_SPACE || ch == ASCII_CR || ch == ASCII_LF || ch == 9 || ch == '>' || - ch == '<' || ch == '[' || ch == '/' || ch == ']' || ch == ')' || ch == '(' || - ch == 0 || ch == '\f'; - } - - /** - * Returns true if a byte sequence is valid UTF-8. - */ - private bool isValidUTF8(byte[] input) - { - try - { - Decoder d = Encoding.UTF8.GetDecoder(); - var charLength = d.GetCharCount(input, 0, input.Length); - var chars = new char[charLength]; - d.Convert(input, 0, input.Length, chars, 0, charLength, true, out _, out _, out _); - return true; - } - catch (Exception e) - { - return false; - } - } - - - /** - * This will parse a bool object from the stream. - * - * @return The parsed bool object. - * - * @throws IOException If an IO error occurs during parsing. 
- */ - protected CosBoolean parsebool() - { - CosBoolean retval = null; - char c = (char)seqSource.peek(); - if (c == 't') - { - string truestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4)); - if (!truestring.Equals(TRUE)) - { - throw new IOException("Error parsing bool: expected='true' actual='" + truestring - + "' at offset " + seqSource.getPosition()); - } - else - { - retval = CosBoolean.True; - } - } - else if (c == 'f') - { - string falsestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5)); - if (!falsestring.Equals(FALSE)) - { - throw new IOException("Error parsing bool: expected='true' actual='" + falsestring - + "' at offset " + seqSource.getPosition()); - } - else - { - retval = CosBoolean.False; - } - } - else - { - throw new IOException("Error parsing bool expected='t or f' actual='" + c - + "' at offset " + seqSource.getPosition()); - } - return retval; - } - - /** - * This will read the next string from the stream. - * - * @return The string that was read from the stream, never null. - * - * @throws IOException If there is an error reading from the stream. - */ - protected string readstring() - { - SkipSpaces(); - StringBuilder buffer = new StringBuilder(); - int c = seqSource.read(); - while (!isEndOfName((char)c) && c != -1) - { - buffer.Append((char)c); - c = seqSource.read(); - } - if (c != -1) - { - seqSource.unread(c); - } - return buffer.ToString(); - } - - /** - * Read one string and throw an exception if it is not the expected value. - * - * @param expectedstring the string value that is expected. - * @throws IOException if the string char is not the expected value or if an - * I/O error occurs. - */ - protected void readExpectedstring(string expectedstring) - { - readExpectedstring(expectedstring, false); - } - - /** - * Reads given pattern from {@link #seqSource}. Skipping whitespace at start and end if wanted. 
- * - * @param expectedstring pattern to be skipped - * @param skipSpaces if set to true spaces before and after the string will be skipped - * @throws IOException if pattern could not be read - */ - protected void readExpectedstring(string expectedstring, bool skipSpaces) - { - SkipSpaces(); - foreach (var c in expectedstring) - { - if (seqSource.read() != c) - { - throw new IOException("Expected string '" + expectedstring - + "' but missed at character '" + c + "' at offset " - + seqSource.getPosition()); - } - } - SkipSpaces(); - } - - /** - * Read one char and throw an exception if it is not the expected value. - * - * @param ec the char value that is expected. - * @throws IOException if the read char is not the expected value or if an - * I/O error occurs. - */ - protected void readExpectedChar(char ec) - { - char c = (char)seqSource.read(); - if (c != ec) - { - throw new IOException("expected='" + ec + "' actual='" + c + "' at offset " + seqSource.getPosition()); - } - } - - /** - * This will read the next string from the stream up to a certain length. - * - * @param length The length to stop reading at. - * - * @return The string that was read from the stream of length 0 to length. - * - * @throws IOException If there is an error reading from the stream. - */ - protected string readstring(int length) - { - SkipSpaces(); - - int c = seqSource.read(); - - //average string size is around 2 and the normal string buffer size is - //about 16 so lets save some space. - StringBuilder buffer = new StringBuilder(length); - while (!isWhitespace(c) && !isClosing(c) && c != -1 && buffer.Length < length && - c != '[' && - c != '<' && - c != '(' && - c != '/') - { - buffer.Append((char)c); - c = seqSource.read(); - } - if (c != -1) - { - seqSource.unread(c); - } - return buffer.ToString(); - } - - /** - * This will tell if the next character is a closing brace( close of PDF array ). - * - * @return true if the next byte is ']', false otherwise. 
- * - * @throws IOException If an IO error occurs. - */ - protected bool isClosing() - { - return isClosing(seqSource.peek()); - } - - /** - * This will tell if the next character is a closing brace( close of PDF array ). - * - * @param c The character to check against end of line - * @return true if the next byte is ']', false otherwise. - */ - protected bool isClosing(int c) - { - return c == ']'; - } - - /** - * This will read bytes until the first end of line marker occurs. - * NOTE: The EOL marker may consists of 1 (CR or LF) or 2 (CR and CL) bytes - * which is an important detail if one wants to unread the line. - * - * @return The characters between the current position and the end of the line. - * - * @throws IOException If there is an error reading from the stream. - */ - protected string readLine() - { - if (seqSource.isEOF()) - { - throw new IOException("Error: End-of-File, expected line"); - } - - StringBuilder buffer = new StringBuilder(11); - - int c; - while ((c = seqSource.read()) != -1) - { - // CR and LF are valid EOLs - if (isEOL(c)) - { - break; - } - buffer.Append((char)c); - } - // CR+LF is also a valid EOL - if (isCR(c) && isLF(seqSource.peek())) - { - seqSource.read(); - } - return buffer.ToString(); - } - - /** - * This will tell if the next byte to be read is an end of line byte. - * - * @return true if the next byte is 0x0A or 0x0D. - * - * @throws IOException If there is an error reading from the stream. - */ - protected bool isEOL() - { - return isEOL(seqSource.peek()); - } - - /** - * This will tell if the next byte to be read is an end of line byte. - * - * @param c The character to check against end of line - * @return true if the next byte is 0x0A or 0x0D. - */ - protected bool isEOL(int c) - { - return isLF(c) || isCR(c); - } - - private bool isLF(int c) - { - return ASCII_LF == c; - } - - private bool isCR(int c) - { - return ASCII_CR == c; - } - - /** - * This will tell if the next byte is whitespace or not. 
- * - * @return true if the next byte in the stream is a whitespace character. - * - * @throws IOException If there is an error reading from the stream. - */ - protected bool isWhitespace() - { - return isWhitespace(seqSource.peek()); - } - - /** - * This will tell if a character is whitespace or not. These values are - * specified in table 1 (page 12) of ISO 32000-1:2008. - * @param c The character to check against whitespace - * @return true if the character is a whitespace character. - */ - protected bool isWhitespace(int c) - { - return c == 0 || c == 9 || c == 12 || c == ASCII_LF - || c == ASCII_CR || c == ASCII_SPACE; - } - - /** - * This will tell if the next byte is a space or not. - * - * @return true if the next byte in the stream is a space character. - * - * @throws IOException If there is an error reading from the stream. - */ - protected bool isSpace() - { - return isSpace(seqSource.peek()); - } - - /** - * This will tell if the given value is a space or not. - * - * @param c The character to check against space - * @return true if the next byte in the stream is a space character. - */ - protected bool isSpace(int c) - { - return ASCII_SPACE == c; - } - - /** - * This will tell if the next byte is a digit or not. - * - * @return true if the next byte in the stream is a digit. - * - * @throws IOException If there is an error reading from the stream. - */ - protected bool isDigit() - { - return isDigit(seqSource.peek()); - } - - /** - * This will tell if the given value is a digit or not. - * - * @param c The character to be checked - * @return true if the next byte in the stream is a digit. - */ - protected static bool isDigit(int c) - { - return c >= ASCII_ZERO && c <= ASCII_NINE; - } - - /** - * This will skip all spaces and comments that are present. - * - * @throws IOException If there is an error reading from the stream. 
- */ - protected void SkipSpaces() - { - int c = seqSource.read(); - // 37 is the % character, a comment - while (isWhitespace(c) || c == 37) - { - if (c == 37) - { - // skip past the comment section - c = seqSource.read(); - while (!isEOL(c) && c != -1) - { - c = seqSource.read(); - } - } - else - { - c = seqSource.read(); - } - } - if (c != -1) - { - seqSource.unread(c); - } - } - - /** - * This will read a long from the Stream and throw an {@link IOException} if - * the long value is negative or has more than 10 digits (i.e. : bigger than - * {@link #OBJECT_NUMBER_THRESHOLD}) - * - * @return the object number being read. - * @throws IOException if an I/O error occurs - */ - protected long readObjectNumber() - { - long retval = readLong(); - if (retval < 0 || retval >= OBJECT_NUMBER_THRESHOLD) - { - throw new IOException("Object Number '" + retval + "' has more than 10 digits or is negative"); - } - return retval; - } - - /** - * This will read a integer from the Stream and throw an {@link IllegalArgumentException} if the integer value - * has more than the maximum object revision (i.e. : bigger than {@link #GENERATION_NUMBER_THRESHOLD}) - * @return the generation number being read. - * @throws IOException if an I/O error occurs - */ - protected int readGenerationNumber() - { - int retval = readInt(); - if (retval < 0 || retval > GENERATION_NUMBER_THRESHOLD) - { - throw new IOException("Generation Number '" + retval + "' has more than 5 digits"); - } - return retval; - } - - /** - * This will read an integer from the stream. - * - * @return The integer that was read from the stream. - * - * @throws IOException If there is an error reading from the stream. 
- */ - protected int readInt() - { - SkipSpaces(); - int retval = 0; - - StringBuilder intBuffer = readstringNumber(); - - try - { - retval = int.Parse(intBuffer.ToString()); - } - catch (FormatException e) - { - seqSource.unread(OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString())); - throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e); - } - return retval; - } - - - /** - * This will read an long from the stream. - * - * @return The long that was read from the stream. - * - * @throws IOException If there is an error reading from the stream. - */ - protected long readLong() - { - SkipSpaces(); - long retval = 0; - - StringBuilder longBuffer = readstringNumber(); - - try - { - retval = long.Parse(longBuffer.ToString()); - } - catch (FormatException e) - { - seqSource.unread(OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString())); - - throw new IOException( - $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{longBuffer}\'", e); - } - - return retval; - } - - /** - * This method is used to read a token by the {@linkplain #readInt()} method - * and the {@linkplain #readLong()} method. - * - * @return the token to parse as integer or long by the calling method. - * @throws IOException throws by the {@link #seqSource} methods. 
- */ - protected StringBuilder readstringNumber() - { - int lastByte = 0; - StringBuilder buffer = new StringBuilder(); - while ((lastByte = seqSource.read()) != ASCII_SPACE && - lastByte != ASCII_LF && - lastByte != ASCII_CR && - lastByte != 60 && //see sourceforge bug 1714707 - lastByte != '[' && // PDFBOX-1845 - lastByte != '(' && // PDFBOX-2579 - lastByte != 0 && //See sourceforge bug 853328 - lastByte != -1) - { - buffer.Append((char)lastByte); - if (buffer.Length > MAX_LENGTH_LONG) - { - throw new IOException("Number '" + buffer + - "' is getting too long, stop reading at offset " + seqSource.getPosition()); - } - } - if (lastByte != -1) - { - seqSource.unread(lastByte); - } - return buffer; - } - } - -} diff --git a/src/UglyToad.Pdf/PdfDocument.cs b/src/UglyToad.Pdf/PdfDocument.cs index b320d8ba..584589c7 100644 --- a/src/UglyToad.Pdf/PdfDocument.cs +++ b/src/UglyToad.Pdf/PdfDocument.cs @@ -25,10 +25,15 @@ private readonly ParsingCachingProviders cachingProviders; [NotNull] - public Catalog Catalog { get; } + internal Catalog Catalog { get; } [NotNull] - public Pages Pages { get; } + internal Pages Pages { get; } + + /// + /// Get the number of pages in this document. + /// + public int NumberOfPages => Pages.Count; internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable, bool isLenientParsing, @@ -50,6 +55,16 @@ public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options); public static PdfDocument Open(string filename, ParsingOptions options = null) => PdfDocumentFactory.Open(filename, options); + /// + /// Get the page with the specified page number. + /// + /// The number of the page to return, this starts from 1. + /// The page. + public Page GetPage(int pageNumber) + { + return Pages.GetPage(pageNumber); + } + public void Dispose() { try