encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme

This commit is contained in:
Eliot Jones 2017-12-28 13:14:03 +00:00
parent 940c51e2fb
commit b1d28a5af8
8 changed files with 437 additions and 966 deletions

View File

@ -3,4 +3,40 @@
[![Build status](https://ci.appveyor.com/api/projects/status/ni7et2j2ml60pdi3?svg=true)](https://ci.appveyor.com/project/EliotJones/pdf)
[![codecov](https://codecov.io/gh/UglyToad/Pdf/branch/master/graph/badge.svg)](https://codecov.io/gh/UglyToad/Pdf)
Convert the [PdfBox](https://github.com/apache/pdfbox) code to C#.
The aim of this project is to convert the [PdfBox](https://github.com/apache/pdfbox) code to C# in order to provide a properly open source (i.e. no copyleft) solution for inspecting PDF documents. This uses the Apache 2.0 licence.
## Status ##
There is a lot left to do for this project. The initial minimum viable product, when released as an alpha, will provide:
+ Page counts and sizes (in points) for a document.
+ Access to the text contents of each page. Note that since PDF has no concept of a "word" it will be up to the consumer of the text to work out where the words are within the text.
+ (Possible) The locations and bounds of each letter on the page.
For the initial alpha release all files will be opened rather than streamed so this will not support large files.
Eventually the library should support all existing PdfBox operations such as accessing graphical elements, form elements as well as creating PDF documents.
## Usage ##
The initial public API will be as limited as possible to allow extensive refactoring to take place. The proposed usage is as follows:
using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf"))
{
int pageCount = document.NumberOfPages;
Page page = document.GetPage(1);
decimal widthInPoints = page.Width;
decimal heightInPoints = page.Height;
string text = page.Text;
}
The ```PdfDocument``` will also support opening from byte arrays (as well as streams eventually):
byte[] fileBytes = File.ReadAllBytes(@"C:\my-file.pdf");
using (PdfDocument document = PdfDocument.Open(fileBytes))
{
int numberOfPages = document.NumberOfPages;
}

View File

@ -0,0 +1,39 @@
namespace UglyToad.Pdf.Tests.Integration
{
using System;
/// <summary>
/// One expected letter position used by the integration tests, parsed from a single
/// tab-separated line of reference data.
/// </summary>
public class AssertablePositionData
{
    public decimal X { get; set; }

    public decimal Y { get; set; }

    public decimal Width { get; set; }

    public string Text { get; set; }

    public decimal FontSize { get; set; }

    public string FontName { get; set; }

    /// <summary>
    /// Parses a line of the form <c>X\tY\tWidth\tText\tFontSize\tFontName</c>.
    /// </summary>
    /// <param name="line">The tab-separated line to parse.</param>
    /// <returns>The populated position data.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
    /// <exception cref="ArgumentException">The line does not contain exactly 6 tab-separated parts.</exception>
    public static AssertablePositionData Parse(string line)
    {
        if (line == null)
        {
            throw new ArgumentNullException(nameof(line));
        }

        // Split(params char[]) keeps empty entries, which is what StringSplitOptions.None requested.
        var parts = line.Split('\t');

        if (parts.Length != 6)
        {
            throw new ArgumentException($"Expected 6 parts to the line, instead got {parts.Length}");
        }

        // The reference data always uses '.' as the decimal separator, so parsing must not
        // depend on the culture of the machine running the tests.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        return new AssertablePositionData
        {
            X = decimal.Parse(parts[0], invariant),
            Y = decimal.Parse(parts[1], invariant),
            Width = decimal.Parse(parts[2], invariant),
            Text = parts[3],
            FontSize = decimal.Parse(parts[4], invariant),
            FontName = parts[5]
        };
    }
}
}

View File

@ -43,25 +43,6 @@
public class PdfParserTests
{
[Fact]
public void CanParseSimpleGoogleDocsDocument()
{
// To see the text as shown in Visual Studio or Notepad++, use the OtherEncodings.BytesAsLatin1String()
var file = GetNthFilename();
using (var document = PdfDocument.Open(File.ReadAllBytes(file)))
{
Assert.Equal(1, document.Pages.Count);
var page = document.Pages.GetPage(1);
Assert.Equal(1, page.Number);
var text = string.Join(string.Empty, page.Content.Letters.Select(x => x.Value)).Replace("\u200B", string.Empty);
Assert.Equal("This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty), text.Replace(" ", string.Empty));
}
}
[Fact]
public void CanDecompressNormalObjectStream()
{

View File

@ -0,0 +1,328 @@
// ReSharper disable AccessToDisposedClosure
namespace UglyToad.Pdf.Tests.Integration
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Xunit;
public class SinglePageSimpleTests
{
private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
{
"\u200B"
};
/// <summary>
/// Builds the absolute path of the sample PDF, resolved three folders above the
/// application base directory under Integration/Documents.
/// </summary>
private static string GetFilename()
{
    var relativeFolder = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents");
    var absoluteFolder = Path.GetFullPath(relativeFolder);

    return Path.Combine(absoluteFolder, "Single Page Simple - from google drive.pdf");
}
[Fact]
public void HasCorrectNumberOfPages()
{
    // The sample document is expected to contain exactly one page.
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        Assert.Equal(1, pdf.NumberOfPages);
    }
}
[Fact]
public void CanAccessPage()
{
    // Page numbers are 1-based: the first page must exist and report number 1.
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        var firstPage = pdf.GetPage(1);

        Assert.NotNull(firstPage);
        Assert.Equal(1, firstPage.Number);
    }
}
[Fact]
public void AccessPageLowerThanOneThrows()
{
    // Requesting page 0 must be rejected with an ArgumentOutOfRangeException.
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        Action getPageZero = () => pdf.GetPage(0);

        Assert.Throws<ArgumentOutOfRangeException>(getPageZero);
    }
}
[Fact]
public void PageHasCorrectDimensions()
{
    // The first page is expected to measure 612 x 792.
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        var firstPage = pdf.GetPage(1);

        Assert.Equal(612, firstPage.Width);
        Assert.Equal(792, firstPage.Height);
    }
}
[Fact]
public void PageHasCorrectTextIgnoringHiddenCharacters()
{
    const string expected =
        "This is the document title There is some lede text here And then another line of text. ";

    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        var firstPage = pdf.GetPage(1);

        // Drop the letters listed in IgnoredHiddenCharacters before concatenating.
        var visibleValues = firstPage.Letters
            .Select(letter => letter.Value)
            .Where(value => !IgnoredHiddenCharacters.Contains(value));
        var actual = string.Join(string.Empty, visibleValues);

        Assert.Equal(expected, actual);
    }
}
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
{
// X Y Width Letter FontSize Font
const string fromPdfBox = @"72 105 9.771912 T 21 ArialMT
81.77106 105 8.897049 h 21 ArialMT
90.66733 105 3.554138 i 21 ArialMT
94.22115 105 7.998741 s 21 ArialMT
102.2192 105 0 21 Gautami
106.6634 105 0 21 Gautami
106.6634 105 3.554131 i 21 ArialMT
110.2173 105 7.998749 s 21 ArialMT
118.2153 105 0 21 Gautami
122.6595 105 0 21 Gautami
122.6595 105 4.444618 t 21 ArialMT
127.1038 105 8.897049 h 21 ArialMT
136 105 8.897049 e 21 ArialMT
144.8963 105 0 21 Gautami
149.3405 105 0 21 Gautami
149.3405 105 8.897049 d 21 ArialMT
158.2368 105 8.897049 o 21 ArialMT
167.1331 105 7.998749 c 21 ArialMT
175.1311 105 8.897049 u 21 ArialMT
184.0274 105 13.32605 m 21 ArialMT
197.3523 105 8.897049 e 21 ArialMT
206.2485 105 8.897049 n 21 ArialMT
215.1448 105 4.444611 t 21 ArialMT
219.5891 105 0 21 Gautami
224.0333 105 0 21 Gautami
224.0333 105 4.444611 t 21 ArialMT
228.4775 105 3.554138 i 21 ArialMT
232.0313 105 4.444611 t 21 ArialMT
236.4756 105 3.554123 l 21 ArialMT
240.0294 105 8.897049 e 21 ArialMT
72 143.25 6.716187 T 14 ArialMT
78.71446 143.25 6.114899 h 14 ArialMT
84.8278 143.25 6.114891 e 14 ArialMT
90.94113 143.25 3.661423 r 14 ArialMT
94.60161 143.25 6.114899 e 14 ArialMT
100.7149 143.25 0 14 Gautami
103.7689 143.25 0 14 Gautami
103.7689 143.25 2.442749 i 14 ArialMT
106.211 143.25 5.497505 s 14 ArialMT
111.7071 143.25 0 14 Gautami
114.7611 143.25 0 14 Gautami
114.7611 143.25 5.497505 s 14 ArialMT
120.2572 143.25 6.114899 o 14 ArialMT
126.3705 143.25 9.158928 m 14 ArialMT
135.5271 143.25 6.114899 e 14 ArialMT
141.6404 143.25 0 14 Gautami
144.6944 143.25 0 14 Gautami
144.6944 143.25 2.442749 l 14 ArialMT
147.1365 143.25 6.114899 e 14 ArialMT
153.2499 143.25 6.114899 d 14 ArialMT
159.3632 143.25 6.114899 e 14 ArialMT
165.4765 143.25 0 14 Gautami
168.5305 143.25 0 14 Gautami
168.5305 143.25 3.054749 t 14 ArialMT
171.5845 143.25 6.114899 e 14 ArialMT
177.6978 143.25 5.497498 x 14 ArialMT
183.1939 143.25 3.054764 t 14 ArialMT
186.2479 143.25 0 14 Gautami
189.3019 143.25 0 14 Gautami
189.3019 143.25 6.114899 h 14 ArialMT
195.4152 143.25 6.114899 e 14 ArialMT
201.5285 143.25 3.661423 r 14 ArialMT
205.189 143.25 6.114899 e 14 ArialMT
72 173.25 7.33358 A 14 ArialMT
79.3317 173.25 6.114891 n 14 ArialMT
85.44504 173.25 6.114891 d 14 ArialMT
91.55836 173.25 0 14 Gautami
94.61235 173.25 0 14 Gautami
94.61235 173.25 3.054756 t 14 ArialMT
97.66633 173.25 6.114899 h 14 ArialMT
103.7797 173.25 6.114899 e 14 ArialMT
109.893 173.25 6.114899 n 14 ArialMT
116.0063 173.25 0 14 Gautami
119.0603 173.25 0 14 Gautami
119.0603 173.25 6.114899 a 14 ArialMT
125.1736 173.25 6.114899 n 14 ArialMT
131.287 173.25 6.114899 o 14 ArialMT
137.4003 173.25 3.054749 t 14 ArialMT
140.4543 173.25 6.114899 h 14 ArialMT
146.5676 173.25 6.114899 e 14 ArialMT
152.6809 173.25 3.661423 r 14 ArialMT
156.3414 173.25 0 14 Gautami
159.3954 173.25 0 14 Gautami
159.3954 173.25 2.442749 l 14 ArialMT
161.8375 173.25 2.442734 i 14 ArialMT
164.2796 173.25 6.114899 n 14 ArialMT
170.393 173.25 6.114899 e 14 ArialMT
176.5063 173.25 0 14 Gautami
179.5603 173.25 0 14 Gautami
179.5603 173.25 6.114899 o 14 ArialMT
185.6736 173.25 3.054764 f 14 ArialMT
188.7276 173.25 0 14 Gautami
191.7816 173.25 0 14 Gautami
191.7816 173.25 3.054764 t 14 ArialMT
194.8355 173.25 6.114899 e 14 ArialMT
200.9489 173.25 5.497482 x 14 ArialMT
206.445 173.25 3.054764 t 14 ArialMT
209.499 173.25 3.054764 . 14 ArialMT";
return fromPdfBox.Split("\r\n", StringSplitOptions.RemoveEmptyEntries)
.Select(AssertablePositionData.Parse)
.ToList();
}
private static IReadOnlyList<AssertablePositionData> GetOtherPositionData1()
{
// These do not include the font information
const string fromOther = @"72 105 9.758476 T 0 ArialMT
81.77106 105 8.894608 h 0 ArialMT
90.66733 105 3.551445 i 0 ArialMT
94.22115 105 7.998749 s 0 ArialMT
102.2192 105 4.431305 0 ArialMT
102.2192 105 0 0 ArialMT
106.6634 105 3.551445 i 0 ArialMT
106.6634 105 0 0 ArialMT
110.2173 105 7.998749 s 0 ArialMT
118.2153 105 0 0 ArialMT
118.2153 105 4.431305 0 ArialMT
122.6595 105 4.431305 t 0 ArialMT
122.6595 105 0 0 ArialMT
127.1038 105 8.894608 h 0 ArialMT
136 105 8.894608 e 0 ArialMT
144.8963 105 4.431305 0 ArialMT
144.8963 105 0 0 ArialMT
149.3405 105 8.894608 d 0 ArialMT
149.3405 105 0 0 ArialMT
158.2368 105 8.894608 o 0 ArialMT
167.1331 105 7.998749 c 0 ArialMT
175.1311 105 8.894608 u 0 ArialMT
184.0274 105 13.32591 m 0 ArialMT
197.3523 105 8.894608 e 0 ArialMT
206.2485 105 8.894608 n 0 ArialMT
215.1448 105 4.431305 t 0 ArialMT
219.5891 105 4.431305 0 ArialMT
219.5891 105 0 0 ArialMT
224.0333 105 4.431305 t 0 ArialMT
224.0333 105 0 0 ArialMT
228.4775 105 3.551453 i 0 ArialMT
232.0313 105 4.431305 t 0 ArialMT
236.4756 105 3.551453 l 0 ArialMT
240.0294 105 8.894608 e 0 ArialMT
248.918 105 4.431305 0 ArialMT
72 128.25 3.045616 0 ArialMT
72 143.25 6.706947 T 0 ArialMT
78.71446 143.25 6.11322 h 0 ArialMT
84.8278 143.25 6.11322 e 0 ArialMT
90.94113 143.25 3.661331 r 0 ArialMT
94.60161 143.25 6.11322 e 0 ArialMT
100.7149 143.25 3.045616 0 ArialMT
100.7149 143.25 0 0 ArialMT
103.7689 143.25 2.440887 i 0 ArialMT
103.7689 143.25 0 0 ArialMT
106.211 143.25 5.497498 s 0 ArialMT
111.7071 143.25 3.045616 0 ArialMT
111.7071 143.25 0 0 ArialMT
114.7611 143.25 0 0 ArialMT
114.7611 143.25 5.497498 s 0 ArialMT
120.2572 143.25 6.11322 o 0 ArialMT
126.3705 143.25 9.158836 m 0 ArialMT
135.5271 143.25 6.11322 e 0 ArialMT
141.6404 143.25 0 0 ArialMT
141.6404 143.25 3.045609 0 ArialMT
144.6944 143.25 2.440887 l 0 ArialMT
144.6944 143.25 0 0 ArialMT
147.1365 143.25 6.11322 e 0 ArialMT
153.2499 143.25 6.11322 d 0 ArialMT
159.3632 143.25 6.11322 e 0 ArialMT
165.4765 143.25 0 0 ArialMT
165.4765 143.25 3.045609 0 ArialMT
168.5305 143.25 3.045609 t 0 ArialMT
168.5305 143.25 0 0 ArialMT
171.5845 143.25 6.11322 e 0 ArialMT
177.6978 143.25 5.497498 x 0 ArialMT
183.1939 143.25 3.045609 t 0 ArialMT
186.2479 143.25 0 0 ArialMT
186.2479 143.25 3.045609 0 ArialMT
189.3019 143.25 6.11322 h 0 ArialMT
189.3019 143.25 0 0 ArialMT
195.4152 143.25 6.11322 e 0 ArialMT
201.5285 143.25 3.661331 r 0 ArialMT
205.189 143.25 6.11322 e 0 ArialMT
211.3008 143.25 3.045609 0 ArialMT
72 158.25 3.045616 0 ArialMT
72 173.25 7.32267 A 0 ArialMT
79.3317 173.25 6.11322 n 0 ArialMT
85.44504 173.25 6.11322 d 0 ArialMT
91.55836 173.25 3.045616 0 ArialMT
91.55836 173.25 0 0 ArialMT
94.61235 173.25 0 0 ArialMT
94.61235 173.25 3.045616 t 0 ArialMT
97.66633 173.25 6.11322 h 0 ArialMT
103.7797 173.25 6.11322 e 0 ArialMT
109.893 173.25 6.11322 n 0 ArialMT
116.0063 173.25 0 0 ArialMT
116.0063 173.25 3.045616 0 ArialMT
119.0603 173.25 6.11322 a 0 ArialMT
119.0603 173.25 0 0 ArialMT
125.1736 173.25 6.11322 n 0 ArialMT
131.287 173.25 6.11322 o 0 ArialMT
137.4003 173.25 3.045609 t 0 ArialMT
140.4543 173.25 6.11322 h 0 ArialMT
146.5676 173.25 6.11322 e 0 ArialMT
152.6809 173.25 3.661331 r 0 ArialMT
156.3414 173.25 3.045609 0 ArialMT
156.3414 173.25 0 0 ArialMT
159.3954 173.25 2.440887 l 0 ArialMT
159.3954 173.25 0 0 ArialMT
161.8375 173.25 2.440887 i 0 ArialMT
164.2796 173.25 6.11322 n 0 ArialMT
170.393 173.25 6.11322 e 0 ArialMT
176.5063 173.25 3.045609 0 ArialMT
176.5063 173.25 0 0 ArialMT
179.5603 173.25 6.11322 o 0 ArialMT
179.5603 173.25 0 0 ArialMT
185.6736 173.25 3.045609 f 0 ArialMT
188.7276 173.25 0 0 ArialMT
188.7276 173.25 3.045609 0 ArialMT
191.7816 173.25 3.045609 t 0 ArialMT
191.7816 173.25 0 0 ArialMT
194.8355 173.25 6.11322 e 0 ArialMT
200.9489 173.25 5.497498 x 0 ArialMT
206.445 173.25 3.045609 t 0 ArialMT
209.499 173.25 3.045609 . 0 ArialMT
212.543 173.25 3.045609 0 ArialMT";
return fromOther.Split("\r\n", StringSplitOptions.RemoveEmptyEntries)
.Select(AssertablePositionData.Parse)
.ToList();
}
}
}

View File

@ -10,11 +10,21 @@
/// </summary>
public int Number { get; }
public MediaBox MediaBox { get; }
internal MediaBox MediaBox { get; }
internal PageContent Content { get; }
public IReadOnlyList<Letter> Text => Content?.Letters ?? new Letter[0];
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
/// <summary>
/// Gets the width of the page in points.
/// </summary>
public decimal Width { get; }
/// <summary>
/// Gets the height of the page in points.
/// </summary>
public decimal Height { get; }
internal Page(int number, MediaBox mediaBox, PageContent content)
{
@ -26,6 +36,9 @@
Number = number;
MediaBox = mediaBox;
Content = content;
Width = mediaBox.Bounds.Width;
Height = mediaBox.Bounds.Height;
}
}
}

View File

@ -75,7 +75,7 @@
if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary))
{
throw new InvalidOperationException("Could not find the page with number: " + pageNumber);
// ArgumentOutOfRangeException(string) interprets its single argument as the parameter
// name, so the name, offending value and message are passed explicitly.
throw new ArgumentOutOfRangeException(nameof(pageNumber), pageNumber, "Could not find the page with number: " + pageNumber);
}
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);

View File

@ -1,941 +0,0 @@
namespace UglyToad.Pdf.Parser
{
using System;
using System.IO;
using System.Text;
using Cos;
using IO;
using Util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This class is used to contain parsing logic that will be used by both the
* PDFParser and the COSStreamParser.
*
* @author Ben Litchfield
*/
public abstract class BaseParser
{
private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L;
private static readonly long GENERATION_NUMBER_THRESHOLD = 65535;
static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString().Length;
/**
* Log instance.
*/
protected static readonly int E = 'e';
protected static readonly int N = 'n';
protected static readonly int D = 'd';
protected static readonly int S = 's';
protected static readonly int T = 't';
protected static readonly int R = 'r';
protected static readonly int A = 'a';
protected static readonly int M = 'm';
protected static readonly int O = 'o';
protected static readonly int B = 'b';
protected static readonly int J = 'j';
/**
* This is a string constant that will be used for comparisons.
*/
public static readonly string DEF = "def";
/**
* This is a string constant that will be used for comparisons.
*/
protected static readonly string ENDOBJ_string = "endobj";
/**
* This is a string constant that will be used for comparisons.
*/
protected static readonly string ENDSTREAM_string = "endstream";
/**
* This is a string constant that will be used for comparisons.
*/
protected static readonly string STREAM_string = "stream";
/**
* This is a string constant that will be used for comparisons.
*/
private static readonly string TRUE = "true";
/**
* This is a string constant that will be used for comparisons.
*/
private static readonly string FALSE = "false";
/**
* This is a string constant that will be used for comparisons.
*/
private static readonly string NULL = "null";
/**
* ASCII code for line feed.
*/
protected static readonly byte ASCII_LF = 10;
/**
* ASCII code for carriage return.
*/
protected static readonly byte ASCII_CR = 13;
private static readonly byte ASCII_ZERO = 48;
private static readonly byte ASCII_NINE = 57;
private static readonly byte ASCII_SPACE = 32;
/**
* This is the stream that will be read from.
*/
protected readonly SequentialSource seqSource;
/**
* This is the document that will be parsed.
*/
protected COSDocument document;
/**
* Default constructor.
*/
public BaseParser(SequentialSource pdfSource)
{
this.seqSource = pdfSource;
}
/**
 * Tells whether a character may appear in a hex string: anything char.IsDigit
 * accepts, or a letter in the ranges a-f / A-F.
 *
 * @param ch The character to check
 * @return true if the character counts as a hex digit.
 */
private static bool isHexDigit(char ch)
{
    if (char.IsDigit(ch))
    {
        return true;
    }

    bool isLowerHexLetter = ch >= 'a' && ch <= 'f';
    bool isUpperHexLetter = ch >= 'A' && ch <= 'F';

    return isLowerHexLetter || isUpperHexLetter;
}
/**
 * Consumes any run of space characters and then a single end-of-line marker
 * (LF, CR LF, or a lone CR). Any other byte that was read is pushed back.
 *
 * PDF Ref 3.2.7: a stream must be followed by either a CRLF or LF but nothing else.
 * Files such as brother_scan_cover.pdf add spaces before the line break, and a lone CR
 * occurs in real-world files, so both are tolerated.
 */
protected void skipWhiteSpaces()
{
    int next = seqSource.read();

    // Leading spaces are skipped before looking for the line break.
    while (next == ASCII_SPACE)
    {
        next = seqSource.read();
    }

    if (next == ASCII_LF)
    {
        return;
    }

    if (next == ASCII_CR)
    {
        next = seqSource.read();
        if (next != ASCII_LF)
        {
            // Invalid per the spec, but supported: only the CR is consumed.
            seqSource.unread(next);
        }

        return;
    }

    // Not a line break at all: be lenient and leave the byte for the caller.
    seqSource.unread(next);
}
/**
 * Works around unbalanced parentheses in literal strings written by buggy producers,
 * e.g. "/Title ( (5) /Creator" or "/Title (c:\) /Producer".
 *
 * The next three bytes are inspected (and always pushed back). The string is treated as
 * finished, i.e. the brace counter is reset to 0, when they are one of:
 *   1. CR + LF + '/'  (next line contains another object)
 *   2. CR + LF + '>'  (dictionary ends on the next line)
 *   3. CR + '/'
 *   4. CR + '>'
 *
 * @param bracesParameter the number of braces currently open.
 *
 * @return the corrected value of the brace counter
 * @throws IOException
 */
private int checkForEndOfstring(int bracesParameter)
{
    byte[] nextThreeBytes = new byte[3];
    int amountRead = seqSource.read(nextThreeBytes);

    bool endsString = false;
    if (amountRead == 3 && nextThreeBytes[0] == ASCII_CR)
    {
        // The third byte is only relevant after CR LF; previously a misplaced parenthesis
        // let CR + <any byte> + '>' terminate the string.
        bool delimiterAfterCrLf = nextThreeBytes[1] == ASCII_LF
                                  && (nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>');
        bool delimiterAfterCr = nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>';

        endsString = delimiterAfterCrLf || delimiterAfterCr;
    }

    // This is a look-ahead only: everything that was read goes back to the source.
    if (amountRead > 0)
    {
        seqSource.unread(nextThreeBytes, 0, amountRead);
    }

    return endsString ? 0 : bracesParameter;
}
/**
 * This will parse a PDF string. A leading '<' delegates to the hex string parser,
 * otherwise a literal string starting with '(' is expected.
 *
 * Each character read from the source is stored as a single byte. Escape sequences
 * (\n, \r, \t, \b, \f, \(, \), \\, up to three octal digits, and a backslash followed by
 * an end-of-line) are decoded; for any other escaped character the backslash is dropped.
 *
 * @return The parsed PDF string.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected CosString parseCOSstring()
{
    char nextChar = (char)seqSource.read();
    if (nextChar == '<')
    {
        return parseCOSHexstring();
    }

    if (nextChar != '(')
    {
        throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
                              nextChar + "' " + seqSource);
    }

    // Bytes are written straight to the MemoryStream. Going through a StreamWriter would
    // buffer the output (so ToArray() could miss it unless flushed), re-encode characters
    // as UTF-8 and print integers as decimal text.
    using (var memoryStream = new MemoryStream())
    {
        // This is the number of braces read
        int braces = 1;
        int c = seqSource.read();
        while (braces > 0 && c != -1)
        {
            char ch = (char)c;
            int nextc = -2; // not yet read

            if (ch == ')')
            {
                braces--;
                braces = checkForEndOfstring(braces);
                if (braces != 0)
                {
                    memoryStream.WriteByte(unchecked((byte)ch));
                }
            }
            else if (ch == '(')
            {
                braces++;
                memoryStream.WriteByte(unchecked((byte)ch));
            }
            else if (ch == '\\')
            {
                char next = (char)seqSource.read();
                switch (next)
                {
                    case 'n':
                        memoryStream.WriteByte((byte)'\n');
                        break;
                    case 'r':
                        memoryStream.WriteByte((byte)'\r');
                        break;
                    case 't':
                        memoryStream.WriteByte((byte)'\t');
                        break;
                    case 'b':
                        memoryStream.WriteByte((byte)'\b');
                        break;
                    case 'f':
                        memoryStream.WriteByte((byte)'\f');
                        break;
                    case ')':
                        // PDFBox 276 /Title (c:\)
                        braces = checkForEndOfstring(braces);
                        if (braces != 0)
                        {
                            memoryStream.WriteByte(unchecked((byte)next));
                        }
                        else
                        {
                            memoryStream.WriteByte((byte)'\\');
                        }
                        break;
                    case '(':
                    case '\\':
                        memoryStream.WriteByte(unchecked((byte)next));
                        break;
                    case '\n':
                    case '\r':
                        // A backslash at the end of a line is a line continuation: ignore it
                        // together with the end-of-line bytes and carry on with the next byte.
                        c = seqSource.read();
                        while (isEOL(c) && c != -1)
                        {
                            c = seqSource.read();
                        }
                        nextc = c;
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    {
                        // One to three octal digits; the first non-octal byte read while
                        // looking for more digits is kept in nextc for the next iteration.
                        var octal = new StringBuilder();
                        octal.Append(next);
                        c = seqSource.read();
                        char digit = (char)c;
                        if (digit >= '0' && digit <= '7')
                        {
                            octal.Append(digit);
                            c = seqSource.read();
                            digit = (char)c;
                            if (digit >= '0' && digit <= '7')
                            {
                                octal.Append(digit);
                            }
                            else
                            {
                                nextc = c;
                            }
                        }
                        else
                        {
                            nextc = c;
                        }

                        int character;
                        try
                        {
                            character = Convert.ToInt32(octal.ToString(), 8);
                        }
                        catch (FormatException e)
                        {
                            throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
                        }

                        // Only the low byte is stored; higher bits of e.g. \777 are discarded.
                        memoryStream.WriteByte(unchecked((byte)character));
                        break;
                    }
                    default:
                        // dropping the backslash
                        // see 7.3.4.2 Literal strings for further information
                        memoryStream.WriteByte(unchecked((byte)next));
                        break;
                }
            }
            else
            {
                memoryStream.WriteByte(unchecked((byte)ch));
            }

            c = nextc != -2 ? nextc : seqSource.read();
        }

        if (c != -1)
        {
            seqSource.unread(c);
        }

        return new CosString(memoryStream.ToArray());
    }
}
/**
 * This will parse a PDF HEX string with fail fast semantics: collecting stops at the
 * first character that is neither a hex digit nor ignorable whitespace, so malformed
 * input can be detected and the parser can skip to the closing bracket.
 *
 * We assume starting '&lt;' was already read.
 *
 * @return The parsed PDF string.
 *
 * @throws IOException If the end of the stream is reached before the closing '&gt;'.
 */
private CosString parseCOSHexstring()
{
    var hexDigits = new StringBuilder();

    while (true)
    {
        int current = seqSource.read();

        if (isHexDigit((char)current))
        {
            hexDigits.Append((char)current);
            continue;
        }

        if (current == '>')
        {
            break;
        }

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        bool isIgnorable = (current == ' ') || (current == '\n') ||
                           (current == '\t') || (current == '\r') ||
                           (current == '\b') || (current == '\f');
        if (isIgnorable)
        {
            continue;
        }

        // An invalid character was found: discard the last hex character if it is not
        // part of a complete pair.
        if (hexDigits.Length % 2 != 0)
        {
            hexDigits.Remove(hexDigits.Length - 1, 1);
        }

        // Skip ahead to the closing bracket; stopping at EOF prevents an endless loop
        // on malformed PDFs.
        do
        {
            current = seqSource.read();
        }
        while (current != '>' && current >= 0);

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        break;
    }

    return CosString.ParseHex(hexDigits.ToString());
}
/**
 * Determine if a character terminates a PDF name: space, CR, LF, tab (9), NUL (0),
 * form feed, or one of the characters > < [ ] / ( ).
 *
 * @param ch The character
 * @return true if the character terminates a PDF name, otherwise false.
 */
protected bool isEndOfName(int ch)
{
    if (ch == ASCII_SPACE || ch == ASCII_CR || ch == ASCII_LF)
    {
        return true;
    }

    switch (ch)
    {
        case 0:
        case 9:
        case '\f':
        case '>':
        case '<':
        case '[':
        case ']':
        case '/':
        case '(':
        case ')':
            return true;
        default:
            return false;
    }
}
/**
 * Returns true if a byte sequence is valid UTF-8.
 *
 * A strict UTF8Encoding (throwOnInvalidBytes = true) is used because the default
 * decoder replaces invalid sequences instead of throwing, which would make every
 * input look valid.
 *
 * @param input The bytes to check; null is reported as invalid.
 * @return true if the bytes decode as UTF-8 without errors, otherwise false.
 */
private bool isValidUTF8(byte[] input)
{
    if (input == null)
    {
        return false;
    }

    try
    {
        new UTF8Encoding(false, true).GetCharCount(input);
        return true;
    }
    catch (DecoderFallbackException)
    {
        return false;
    }
}
/**
 * This will parse a bool object from the stream. The next character is peeked: 't'
 * requires the four characters "true", 'f' requires the five characters "false".
 *
 * @return The parsed bool object.
 *
 * @throws IOException If the stream does not contain "true" or "false" at the current position.
 */
protected CosBoolean parsebool()
{
    char c = (char)seqSource.peek();

    if (c == 't')
    {
        string truestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4));
        if (!truestring.Equals(TRUE))
        {
            throw new IOException("Error parsing bool: expected='true' actual='" + truestring
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.True;
    }

    if (c == 'f')
    {
        string falsestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5));
        if (!falsestring.Equals(FALSE))
        {
            // The message previously claimed that 'true' was expected here.
            throw new IOException("Error parsing bool: expected='false' actual='" + falsestring
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.False;
    }

    throw new IOException("Error parsing bool expected='t or f' actual='" + c
                          + "' at offset " + seqSource.getPosition());
}
/**
 * This will read the next string from the stream: leading spaces and comments are
 * skipped, then characters are collected until an end-of-name character or the end of
 * the stream. The terminating character is pushed back.
 *
 * @return The string that was read from the stream, never null.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected string readstring()
{
    SkipSpaces();

    var text = new StringBuilder();
    int current = seqSource.read();

    while (true)
    {
        bool isTerminator = isEndOfName((char)current) || current == -1;
        if (isTerminator)
        {
            break;
        }

        text.Append((char)current);
        current = seqSource.read();
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }

    return text.ToString();
}
/**
 * Read one string and throw an exception if it is not the expected value.
 * Forwards to the two-argument overload with skipSpaces set to false.
 *
 * @param expectedstring the string value that is expected.
 * @throws IOException if the string char is not the expected value or if an
 * I/O error occurs.
 */
protected void readExpectedstring(string expectedstring) => readExpectedstring(expectedstring, false);
/**
 * Reads given pattern from {@link #seqSource}, skipping spaces and comments before and
 * after it.
 *
 * NOTE(review): the skipSpaces argument is currently not consulted - spaces are skipped
 * regardless of its value. Confirm against callers before changing this.
 *
 * @param expectedstring pattern to be skipped
 * @param skipSpaces intended to control whether surrounding spaces are skipped (see note)
 * @throws IOException if pattern could not be read
 */
protected void readExpectedstring(string expectedstring, bool skipSpaces)
{
    SkipSpaces();

    for (int index = 0; index < expectedstring.Length; index++)
    {
        char expected = expectedstring[index];
        if (seqSource.read() == expected)
        {
            continue;
        }

        throw new IOException("Expected string '" + expectedstring
                              + "' but missed at character '" + expected + "' at offset "
                              + seqSource.getPosition());
    }

    SkipSpaces();
}
/**
 * Read one char and throw an exception if it is not the expected value.
 *
 * @param ec the char value that is expected.
 * @throws IOException if the read char is not the expected value or if an
 * I/O error occurs.
 */
protected void readExpectedChar(char ec)
{
    char actual = (char)seqSource.read();
    if (actual == ec)
    {
        return;
    }

    throw new IOException("expected='" + ec + "' actual='" + actual + "' at offset " + seqSource.getPosition());
}
/**
 * This will read the next string from the stream up to a certain length. Reading stops
 * at whitespace, at one of ] [ &lt; ( /, at the end of the stream, or once the requested
 * length has been collected; the byte that stopped the read is pushed back.
 *
 * @param length The length to stop reading at.
 *
 * @return The string that was read from the stream of length 0 to length.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected string readstring(int length)
{
    SkipSpaces();
    int current = seqSource.read();

    // Pre-size the buffer to the requested maximum rather than the default capacity.
    var text = new StringBuilder(length);

    while (true)
    {
        bool isDelimiter = isWhitespace(current) || isClosing(current)
                           || current == '[' || current == '<'
                           || current == '(' || current == '/';
        if (isDelimiter || current == -1 || text.Length >= length)
        {
            break;
        }

        text.Append((char)current);
        current = seqSource.read();
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }

    return text.ToString();
}
/**
 * This will tell if the next character is a closing brace( close of PDF array ).
 * The character is only peeked, not consumed.
 *
 * @return true if the next byte is ']', false otherwise.
 *
 * @throws IOException If an IO error occurs.
 */
protected bool isClosing() => isClosing(seqSource.peek());

/**
 * This will tell if the given character is a closing brace( close of PDF array ).
 *
 * @param c The character to check
 * @return true if the character is ']', false otherwise.
 */
protected bool isClosing(int c) => c == ']';
/**
 * This will read bytes until the first end of line marker occurs.
 * NOTE: The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes
 * which is an important detail if one wants to unread the line.
 *
 * @return The characters between the current position and the end of the line.
 *
 * @throws IOException If the source is already at the end of the file.
 */
protected string readLine()
{
    if (seqSource.isEOF())
    {
        throw new IOException("Error: End-of-File, expected line");
    }

    var line = new StringBuilder(11);

    // CR and LF are valid EOLs
    int current = seqSource.read();
    while (current != -1 && !isEOL(current))
    {
        line.Append((char)current);
        current = seqSource.read();
    }

    // CR+LF is also a valid EOL, so swallow the LF that follows a CR.
    if (isCR(current) && isLF(seqSource.peek()))
    {
        seqSource.read();
    }

    return line.ToString();
}
/**
 * This will tell if the next byte to be read is an end of line byte.
 * The byte is only peeked, not consumed.
 *
 * @return true if the next byte is 0x0A or 0x0D.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected bool isEOL() => isEOL(seqSource.peek());

/**
 * This will tell if the given value is an end of line byte.
 *
 * @param c The character to check against end of line
 * @return true if the value is 0x0A or 0x0D.
 */
protected bool isEOL(int c) => isCR(c) || isLF(c);

/**
 * @return true if the value is the line feed byte.
 */
private bool isLF(int c) => c == ASCII_LF;

/**
 * @return true if the value is the carriage return byte.
 */
private bool isCR(int c) => c == ASCII_CR;
/**
 * This will tell if the next byte is whitespace or not.
 * The byte is only peeked, not consumed.
 *
 * @return true if the next byte in the stream is a whitespace character.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected bool isWhitespace() => isWhitespace(seqSource.peek());

/**
 * This will tell if a character is whitespace or not. These values are
 * specified in table 1 (page 12) of ISO 32000-1:2008.
 * @param c The character to check against whitespace
 * @return true if the character is a whitespace character.
 */
protected bool isWhitespace(int c)
{
    switch (c)
    {
        case 0:
        case 9:
        case 12:
            return true;
        default:
            return c == ASCII_LF || c == ASCII_CR || c == ASCII_SPACE;
    }
}
/**
 * This will tell if the next byte is a space or not.
 * The byte is only peeked, not consumed.
 *
 * @return true if the next byte in the stream is a space character.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected bool isSpace() => isSpace(seqSource.peek());

/**
 * This will tell if the given value is a space or not.
 *
 * @param c The character to check against space
 * @return true if the value is the space character.
 */
protected bool isSpace(int c) => c == ASCII_SPACE;
/**
 * This will tell if the next byte is a digit or not.
 * The byte is only peeked, not consumed.
 *
 * @return true if the next byte in the stream is a digit.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected bool isDigit() => isDigit(seqSource.peek());

/**
 * This will tell if the given value is a digit or not, i.e. lies between the
 * ASCII codes of '0' and '9' inclusive.
 *
 * @param c The character to be checked
 * @return true if the value is a digit.
 */
protected static bool isDigit(int c)
{
    bool outsideDigitRange = c < ASCII_ZERO || c > ASCII_NINE;

    return !outsideDigitRange;
}
/**
 * This will skip all spaces and comments that are present. A comment starts with '%'
 * and runs to the next end-of-line byte. The first byte that is neither whitespace nor
 * part of a comment is pushed back.
 *
 * @throws IOException If there is an error reading from the stream.
 */
protected void SkipSpaces()
{
    // 37 is the % character, a comment
    const int commentStart = 37;

    int current = seqSource.read();
    while (true)
    {
        if (current == commentStart)
        {
            // skip past the comment section
            do
            {
                current = seqSource.read();
            }
            while (!isEOL(current) && current != -1);
        }
        else if (isWhitespace(current))
        {
            current = seqSource.read();
        }
        else
        {
            break;
        }
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }
}
/**
 * Reads a long from the source via readLong() and validates it as an object
 * number: the value must be non-negative and below OBJECT_NUMBER_THRESHOLD.
 *
 * @return the object number being read.
 * @throws IOException if the value is out of range or an I/O error occurs
 */
protected long readObjectNumber()
{
    long objectNumber = readLong();

    bool inRange = objectNumber >= 0 && objectNumber < OBJECT_NUMBER_THRESHOLD;
    if (inRange)
    {
        return objectNumber;
    }

    throw new IOException("Object Number '" + objectNumber + "' has more than 10 digits or is negative");
}
/**
 * Reads an integer from the source via readInt() and validates it as a
 * generation number: the value must be non-negative and no greater than
 * GENERATION_NUMBER_THRESHOLD.
 *
 * @return the generation number being read.
 * @throws IOException if the value is out of range or an I/O error occurs
 */
protected int readGenerationNumber()
{
    int generation = readInt();

    bool inRange = generation >= 0 && generation <= GENERATION_NUMBER_THRESHOLD;
    if (inRange)
    {
        return generation;
    }

    throw new IOException("Generation Number '" + generation + "' has more than 5 digits");
}
/**
 * Skips leading whitespace and comments, reads the next numeric token with
 * readstringNumber() and parses it as a 32-bit integer.
 *
 * If the token cannot be parsed, its characters are pushed back onto the
 * source before the error is raised.
 *
 * @return The integer that was read from the stream.
 *
 * @throws IOException If the token is not a valid integer, is outside the
 * range of an int, or there is an error reading from the stream.
 */
protected int readInt()
{
    SkipSpaces();
    StringBuilder intBuffer = readstringNumber();
    string token = intBuffer.ToString();
    try
    {
        // Parse with the invariant culture: the token comes from file content,
        // so the result must not depend on the current thread's culture.
        return int.Parse(
            token,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // int.Parse reports out-of-range input with OverflowException rather
        // than FormatException; both mean "not a usable integer" here.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(token));
        throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e);
    }
}
/**
 * Skips leading whitespace and comments, reads the next numeric token with
 * readstringNumber() and parses it as a 64-bit integer.
 *
 * If the token cannot be parsed, its characters are pushed back onto the
 * source before the error is raised.
 *
 * @return The long that was read from the stream.
 *
 * @throws IOException If the token is not a valid integer, is outside the
 * range of a long, or there is an error reading from the stream.
 */
protected long readLong()
{
    SkipSpaces();
    StringBuilder longBuffer = readstringNumber();
    try
    {
        // Parse with the invariant culture: the token comes from file content,
        // so the result must not depend on the current thread's culture.
        return long.Parse(
            longBuffer.ToString(),
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // long.Parse reports out-of-range input with OverflowException rather
        // than FormatException; both mean "not a usable long" here.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString()));
        throw new IOException(
            $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{longBuffer}\'", e);
    }
}
/**
 * Collects the characters of the next numeric token for readInt() and
 * readLong(). Reading stops at the first terminating byte, which is pushed
 * back onto the source unless it marks the end of the source.
 *
 * @return the token to parse as integer or long by the calling method.
 * @throws IOException if the token grows beyond MAX_LENGTH_LONG characters,
 * or if thrown by the seqSource methods.
 */
protected StringBuilder readstringNumber()
{
    StringBuilder token = new StringBuilder();
    int current;

    while (true)
    {
        current = seqSource.read();

        bool endsToken = current == -1
                         || current == 0 // See sourceforge bug 853328
                         || current == ASCII_SPACE
                         || current == ASCII_LF
                         || current == ASCII_CR
                         || current == 60 // '<', see sourceforge bug 1714707
                         || current == '[' // PDFBOX-1845
                         || current == '('; // PDFBOX-2579
        if (endsToken)
        {
            break;
        }

        token.Append((char)current);
        if (token.Length > MAX_LENGTH_LONG)
        {
            throw new IOException("Number '" + token +
                                  "' is getting too long, stop reading at offset " + seqSource.getPosition());
        }
    }

    // The terminator belongs to whatever follows the number, so hand it back.
    if (current != -1)
    {
        seqSource.unread(current);
    }

    return token;
}
}
}

View File

@ -25,10 +25,15 @@
private readonly ParsingCachingProviders cachingProviders;
[NotNull]
public Catalog Catalog { get; }
internal Catalog Catalog { get; }
[NotNull]
public Pages Pages { get; }
internal Pages Pages { get; }
/// <summary>
/// The count of pages held by <see cref="Pages"/> for this document.
/// </summary>
public int NumberOfPages
{
    get { return Pages.Count; }
}
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
bool isLenientParsing,
@ -50,6 +55,16 @@
/// <summary>
/// Opens a document from the bytes of a file by delegating to <see cref="PdfDocumentFactory"/>.
/// </summary>
/// <param name="fileBytes">The bytes of the file to open.</param>
/// <param name="options">Optional parsing options; passed to the factory unchanged (null by default).</param>
/// <returns>The document produced by the factory.</returns>
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null)
{
    return PdfDocumentFactory.Open(fileBytes, options);
}
/// <summary>
/// Opens a document from a file name by delegating to <see cref="PdfDocumentFactory"/>.
/// </summary>
/// <param name="filename">The name of the file to open.</param>
/// <param name="options">Optional parsing options; passed to the factory unchanged (null by default).</param>
/// <returns>The document produced by the factory.</returns>
public static PdfDocument Open(string filename, ParsingOptions options = null)
{
    return PdfDocumentFactory.Open(filename, options);
}
/// <summary>
/// Retrieve a single page of this document by its page number.
/// </summary>
/// <param name="pageNumber">The number of the page to return; numbering starts from 1.</param>
/// <returns>The page returned by <see cref="Pages"/> for that number.</returns>
public Page GetPage(int pageNumber) => Pages.GetPage(pageNumber);
public void Dispose()
{
try