Encapsulate the internals better and improve the API for PdfDocument; delete old code and tidy tests. Expand the README.

This commit is contained in:
Eliot Jones
2017-12-28 13:14:03 +00:00
parent 940c51e2fb
commit b1d28a5af8
8 changed files with 437 additions and 966 deletions

View File

@@ -0,0 +1,39 @@
namespace UglyToad.Pdf.Tests.Integration
{
    using System;
    using System.Globalization;

    /// <summary>
    /// One row of expected letter position data, parsed from a tab separated line
    /// with the columns X, Y, Width, Text, FontSize and FontName (in that order).
    /// </summary>
    public class AssertablePositionData
    {
        /// <summary>The number of tab separated columns a line must contain.</summary>
        private const int ExpectedPartCount = 6;

        /// <summary>Value of the first column.</summary>
        public decimal X { get; set; }

        /// <summary>Value of the second column.</summary>
        public decimal Y { get; set; }

        /// <summary>Value of the third column.</summary>
        public decimal Width { get; set; }

        /// <summary>Value of the fourth column, kept exactly as it appears in the line.</summary>
        public string Text { get; set; }

        /// <summary>Value of the fifth column.</summary>
        public decimal FontSize { get; set; }

        /// <summary>Value of the sixth column, kept exactly as it appears in the line.</summary>
        public string FontName { get; set; }

        /// <summary>
        /// Parses a single tab separated line into an <see cref="AssertablePositionData"/>.
        /// </summary>
        /// <param name="line">The line to parse; it must contain exactly six tab separated parts.</param>
        /// <returns>The parsed position data.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
        /// <exception cref="ArgumentException">The line does not contain exactly six parts.</exception>
        public static AssertablePositionData Parse(string line)
        {
            if (line == null)
            {
                throw new ArgumentNullException(nameof(line));
            }

            var parts = line.Split(new[] { '\t' }, StringSplitOptions.None);

            if (parts.Length != ExpectedPartCount)
            {
                throw new ArgumentException($"Expected 6 parts to the line, instead got {parts.Length}");
            }

            // The data always uses '.' as the decimal separator, so it must be parsed with the
            // invariant culture; otherwise the result depends on the locale of the machine.
            var culture = CultureInfo.InvariantCulture;

            return new AssertablePositionData
            {
                X = decimal.Parse(parts[0], culture),
                Y = decimal.Parse(parts[1], culture),
                Width = decimal.Parse(parts[2], culture),
                Text = parts[3],
                FontSize = decimal.Parse(parts[4], culture),
                FontName = parts[5]
            };
        }
    }
}

View File

@@ -43,25 +43,6 @@
public class PdfParserTests
{
[Fact]
public void CanParseSimpleGoogleDocsDocument()
{
    // To see the text as shown in Visual Studio or Notepad++, use the OtherEncodings.BytesAsLatin1String()
    const string zeroWidthSpace = "\u200B";
    const string expectedText = "This is the document title There is some lede text here And then another line of text.";

    var bytes = File.ReadAllBytes(GetNthFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        Assert.Equal(1, pdf.Pages.Count);

        var firstPage = pdf.Pages.GetPage(1);
        Assert.Equal(1, firstPage.Number);

        // Join every letter value and drop the zero width spaces before comparing.
        var letterValues = firstPage.Content.Letters.Select(letter => letter.Value);
        var actualText = string.Join(string.Empty, letterValues).Replace(zeroWidthSpace, string.Empty);

        // Spaces are removed from both sides so only the visible characters are compared.
        Assert.Equal(expectedText.Replace(" ", string.Empty), actualText.Replace(" ", string.Empty));
    }
}
[Fact]
public void CanDecompressNormalObjectStream()
{

View File

@@ -0,0 +1,328 @@
// ReSharper disable AccessToDisposedClosure
namespace UglyToad.Pdf.Tests.Integration
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Xunit;
public class SinglePageSimpleTests
{
private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
{
"\u200B"
};
/// <summary>
/// Builds the absolute path of the single page test document, located in the
/// Integration/Documents folder three levels above the application base directory.
/// </summary>
private static string GetFilename()
{
    const string documentName = "Single Page Simple - from google drive.pdf";

    var relativeFolder = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents");

    return Path.Combine(Path.GetFullPath(relativeFolder), documentName);
}
[Fact]
public void HasCorrectNumberOfPages()
{
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        var pageCount = pdf.NumberOfPages;

        // The sample document consists of exactly one page.
        Assert.Equal(1, pageCount);
    }
}
[Fact]
public void CanAccessPage()
{
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        // Page numbers start at 1.
        var firstPage = pdf.GetPage(1);

        Assert.NotNull(firstPage);
        Assert.Equal(1, firstPage.Number);
    }
}
[Fact]
public void AccessPageLowerThanOneThrows()
{
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        // Zero is below the first valid page number.
        Action getPageZero = () => pdf.GetPage(0);

        Assert.Throws<ArgumentOutOfRangeException>(getPageZero);
    }
}
[Fact]
public void PageHasCorrectDimensions()
{
    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        var firstPage = pdf.GetPage(1);

        // Expected width and height of the sample page.
        Assert.Equal(612, firstPage.Width);
        Assert.Equal(792, firstPage.Height);
    }
}
[Fact]
public void PageHasCorrectTextIgnoringHiddenCharacters()
{
    const string expected =
        "This is the document title There is some lede text here And then another line of text. ";

    var bytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(bytes))
    {
        var firstPage = pdf.GetPage(1);

        // Drop the letters listed in IgnoredHiddenCharacters before joining the rest.
        var visibleValues = firstPage.Letters
            .Select(letter => letter.Value)
            .Where(value => !IgnoredHiddenCharacters.Contains(value));
        var text = string.Join(string.Empty, visibleValues);

        Assert.Equal(expected, text);
    }
}
/// <summary>
/// Returns the reference letter positions held in the embedded data below
/// (presumably exported from PDFBox, going by the variable name), one entry per line.
/// </summary>
/// <returns>The parsed position data in the order it appears in the embedded text.</returns>
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
{
    // X Y Width Letter FontSize Font
    const string fromPdfBox = @"72 105 9.771912 T 21 ArialMT
81.77106 105 8.897049 h 21 ArialMT
90.66733 105 3.554138 i 21 ArialMT
94.22115 105 7.998741 s 21 ArialMT
102.2192 105 0 21 Gautami
106.6634 105 0 21 Gautami
106.6634 105 3.554131 i 21 ArialMT
110.2173 105 7.998749 s 21 ArialMT
118.2153 105 0 21 Gautami
122.6595 105 0 21 Gautami
122.6595 105 4.444618 t 21 ArialMT
127.1038 105 8.897049 h 21 ArialMT
136 105 8.897049 e 21 ArialMT
144.8963 105 0 21 Gautami
149.3405 105 0 21 Gautami
149.3405 105 8.897049 d 21 ArialMT
158.2368 105 8.897049 o 21 ArialMT
167.1331 105 7.998749 c 21 ArialMT
175.1311 105 8.897049 u 21 ArialMT
184.0274 105 13.32605 m 21 ArialMT
197.3523 105 8.897049 e 21 ArialMT
206.2485 105 8.897049 n 21 ArialMT
215.1448 105 4.444611 t 21 ArialMT
219.5891 105 0 21 Gautami
224.0333 105 0 21 Gautami
224.0333 105 4.444611 t 21 ArialMT
228.4775 105 3.554138 i 21 ArialMT
232.0313 105 4.444611 t 21 ArialMT
236.4756 105 3.554123 l 21 ArialMT
240.0294 105 8.897049 e 21 ArialMT
72 143.25 6.716187 T 14 ArialMT
78.71446 143.25 6.114899 h 14 ArialMT
84.8278 143.25 6.114891 e 14 ArialMT
90.94113 143.25 3.661423 r 14 ArialMT
94.60161 143.25 6.114899 e 14 ArialMT
100.7149 143.25 0 14 Gautami
103.7689 143.25 0 14 Gautami
103.7689 143.25 2.442749 i 14 ArialMT
106.211 143.25 5.497505 s 14 ArialMT
111.7071 143.25 0 14 Gautami
114.7611 143.25 0 14 Gautami
114.7611 143.25 5.497505 s 14 ArialMT
120.2572 143.25 6.114899 o 14 ArialMT
126.3705 143.25 9.158928 m 14 ArialMT
135.5271 143.25 6.114899 e 14 ArialMT
141.6404 143.25 0 14 Gautami
144.6944 143.25 0 14 Gautami
144.6944 143.25 2.442749 l 14 ArialMT
147.1365 143.25 6.114899 e 14 ArialMT
153.2499 143.25 6.114899 d 14 ArialMT
159.3632 143.25 6.114899 e 14 ArialMT
165.4765 143.25 0 14 Gautami
168.5305 143.25 0 14 Gautami
168.5305 143.25 3.054749 t 14 ArialMT
171.5845 143.25 6.114899 e 14 ArialMT
177.6978 143.25 5.497498 x 14 ArialMT
183.1939 143.25 3.054764 t 14 ArialMT
186.2479 143.25 0 14 Gautami
189.3019 143.25 0 14 Gautami
189.3019 143.25 6.114899 h 14 ArialMT
195.4152 143.25 6.114899 e 14 ArialMT
201.5285 143.25 3.661423 r 14 ArialMT
205.189 143.25 6.114899 e 14 ArialMT
72 173.25 7.33358 A 14 ArialMT
79.3317 173.25 6.114891 n 14 ArialMT
85.44504 173.25 6.114891 d 14 ArialMT
91.55836 173.25 0 14 Gautami
94.61235 173.25 0 14 Gautami
94.61235 173.25 3.054756 t 14 ArialMT
97.66633 173.25 6.114899 h 14 ArialMT
103.7797 173.25 6.114899 e 14 ArialMT
109.893 173.25 6.114899 n 14 ArialMT
116.0063 173.25 0 14 Gautami
119.0603 173.25 0 14 Gautami
119.0603 173.25 6.114899 a 14 ArialMT
125.1736 173.25 6.114899 n 14 ArialMT
131.287 173.25 6.114899 o 14 ArialMT
137.4003 173.25 3.054749 t 14 ArialMT
140.4543 173.25 6.114899 h 14 ArialMT
146.5676 173.25 6.114899 e 14 ArialMT
152.6809 173.25 3.661423 r 14 ArialMT
156.3414 173.25 0 14 Gautami
159.3954 173.25 0 14 Gautami
159.3954 173.25 2.442749 l 14 ArialMT
161.8375 173.25 2.442734 i 14 ArialMT
164.2796 173.25 6.114899 n 14 ArialMT
170.393 173.25 6.114899 e 14 ArialMT
176.5063 173.25 0 14 Gautami
179.5603 173.25 0 14 Gautami
179.5603 173.25 6.114899 o 14 ArialMT
185.6736 173.25 3.054764 f 14 ArialMT
188.7276 173.25 0 14 Gautami
191.7816 173.25 0 14 Gautami
191.7816 173.25 3.054764 t 14 ArialMT
194.8355 173.25 6.114899 e 14 ArialMT
200.9489 173.25 5.497482 x 14 ArialMT
206.445 173.25 3.054764 t 14 ArialMT
209.499 173.25 3.054764 . 14 ArialMT";

    // The literal contains whatever line endings this source file was saved with, so accept
    // CRLF, LF and CR rather than CRLF only (an LF checkout would otherwise yield one giant line).
    var lineEndings = new[] { "\r\n", "\n", "\r" };

    return fromPdfBox.Split(lineEndings, StringSplitOptions.RemoveEmptyEntries)
        .Select(AssertablePositionData.Parse)
        .ToList();
}
/// <summary>
/// Returns a second set of reference letter positions held in the embedded data below,
/// one entry per line. Every row carries 0 in the font size column.
/// </summary>
/// <returns>The parsed position data in the order it appears in the embedded text.</returns>
private static IReadOnlyList<AssertablePositionData> GetOtherPositionData1()
{
    // These do not include the font information
    const string fromOther = @"72 105 9.758476 T 0 ArialMT
81.77106 105 8.894608 h 0 ArialMT
90.66733 105 3.551445 i 0 ArialMT
94.22115 105 7.998749 s 0 ArialMT
102.2192 105 4.431305 0 ArialMT
102.2192 105 0 0 ArialMT
106.6634 105 3.551445 i 0 ArialMT
106.6634 105 0 0 ArialMT
110.2173 105 7.998749 s 0 ArialMT
118.2153 105 0 0 ArialMT
118.2153 105 4.431305 0 ArialMT
122.6595 105 4.431305 t 0 ArialMT
122.6595 105 0 0 ArialMT
127.1038 105 8.894608 h 0 ArialMT
136 105 8.894608 e 0 ArialMT
144.8963 105 4.431305 0 ArialMT
144.8963 105 0 0 ArialMT
149.3405 105 8.894608 d 0 ArialMT
149.3405 105 0 0 ArialMT
158.2368 105 8.894608 o 0 ArialMT
167.1331 105 7.998749 c 0 ArialMT
175.1311 105 8.894608 u 0 ArialMT
184.0274 105 13.32591 m 0 ArialMT
197.3523 105 8.894608 e 0 ArialMT
206.2485 105 8.894608 n 0 ArialMT
215.1448 105 4.431305 t 0 ArialMT
219.5891 105 4.431305 0 ArialMT
219.5891 105 0 0 ArialMT
224.0333 105 4.431305 t 0 ArialMT
224.0333 105 0 0 ArialMT
228.4775 105 3.551453 i 0 ArialMT
232.0313 105 4.431305 t 0 ArialMT
236.4756 105 3.551453 l 0 ArialMT
240.0294 105 8.894608 e 0 ArialMT
248.918 105 4.431305 0 ArialMT
72 128.25 3.045616 0 ArialMT
72 143.25 6.706947 T 0 ArialMT
78.71446 143.25 6.11322 h 0 ArialMT
84.8278 143.25 6.11322 e 0 ArialMT
90.94113 143.25 3.661331 r 0 ArialMT
94.60161 143.25 6.11322 e 0 ArialMT
100.7149 143.25 3.045616 0 ArialMT
100.7149 143.25 0 0 ArialMT
103.7689 143.25 2.440887 i 0 ArialMT
103.7689 143.25 0 0 ArialMT
106.211 143.25 5.497498 s 0 ArialMT
111.7071 143.25 3.045616 0 ArialMT
111.7071 143.25 0 0 ArialMT
114.7611 143.25 0 0 ArialMT
114.7611 143.25 5.497498 s 0 ArialMT
120.2572 143.25 6.11322 o 0 ArialMT
126.3705 143.25 9.158836 m 0 ArialMT
135.5271 143.25 6.11322 e 0 ArialMT
141.6404 143.25 0 0 ArialMT
141.6404 143.25 3.045609 0 ArialMT
144.6944 143.25 2.440887 l 0 ArialMT
144.6944 143.25 0 0 ArialMT
147.1365 143.25 6.11322 e 0 ArialMT
153.2499 143.25 6.11322 d 0 ArialMT
159.3632 143.25 6.11322 e 0 ArialMT
165.4765 143.25 0 0 ArialMT
165.4765 143.25 3.045609 0 ArialMT
168.5305 143.25 3.045609 t 0 ArialMT
168.5305 143.25 0 0 ArialMT
171.5845 143.25 6.11322 e 0 ArialMT
177.6978 143.25 5.497498 x 0 ArialMT
183.1939 143.25 3.045609 t 0 ArialMT
186.2479 143.25 0 0 ArialMT
186.2479 143.25 3.045609 0 ArialMT
189.3019 143.25 6.11322 h 0 ArialMT
189.3019 143.25 0 0 ArialMT
195.4152 143.25 6.11322 e 0 ArialMT
201.5285 143.25 3.661331 r 0 ArialMT
205.189 143.25 6.11322 e 0 ArialMT
211.3008 143.25 3.045609 0 ArialMT
72 158.25 3.045616 0 ArialMT
72 173.25 7.32267 A 0 ArialMT
79.3317 173.25 6.11322 n 0 ArialMT
85.44504 173.25 6.11322 d 0 ArialMT
91.55836 173.25 3.045616 0 ArialMT
91.55836 173.25 0 0 ArialMT
94.61235 173.25 0 0 ArialMT
94.61235 173.25 3.045616 t 0 ArialMT
97.66633 173.25 6.11322 h 0 ArialMT
103.7797 173.25 6.11322 e 0 ArialMT
109.893 173.25 6.11322 n 0 ArialMT
116.0063 173.25 0 0 ArialMT
116.0063 173.25 3.045616 0 ArialMT
119.0603 173.25 6.11322 a 0 ArialMT
119.0603 173.25 0 0 ArialMT
125.1736 173.25 6.11322 n 0 ArialMT
131.287 173.25 6.11322 o 0 ArialMT
137.4003 173.25 3.045609 t 0 ArialMT
140.4543 173.25 6.11322 h 0 ArialMT
146.5676 173.25 6.11322 e 0 ArialMT
152.6809 173.25 3.661331 r 0 ArialMT
156.3414 173.25 3.045609 0 ArialMT
156.3414 173.25 0 0 ArialMT
159.3954 173.25 2.440887 l 0 ArialMT
159.3954 173.25 0 0 ArialMT
161.8375 173.25 2.440887 i 0 ArialMT
164.2796 173.25 6.11322 n 0 ArialMT
170.393 173.25 6.11322 e 0 ArialMT
176.5063 173.25 3.045609 0 ArialMT
176.5063 173.25 0 0 ArialMT
179.5603 173.25 6.11322 o 0 ArialMT
179.5603 173.25 0 0 ArialMT
185.6736 173.25 3.045609 f 0 ArialMT
188.7276 173.25 0 0 ArialMT
188.7276 173.25 3.045609 0 ArialMT
191.7816 173.25 3.045609 t 0 ArialMT
191.7816 173.25 0 0 ArialMT
194.8355 173.25 6.11322 e 0 ArialMT
200.9489 173.25 5.497498 x 0 ArialMT
206.445 173.25 3.045609 t 0 ArialMT
209.499 173.25 3.045609 . 0 ArialMT
212.543 173.25 3.045609 0 ArialMT";

    // The literal contains whatever line endings this source file was saved with, so accept
    // CRLF, LF and CR rather than CRLF only (an LF checkout would otherwise yield one giant line).
    var lineEndings = new[] { "\r\n", "\n", "\r" };

    return fromOther.Split(lineEndings, StringSplitOptions.RemoveEmptyEntries)
        .Select(AssertablePositionData.Parse)
        .ToList();
}
}
}

View File

@@ -10,11 +10,21 @@
/// </summary>
public int Number { get; }
public MediaBox MediaBox { get; }
internal MediaBox MediaBox { get; }
internal PageContent Content { get; }
public IReadOnlyList<Letter> Text => Content?.Letters ?? new Letter[0];
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
/// <summary>
/// Gets the width of the page in points.
/// </summary>
public decimal Width { get; }
/// <summary>
/// Gets the height of the page in points.
/// </summary>
public decimal Height { get; }
internal Page(int number, MediaBox mediaBox, PageContent content)
{
@@ -26,6 +36,9 @@
Number = number;
MediaBox = mediaBox;
Content = content;
Width = mediaBox.Bounds.Width;
Height = mediaBox.Bounds.Height;
}
}
}

View File

@@ -75,7 +75,7 @@
if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary))
{
throw new InvalidOperationException("Could not find the page with number: " + pageNumber);
throw new ArgumentOutOfRangeException("Could not find the page with number: " + pageNumber);
}
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);

View File

@@ -1,941 +0,0 @@
namespace UglyToad.Pdf.Parser
{
using System;
using System.IO;
using System.Text;
using Cos;
using IO;
using Util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This class is used to contain parsing logic that will be used by both the
* PDFParser and the COSStreamParser.
*
* @author Ben Litchfield
*/
public abstract class BaseParser
{
private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L;
private static readonly long GENERATION_NUMBER_THRESHOLD = 65535;
static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString().Length;
/**
* Character codes of the lowercase ASCII letters.
*/
protected static readonly int E = 'e';
protected static readonly int N = 'n';
protected static readonly int D = 'd';
protected static readonly int S = 's';
protected static readonly int T = 't';
protected static readonly int R = 'r';
protected static readonly int A = 'a';
protected static readonly int M = 'm';
protected static readonly int O = 'o';
protected static readonly int B = 'b';
protected static readonly int J = 'j';
/**
* This is a string constant that will be used for comparisons.
*/
public static readonly string DEF = "def";
/**
* This is a string constant that will be used for comparisons.
*/
protected static readonly string ENDOBJ_string = "endobj";
/**
* This is a string constant that will be used for comparisons.
*/
protected static readonly string ENDSTREAM_string = "endstream";
/**
* This is a string constant that will be used for comparisons.
*/
protected static readonly string STREAM_string = "stream";
/**
* This is a string constant that will be used for comparisons.
*/
private static readonly string TRUE = "true";
/**
* This is a string constant that will be used for comparisons.
*/
private static readonly string FALSE = "false";
/**
* This is a string constant that will be used for comparisons.
*/
private static readonly string NULL = "null";
/**
* ASCII code for line feed.
*/
protected static readonly byte ASCII_LF = 10;
/**
* ASCII code for carriage return.
*/
protected static readonly byte ASCII_CR = 13;
private static readonly byte ASCII_ZERO = 48;
private static readonly byte ASCII_NINE = 57;
private static readonly byte ASCII_SPACE = 32;
/**
* This is the stream that will be read from.
*/
protected readonly SequentialSource seqSource;
/**
* This is the document that will be parsed.
*/
protected COSDocument document;
/// <summary>
/// Creates a parser that reads from the supplied sequential source.
/// </summary>
/// <param name="pdfSource">The source stored in <see cref="seqSource"/>.</param>
public BaseParser(SequentialSource pdfSource)
{
    seqSource = pdfSource;
}
/// <summary>
/// Determines whether a character is an ASCII hexadecimal digit (0-9, a-f or A-F).
/// </summary>
/// <param name="ch">The character to test.</param>
/// <returns>true when the character is a hexadecimal digit, otherwise false.</returns>
private static bool isHexDigit(char ch)
{
    // An explicit range check is used because char.IsDigit also accepts non-ASCII
    // decimal digits (for example Arabic-Indic digits), which are not hex digits.
    return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
}
/// <summary>
/// Consumes any run of spaces followed by a single end of line marker (LF, CRLF or a lone CR).
/// Any other byte that terminates the run is pushed back onto the source.
/// </summary>
protected void skipWhiteSpaces()
{
    // PDF Ref 3.2.7: a stream must be followed by CRLF or LF only, but some files
    // (see brother_scan_cover.pdf) add spaces first, so those are read past.
    int next;
    do
    {
        next = seqSource.read();
    }
    while (next == ASCII_SPACE);

    if (next == ASCII_LF)
    {
        return;
    }

    if (next == ASCII_CR)
    {
        next = seqSource.read();
        if (next == ASCII_LF)
        {
            return;
        }

        // A lone CR is invalid per the spec but occurs in real files; the byte
        // after it is pushed back below.
    }

    // Lenient parsing: whatever was read instead of the expected LF is returned to the source.
    seqSource.unread(next);
}
/// <summary>
/// Works around unbalanced parentheses in literal strings written by broken producers,
/// e.g. "/Title ( (5) /Creator" or "/Title (c:\) /Producer". The next three bytes are
/// inspected (and then pushed back); when they indicate that the string really ended,
/// the brace counter is reset to zero.
/// </summary>
/// <param name="bracesParameter">The number of parentheses currently open.</param>
/// <returns>Zero when the lookahead indicates the end of the string, otherwise <paramref name="bracesParameter"/>.</returns>
private int checkForEndOfstring(int bracesParameter)
{
    int braces = bracesParameter;
    var lookahead = new byte[3];
    int amountRead = seqSource.read(lookahead);

    // The following are treated as valid indicators for the end of the string:
    // 1. Next line contains another object:  CR + LF + '/'
    // 2. Dictionary ends in the next line:   CR + LF + '>'
    // 3. Next line contains another object:  CR + '/'
    // 4. Dictionary ends in the next line:   CR + '>'
    if (amountRead == 3 && lookahead[0] == ASCII_CR)
    {
        // The inner alternatives are grouped so that '>' in the third position only counts
        // after CR + LF; previously CR + <any byte> + '>' also ended the string.
        bool crLfThenDelimiter = lookahead[1] == ASCII_LF && (lookahead[2] == '/' || lookahead[2] == '>');
        bool crThenDelimiter = lookahead[1] == '/' || lookahead[1] == '>';

        if (crLfThenDelimiter || crThenDelimiter)
        {
            braces = 0;
        }
    }

    // This is a pure lookahead: everything read is returned to the source.
    if (amountRead > 0)
    {
        seqSource.unread(lookahead, 0, amountRead);
    }

    return braces;
}
/// <summary>
/// Parses a PDF string object from the source: a literal string enclosed in parentheses or,
/// when the first character is '&lt;', a hexadecimal string.
/// </summary>
/// <returns>The parsed PDF string.</returns>
/// <exception cref="IOException">
/// The next character is neither '(' nor '&lt;', or an octal escape could not be converted.
/// </exception>
protected CosString parseCOSstring()
{
    char nextChar = (char)seqSource.read();
    if (nextChar == '<')
    {
        return parseCOSHexstring();
    }

    if (nextChar != '(')
    {
        throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
                              nextChar + "' " + seqSource);
    }

    // Bytes are written straight to the memory stream. A text writer must not be used here:
    // it buffers (so ToArray() would miss unflushed data), re-encodes characters and writes
    // integers as decimal text instead of as a single byte.
    using (var output = new MemoryStream())
    {
        // The number of parentheses currently open; the opening '(' was consumed above.
        int braces = 1;
        int c = seqSource.read();
        while (braces > 0 && c != -1)
        {
            char ch = (char)c;

            // -2 means "no lookahead byte has been read yet".
            int nextc = -2;

            if (ch == ')')
            {
                braces--;
                braces = checkForEndOfstring(braces);
                if (braces != 0)
                {
                    output.WriteByte(unchecked((byte)ch));
                }
            }
            else if (ch == '(')
            {
                braces++;
                output.WriteByte(unchecked((byte)ch));
            }
            else if (ch == '\\')
            {
                char next = (char)seqSource.read();
                switch (next)
                {
                    case 'n':
                        output.WriteByte((byte)'\n');
                        break;
                    case 'r':
                        output.WriteByte((byte)'\r');
                        break;
                    case 't':
                        output.WriteByte((byte)'\t');
                        break;
                    case 'b':
                        output.WriteByte((byte)'\b');
                        break;
                    case 'f':
                        output.WriteByte((byte)'\f');
                        break;
                    case ')':
                        // PDFBox 276 /Title (c:\)
                        braces = checkForEndOfstring(braces);
                        if (braces != 0)
                        {
                            output.WriteByte(unchecked((byte)next));
                        }
                        else
                        {
                            output.WriteByte((byte)'\\');
                        }

                        break;
                    case '(':
                    case '\\':
                        output.WriteByte(unchecked((byte)next));
                        break;
                    case '\n':
                    case '\r':
                        // A backslash at the end of a line is a line continuation:
                        // ignore it together with the end of line bytes and carry on.
                        c = seqSource.read();
                        while (isEOL(c) && c != -1)
                        {
                            c = seqSource.read();
                        }

                        nextc = c;
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    {
                        // An octal escape consists of one to three octal digits.
                        var octal = new StringBuilder();
                        octal.Append(next);
                        c = seqSource.read();
                        char digit = (char)c;
                        if (digit >= '0' && digit <= '7')
                        {
                            octal.Append(digit);
                            c = seqSource.read();
                            digit = (char)c;
                            if (digit >= '0' && digit <= '7')
                            {
                                octal.Append(digit);
                            }
                            else
                            {
                                nextc = c;
                            }
                        }
                        else
                        {
                            nextc = c;
                        }

                        int character;
                        try
                        {
                            character = Convert.ToInt32(octal.ToString(), 8);
                        }
                        catch (FormatException e)
                        {
                            throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
                        }

                        // Only the low eight bits are kept; high-order overflow is ignored.
                        output.WriteByte(unchecked((byte)character));
                        break;
                    }
                    default:
                        // dropping the backslash
                        // see 7.3.4.2 Literal strings for further information
                        output.WriteByte(unchecked((byte)next));
                        break;
                }
            }
            else
            {
                output.WriteByte(unchecked((byte)ch));
            }

            // Reuse the lookahead byte when an escape sequence already read one.
            c = nextc != -2 ? nextc : seqSource.read();
        }

        if (c != -1)
        {
            seqSource.unread(c);
        }

        return new CosString(output.ToArray());
    }
}
/// <summary>
/// Parses a PDF hexadecimal string, assuming the opening '&lt;' has already been read.
/// Parsing is fail-fast: at the first disallowed character the remaining content up to
/// the closing '&gt;' is skipped so that malformed input can be detected and passed over.
/// </summary>
/// <returns>The parsed PDF string.</returns>
/// <exception cref="IOException">The end of the source is reached before the closing '&gt;'.</exception>
private CosString parseCOSHexstring()
{
    var hexDigits = new StringBuilder();

    for (;;)
    {
        int current = seqSource.read();

        if (isHexDigit((char)current))
        {
            hexDigits.Append((char)current);
            continue;
        }

        if (current == '>')
        {
            break;
        }

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        bool isIgnorable = current == ' ' || current == '\n' || current == '\t'
                           || current == '\r' || current == '\b' || current == '\f';
        if (isIgnorable)
        {
            continue;
        }

        // An invalid character was found: discard the last hex digit when it is
        // not part of a complete pair.
        if (hexDigits.Length % 2 != 0)
        {
            hexDigits.Remove(hexDigits.Length - 1, 1);
        }

        // Skip ahead to the closing bracket.
        do
        {
            current = seqSource.read();
        }
        while (current != '>' && current >= 0);

        // Malformed files may end before the closing bracket; fail instead of looping forever.
        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        break;
    }

    return CosString.ParseHex(hexDigits.ToString());
}
/// <summary>
/// Tests whether a character code terminates a PDF name.
/// </summary>
/// <param name="ch">The character code to test.</param>
/// <returns>
/// true for space, CR, LF, tab, NUL, form feed and the characters &gt; &lt; [ ] / ( );
/// otherwise false.
/// </returns>
protected bool isEndOfName(int ch)
{
    switch (ch)
    {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
        case '>':
        case '<':
        case '[':
        case '/':
        case ']':
        case ')':
        case '(':
        case '\0':
        case '\f':
            return true;
        default:
            return false;
    }
}
/// <summary>
/// Determines whether a byte sequence is valid UTF-8.
/// </summary>
/// <param name="input">The bytes to check.</param>
/// <returns>true when the bytes decode as UTF-8 without errors; false otherwise (including for null).</returns>
private bool isValidUTF8(byte[] input)
{
    if (input == null)
    {
        return false;
    }

    try
    {
        // The shared Encoding.UTF8 instance silently replaces invalid bytes with U+FFFD and
        // never throws, so a strict encoding (throwOnInvalidBytes: true) is required here.
        var strictUtf8 = new UTF8Encoding(false, true);
        strictUtf8.GetCharCount(input);
        return true;
    }
    catch (DecoderFallbackException)
    {
        return false;
    }
}
/// <summary>
/// Parses a boolean object ("true" or "false") from the source.
/// </summary>
/// <returns>The parsed boolean object.</returns>
/// <exception cref="IOException">
/// The next character is neither 't' nor 'f', or the characters read do not spell the expected keyword.
/// </exception>
protected CosBoolean parsebool()
{
    // Peek so that the whole keyword can be read in one go below.
    char c = (char)seqSource.peek();

    if (c == 't')
    {
        string truestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4));
        if (!truestring.Equals(TRUE))
        {
            throw new IOException("Error parsing bool: expected='true' actual='" + truestring
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.True;
    }

    if (c == 'f')
    {
        string falsestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5));
        if (!falsestring.Equals(FALSE))
        {
            // The message previously claimed that 'true' was expected in this branch.
            throw new IOException("Error parsing bool: expected='false' actual='" + falsestring
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.False;
    }

    throw new IOException("Error parsing bool expected='t or f' actual='" + c
                          + "' at offset " + seqSource.getPosition());
}
/// <summary>
/// Skips leading whitespace and comments, then reads characters until one that ends a
/// name (see <see cref="isEndOfName"/>) or the end of the source is reached. The
/// terminating character is pushed back onto the source.
/// </summary>
/// <returns>The characters that were read, never null.</returns>
protected string readstring()
{
    SkipSpaces();

    var text = new StringBuilder();
    int c;
    while ((c = seqSource.read()) != -1 && !isEndOfName((char)c))
    {
        text.Append((char)c);
    }

    if (c != -1)
    {
        seqSource.unread(c);
    }

    return text.ToString();
}
/// <summary>
/// Reads the expected string from the source by forwarding to
/// <see cref="readExpectedstring(string, bool)"/> with <c>false</c>.
/// </summary>
/// <param name="expectedstring">The string value that is expected.</param>
/// <exception cref="IOException">The characters read do not match the expected string.</exception>
protected void readExpectedstring(string expectedstring) => readExpectedstring(expectedstring, false);
/// <summary>
/// Reads the given pattern from <see cref="seqSource"/>, skipping whitespace and comments
/// before and after it.
/// </summary>
/// <param name="expectedstring">The pattern that must appear next in the source.</param>
/// <param name="skipSpaces">
/// NOTE(review): this flag is currently not consulted; spaces are skipped regardless of its value.
/// </param>
/// <exception cref="IOException">A character read from the source differs from the pattern.</exception>
protected void readExpectedstring(string expectedstring, bool skipSpaces)
{
    SkipSpaces();

    for (var index = 0; index < expectedstring.Length; index++)
    {
        char expected = expectedstring[index];
        if (seqSource.read() == expected)
        {
            continue;
        }

        throw new IOException("Expected string '" + expectedstring
                              + "' but missed at character '" + expected + "' at offset "
                              + seqSource.getPosition());
    }

    SkipSpaces();
}
/// <summary>
/// Reads one character from the source and verifies that it is the expected one.
/// </summary>
/// <param name="ec">The character that is expected.</param>
/// <exception cref="IOException">The character read differs from <paramref name="ec"/>.</exception>
protected void readExpectedChar(char ec)
{
    char actual = (char)seqSource.read();
    if (actual == ec)
    {
        return;
    }

    throw new IOException("expected='" + ec + "' actual='" + actual + "' at offset " + seqSource.getPosition());
}
/// <summary>
/// Skips leading whitespace and comments, then reads at most <paramref name="length"/>
/// characters, stopping early at whitespace, the end of the source or one of ] [ &lt; ( /.
/// The character that stopped the read is pushed back onto the source.
/// </summary>
/// <param name="length">The maximum number of characters to read.</param>
/// <returns>The characters that were read; between 0 and <paramref name="length"/> long.</returns>
protected string readstring(int length)
{
    SkipSpaces();
    int c = seqSource.read();

    // Strings are usually far shorter than the default builder capacity,
    // so the capacity is set to the requested maximum instead.
    var text = new StringBuilder(length);

    while (text.Length < length
           && c != -1
           && !isWhitespace(c)
           && !isClosing(c)
           && c != '['
           && c != '<'
           && c != '('
           && c != '/')
    {
        text.Append((char)c);
        c = seqSource.read();
    }

    if (c != -1)
    {
        seqSource.unread(c);
    }

    return text.ToString();
}
/// <summary>
/// Peeks at the next byte of the source and reports whether it is ']'.
/// </summary>
/// <returns>true if the next byte is ']', false otherwise.</returns>
protected bool isClosing() => isClosing(seqSource.peek());
/// <summary>
/// Reports whether a character code is ']'.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if <paramref name="c"/> is ']', false otherwise.</returns>
protected bool isClosing(int c) => c == ']';
/// <summary>
/// Reads characters up to the first end of line marker. The marker is consumed and may be
/// one byte (CR or LF) or two bytes (CR followed by LF), which matters when a caller
/// wants to unread the line.
/// </summary>
/// <returns>The characters between the current position and the end of the line.</returns>
/// <exception cref="IOException">The source is already at its end.</exception>
protected string readLine()
{
    if (seqSource.isEOF())
    {
        throw new IOException("Error: End-of-File, expected line");
    }

    var line = new StringBuilder(11);

    // CR and LF are both valid end of line markers.
    int c = seqSource.read();
    while (c != -1 && !isEOL(c))
    {
        line.Append((char)c);
        c = seqSource.read();
    }

    // CR+LF is also a valid end of line marker, so swallow the LF that follows a CR.
    if (isCR(c) && isLF(seqSource.peek()))
    {
        seqSource.read();
    }

    return line.ToString();
}
/// <summary>
/// Peeks at the next byte of the source and reports whether it is an end of line byte.
/// </summary>
/// <returns>true if the next byte is 0x0A or 0x0D.</returns>
protected bool isEOL() => isEOL(seqSource.peek());
/// <summary>
/// Reports whether a character code is an end of line byte.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if <paramref name="c"/> is a line feed or a carriage return.</returns>
protected bool isEOL(int c)
{
    if (isLF(c))
    {
        return true;
    }

    return isCR(c);
}
/// <summary>Reports whether a character code is the ASCII line feed.</summary>
private bool isLF(int c) => c == ASCII_LF;
/// <summary>Reports whether a character code is the ASCII carriage return.</summary>
private bool isCR(int c) => c == ASCII_CR;
/// <summary>
/// Peeks at the next byte of the source and reports whether it is whitespace.
/// </summary>
/// <returns>true if the next byte in the source is a whitespace character.</returns>
protected bool isWhitespace() => isWhitespace(seqSource.peek());
/// <summary>
/// Reports whether a character code is whitespace: NUL, tab, form feed, line feed,
/// carriage return or space (table 1, page 12 of ISO 32000-1:2008).
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if the character is a whitespace character.</returns>
protected bool isWhitespace(int c)
{
    switch (c)
    {
        case 0:
        case 9:
        case 12:
            return true;
        default:
            return c == ASCII_LF || c == ASCII_CR || c == ASCII_SPACE;
    }
}
/// <summary>
/// Peeks at the next byte of the source and reports whether it is a space.
/// </summary>
/// <returns>true if the next byte in the source is a space character.</returns>
protected bool isSpace() => isSpace(seqSource.peek());
/// <summary>
/// Reports whether a character code is the ASCII space.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if <paramref name="c"/> is a space character.</returns>
protected bool isSpace(int c) => c == ASCII_SPACE;
/// <summary>
/// Peeks at the next byte of the source and reports whether it is a digit.
/// </summary>
/// <returns>true if the next byte in the source is a digit.</returns>
protected bool isDigit() => isDigit(seqSource.peek());
/// <summary>
/// Reports whether a character code lies between the ASCII codes for '0' and '9' inclusive.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if <paramref name="c"/> is a digit.</returns>
protected static bool isDigit(int c)
{
    if (c < ASCII_ZERO)
    {
        return false;
    }

    return c <= ASCII_NINE;
}
/// <summary>
/// Skips all whitespace and comments at the current position. A comment starts with '%'
/// and runs to the end of the line. The first byte that is neither is pushed back.
/// </summary>
protected void SkipSpaces()
{
    // 37 is the % character, which starts a comment.
    const int commentStart = 37;

    int c = seqSource.read();
    while (true)
    {
        if (c == commentStart)
        {
            // Skip past the comment up to the end of the line (or the end of the source).
            do
            {
                c = seqSource.read();
            }
            while (c != -1 && !isEOL(c));
        }
        else if (isWhitespace(c))
        {
            c = seqSource.read();
        }
        else
        {
            break;
        }
    }

    if (c != -1)
    {
        seqSource.unread(c);
    }
}
/// <summary>
/// Reads an object number from the source using <see cref="readLong"/>.
/// </summary>
/// <returns>The object number that was read.</returns>
/// <exception cref="IOException">
/// The value is negative or not below <see cref="OBJECT_NUMBER_THRESHOLD"/> (more than 10 digits).
/// </exception>
protected long readObjectNumber()
{
    long objectNumber = readLong();

    bool isInRange = objectNumber >= 0 && objectNumber < OBJECT_NUMBER_THRESHOLD;
    if (!isInRange)
    {
        throw new IOException("Object Number '" + objectNumber + "' has more than 10 digits or is negative");
    }

    return objectNumber;
}
/// <summary>
/// Reads a generation number from the source using <see cref="readInt"/>.
/// </summary>
/// <returns>The generation number that was read.</returns>
/// <exception cref="IOException">
/// The value is negative or greater than <see cref="GENERATION_NUMBER_THRESHOLD"/>.
/// </exception>
protected int readGenerationNumber()
{
    int generation = readInt();

    bool isInRange = generation >= 0 && generation <= GENERATION_NUMBER_THRESHOLD;
    if (!isInRange)
    {
        throw new IOException("Generation Number '" + generation + "' has more than 5 digits");
    }

    return generation;
}
/// <summary>
/// Skips whitespace and comments, then reads an integer from the source.
/// </summary>
/// <returns>The integer that was read.</returns>
/// <exception cref="IOException">
/// The token is not an integer or does not fit into an <see cref="int"/>. The token is
/// pushed back onto the source before the exception is thrown.
/// </exception>
protected int readInt()
{
    SkipSpaces();
    StringBuilder intBuffer = readstringNumber();

    try
    {
        // PDF numbers are culture independent, so the invariant culture is used.
        return int.Parse(intBuffer.ToString(), global::System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // Out-of-range values raise OverflowException rather than FormatException; both
        // must restore the source and be reported as a parsing error.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString()));
        throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e);
    }
}
/// <summary>
/// Skips whitespace and comments, then reads a long from the source.
/// </summary>
/// <returns>The long that was read.</returns>
/// <exception cref="IOException">
/// The token is not an integer or does not fit into a <see cref="long"/>. The token is
/// pushed back onto the source before the exception is thrown.
/// </exception>
protected long readLong()
{
    SkipSpaces();
    StringBuilder longBuffer = readstringNumber();

    try
    {
        // PDF numbers are culture independent, so the invariant culture is used.
        return long.Parse(longBuffer.ToString(), global::System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // Out-of-range values raise OverflowException rather than FormatException; both
        // must restore the source and be reported as a parsing error.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString()));
        throw new IOException(
            $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{longBuffer}\'", e);
    }
}
/// <summary>
/// Reads the token that <see cref="readInt"/> and <see cref="readLong"/> parse as a number.
/// Reading stops at a space, LF, CR, '&lt;', '[', '(', NUL or the end of the source; the
/// terminating byte is pushed back unless it is the end of the source.
/// </summary>
/// <returns>The token to be parsed by the calling method.</returns>
/// <exception cref="IOException">The token grows longer than <see cref="MAX_LENGTH_LONG"/> characters.</exception>
protected StringBuilder readstringNumber()
{
    var token = new StringBuilder();
    int lastByte;

    while (true)
    {
        lastByte = seqSource.read();

        bool isTerminator = lastByte == ASCII_SPACE
                            || lastByte == ASCII_LF
                            || lastByte == ASCII_CR
                            || lastByte == 60 // '<', see sourceforge bug 1714707
                            || lastByte == '[' // PDFBOX-1845
                            || lastByte == '(' // PDFBOX-2579
                            || lastByte == 0 // see sourceforge bug 853328
                            || lastByte == -1;
        if (isTerminator)
        {
            break;
        }

        token.Append((char)lastByte);
        if (token.Length > MAX_LENGTH_LONG)
        {
            throw new IOException("Number '" + token +
                                  "' is getting too long, stop reading at offset " + seqSource.getPosition());
        }
    }

    if (lastByte != -1)
    {
        seqSource.unread(lastByte);
    }

    return token;
}
}
}

View File

@@ -25,10 +25,15 @@
private readonly ParsingCachingProviders cachingProviders;
[NotNull]
public Catalog Catalog { get; }
internal Catalog Catalog { get; }
[NotNull]
public Pages Pages { get; }
internal Pages Pages { get; }
/// <summary>
/// Gets the number of pages in this document, as reported by <see cref="Pages"/>.
/// </summary>
public int NumberOfPages
{
    get { return Pages.Count; }
}
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
bool isLenientParsing,
@@ -50,6 +55,16 @@
/// <summary>
/// Opens a document from the bytes of a file by delegating to <see cref="PdfDocumentFactory"/>.
/// </summary>
/// <param name="fileBytes">The bytes of the file to open.</param>
/// <param name="options">Optional parsing options; passed through unchanged (null by default).</param>
/// <returns>The opened document.</returns>
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null)
{
    return PdfDocumentFactory.Open(fileBytes, options);
}
/// <summary>
/// Opens the document with the given file name by delegating to <see cref="PdfDocumentFactory"/>.
/// </summary>
/// <param name="filename">The name of the file to open.</param>
/// <param name="options">Optional parsing options; passed through unchanged (null by default).</param>
/// <returns>The opened document.</returns>
public static PdfDocument Open(string filename, ParsingOptions options = null)
{
    return PdfDocumentFactory.Open(filename, options);
}
/// <summary>
/// Retrieves a single page of this document by its page number.
/// </summary>
/// <param name="pageNumber">The number of the page to return; numbering starts from 1.</param>
/// <returns>The page.</returns>
public Page GetPage(int pageNumber) => Pages.GetPage(pageNumber);
public void Dispose()
{
try