mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-06-28 15:30:17 +08:00
encapsulate the internals better and improve the api for pdfdocument, delete old code and tidy tests. expand readme
This commit is contained in:
parent
940c51e2fb
commit
b1d28a5af8
38
README.md
38
README.md
@ -3,4 +3,40 @@
|
|||||||
[](https://ci.appveyor.com/project/EliotJones/pdf)
|
[](https://ci.appveyor.com/project/EliotJones/pdf)
|
||||||
[](https://codecov.io/gh/UglyToad/Pdf)
|
[](https://codecov.io/gh/UglyToad/Pdf)
|
||||||
|
|
||||||
Convert the [PdfBox](https://github.com/apache/pdfbox) code to C#.
|
The aim of this project is to convert the [PdfBox](https://github.com/apache/pdfbox) code to C# in order to provide a properly open source (i.e. no copyleft) solution for inspecting PDF documents. This uses the Apache 2.0 licence.
|
||||||
|
|
||||||
|
## Status ##
|
||||||
|
|
||||||
|
There is a lot left to do for this project. The initial minimum viable product, when released as an alpha, will provide:
|
||||||
|
|
||||||
|
+ Page counts and sizes (in points) for a document.
|
||||||
|
+ Access to the text contents of each page. Note that since PDF has no concept of a "word" it will be up to the consumer of the text to work out where the words are within the text.
|
||||||
|
+ (Possible) The locations and bounds of each letter on the page.
|
||||||
|
|
||||||
|
For the initial alpha release all files will be opened rather than streamed so this will not support large files.
|
||||||
|
|
||||||
|
Eventually the library should support all existing PdfBox operations such as accessing graphical elements, form elements as well as creating PDF documents.
|
||||||
|
|
||||||
|
## Usage ##
|
||||||
|
|
||||||
|
The initial public API will be as limited as possible to allow extensive refactoring to take place. The proposed usage is as follows:
|
||||||
|
|
||||||
|
using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf"))
|
||||||
|
{
|
||||||
|
int pageCount = document.NumberOfPages;
|
||||||
|
|
||||||
|
Page page = document.GetPage(1);
|
||||||
|
|
||||||
|
decimal widthInPoints = page.Width;
|
||||||
|
decimal heightInPoints = page.Height;
|
||||||
|
|
||||||
|
string text = page.Text;
|
||||||
|
}
|
||||||
|
|
||||||
|
The ```PdfDocument``` will also support opening from byte arrays (as well as streams eventually):
|
||||||
|
|
||||||
|
byte[] fileBytes = File.ReadAllBytes(@"C:\my-file.pdf");
|
||||||
|
using (PdfDocument document = PdfDocument.Open(fileBytes))
|
||||||
|
{
|
||||||
|
int numberOfPages = document.NumberOfPages;
|
||||||
|
}
|
||||||
|
39
src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs
Normal file
39
src/UglyToad.Pdf.Tests/Integration/AssertablePositionData.cs
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
namespace UglyToad.Pdf.Tests.Integration
{
    using System;
    using System.Globalization;

    /// <summary>
    /// A single expected letter record used by the integration tests, read from one
    /// tab-separated line of reference data.
    /// </summary>
    public class AssertablePositionData
    {
        public decimal X { get; set; }

        public decimal Y { get; set; }

        public decimal Width { get; set; }

        public string Text { get; set; }

        public decimal FontSize { get; set; }

        public string FontName { get; set; }

        /// <summary>
        /// Parses one tab-separated line with exactly six columns, in the order
        /// X, Y, Width, Text, FontSize, FontName.
        /// </summary>
        /// <param name="line">The tab-separated line to parse.</param>
        /// <returns>The populated <see cref="AssertablePositionData"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
        /// <exception cref="ArgumentException">The line does not contain exactly six columns.</exception>
        /// <exception cref="FormatException">A numeric column is not a valid decimal.</exception>
        public static AssertablePositionData Parse(string line)
        {
            if (line == null)
            {
                throw new ArgumentNullException(nameof(line));
            }

            var parts = line.Split('\t');

            if (parts.Length != 6)
            {
                throw new ArgumentException($"Expected 6 parts to the line, instead got {parts.Length}");
            }

            return new AssertablePositionData
            {
                X = ParseInvariant(parts[0]),
                Y = ParseInvariant(parts[1]),
                Width = ParseInvariant(parts[2]),
                Text = parts[3],
                FontSize = ParseInvariant(parts[4]),
                FontName = parts[5]
            };
        }

        // The reference data always uses '.' as the decimal separator, so parsing must not
        // depend on the culture of the machine running the tests.
        private static decimal ParseInvariant(string value)
        {
            return decimal.Parse(value, NumberStyles.Number, CultureInfo.InvariantCulture);
        }
    }
}
|
@ -43,25 +43,6 @@
|
|||||||
|
|
||||||
public class PdfParserTests
|
public class PdfParserTests
|
||||||
{
|
{
|
||||||
[Fact]
public void CanParseSimpleGoogleDocsDocument()
{
    // To see the text as shown in Visual Studio or Notepad++, use the OtherEncodings.BytesAsLatin1String()
    var pdfBytes = File.ReadAllBytes(GetNthFilename());

    using (var pdf = PdfDocument.Open(pdfBytes))
    {
        Assert.Equal(1, pdf.Pages.Count);

        var firstPage = pdf.Pages.GetPage(1);
        Assert.Equal(1, firstPage.Number);

        // Join every letter on the page, then drop zero-width spaces.
        var letterValues = firstPage.Content.Letters.Select(letter => letter.Value);
        var joined = string.Join(string.Empty, letterValues).Replace("\u200B", string.Empty);

        // Compare with all ordinary spaces removed from both sides.
        var expectedWithoutSpaces = "This is the document title There is some lede text here And then another line of text.".Replace(" ", string.Empty);
        var actualWithoutSpaces = joined.Replace(" ", string.Empty);

        Assert.Equal(expectedWithoutSpaces, actualWithoutSpaces);
    }
}
|
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void CanDecompressNormalObjectStream()
|
public void CanDecompressNormalObjectStream()
|
||||||
{
|
{
|
||||||
|
328
src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs
Normal file
328
src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleTests.cs
Normal file
@ -0,0 +1,328 @@
|
|||||||
|
// ReSharper disable AccessToDisposedClosure
|
||||||
|
namespace UglyToad.Pdf.Tests.Integration
|
||||||
|
{
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
public class SinglePageSimpleTests
|
||||||
|
{
|
||||||
|
private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
|
||||||
|
{
|
||||||
|
"\u200B"
|
||||||
|
};
|
||||||
|
|
||||||
|
/// <summary>
/// Builds the absolute path of the single-page Google Drive test document, resolved
/// relative to the application base directory.
/// </summary>
private static string GetFilename()
{
    var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
    var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
    var absoluteFolder = Path.GetFullPath(relativeFolder);

    return Path.Combine(absoluteFolder, "Single Page Simple - from google drive.pdf");
}
|
||||||
|
|
||||||
|
[Fact]
public void HasCorrectNumberOfPages()
{
    var pdfBytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(pdfBytes))
    {
        // The fixture document is expected to contain exactly one page.
        var pageCount = pdf.NumberOfPages;

        Assert.Equal(1, pageCount);
    }
}
|
||||||
|
|
||||||
|
[Fact]
public void CanAccessPage()
{
    var pdfBytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(pdfBytes))
    {
        // Pages are requested by number, starting at 1.
        var firstPage = pdf.GetPage(1);

        Assert.NotNull(firstPage);
        Assert.Equal(1, firstPage.Number);
    }
}
|
||||||
|
|
||||||
|
[Fact]
public void AccessPageLowerThanOneThrows()
{
    var pdfBytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(pdfBytes))
    {
        // Requesting page 0 must be rejected with ArgumentOutOfRangeException.
        Action requestPageZero = () => { pdf.GetPage(0); };

        Assert.Throws<ArgumentOutOfRangeException>(requestPageZero);
    }
}
|
||||||
|
|
||||||
|
[Fact]
public void PageHasCorrectDimensions()
{
    var pdfBytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(pdfBytes))
    {
        var firstPage = pdf.GetPage(1);

        // Expected size of the fixture page: 612 wide by 792 high.
        Assert.Equal(612, firstPage.Width);
        Assert.Equal(792, firstPage.Height);
    }
}
|
||||||
|
|
||||||
|
[Fact]
public void PageHasCorrectTextIgnoringHiddenCharacters()
{
    const string expected =
        "This is the document title There is some lede text here And then another line of text. ";

    var pdfBytes = File.ReadAllBytes(GetFilename());

    using (var pdf = PdfDocument.Open(pdfBytes))
    {
        var firstPage = pdf.GetPage(1);

        // Keep every letter value except those listed in IgnoredHiddenCharacters.
        var visibleValues = firstPage.Letters
            .Select(letter => letter.Value)
            .Where(value => !IgnoredHiddenCharacters.Contains(value));

        var actual = string.Join(string.Empty, visibleValues);

        Assert.Equal(expected, actual);
    }
}
|
||||||
|
|
||||||
|
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
|
||||||
|
{
|
||||||
|
// X Y Width Letter FontSize Font
|
||||||
|
const string fromPdfBox = @"72 105 9.771912 T 21 ArialMT
|
||||||
|
81.77106 105 8.897049 h 21 ArialMT
|
||||||
|
90.66733 105 3.554138 i 21 ArialMT
|
||||||
|
94.22115 105 7.998741 s 21 ArialMT
|
||||||
|
102.2192 105 0 21 Gautami
|
||||||
|
106.6634 105 0 21 Gautami
|
||||||
|
106.6634 105 3.554131 i 21 ArialMT
|
||||||
|
110.2173 105 7.998749 s 21 ArialMT
|
||||||
|
118.2153 105 0 21 Gautami
|
||||||
|
122.6595 105 0 21 Gautami
|
||||||
|
122.6595 105 4.444618 t 21 ArialMT
|
||||||
|
127.1038 105 8.897049 h 21 ArialMT
|
||||||
|
136 105 8.897049 e 21 ArialMT
|
||||||
|
144.8963 105 0 21 Gautami
|
||||||
|
149.3405 105 0 21 Gautami
|
||||||
|
149.3405 105 8.897049 d 21 ArialMT
|
||||||
|
158.2368 105 8.897049 o 21 ArialMT
|
||||||
|
167.1331 105 7.998749 c 21 ArialMT
|
||||||
|
175.1311 105 8.897049 u 21 ArialMT
|
||||||
|
184.0274 105 13.32605 m 21 ArialMT
|
||||||
|
197.3523 105 8.897049 e 21 ArialMT
|
||||||
|
206.2485 105 8.897049 n 21 ArialMT
|
||||||
|
215.1448 105 4.444611 t 21 ArialMT
|
||||||
|
219.5891 105 0 21 Gautami
|
||||||
|
224.0333 105 0 21 Gautami
|
||||||
|
224.0333 105 4.444611 t 21 ArialMT
|
||||||
|
228.4775 105 3.554138 i 21 ArialMT
|
||||||
|
232.0313 105 4.444611 t 21 ArialMT
|
||||||
|
236.4756 105 3.554123 l 21 ArialMT
|
||||||
|
240.0294 105 8.897049 e 21 ArialMT
|
||||||
|
72 143.25 6.716187 T 14 ArialMT
|
||||||
|
78.71446 143.25 6.114899 h 14 ArialMT
|
||||||
|
84.8278 143.25 6.114891 e 14 ArialMT
|
||||||
|
90.94113 143.25 3.661423 r 14 ArialMT
|
||||||
|
94.60161 143.25 6.114899 e 14 ArialMT
|
||||||
|
100.7149 143.25 0 14 Gautami
|
||||||
|
103.7689 143.25 0 14 Gautami
|
||||||
|
103.7689 143.25 2.442749 i 14 ArialMT
|
||||||
|
106.211 143.25 5.497505 s 14 ArialMT
|
||||||
|
111.7071 143.25 0 14 Gautami
|
||||||
|
114.7611 143.25 0 14 Gautami
|
||||||
|
114.7611 143.25 5.497505 s 14 ArialMT
|
||||||
|
120.2572 143.25 6.114899 o 14 ArialMT
|
||||||
|
126.3705 143.25 9.158928 m 14 ArialMT
|
||||||
|
135.5271 143.25 6.114899 e 14 ArialMT
|
||||||
|
141.6404 143.25 0 14 Gautami
|
||||||
|
144.6944 143.25 0 14 Gautami
|
||||||
|
144.6944 143.25 2.442749 l 14 ArialMT
|
||||||
|
147.1365 143.25 6.114899 e 14 ArialMT
|
||||||
|
153.2499 143.25 6.114899 d 14 ArialMT
|
||||||
|
159.3632 143.25 6.114899 e 14 ArialMT
|
||||||
|
165.4765 143.25 0 14 Gautami
|
||||||
|
168.5305 143.25 0 14 Gautami
|
||||||
|
168.5305 143.25 3.054749 t 14 ArialMT
|
||||||
|
171.5845 143.25 6.114899 e 14 ArialMT
|
||||||
|
177.6978 143.25 5.497498 x 14 ArialMT
|
||||||
|
183.1939 143.25 3.054764 t 14 ArialMT
|
||||||
|
186.2479 143.25 0 14 Gautami
|
||||||
|
189.3019 143.25 0 14 Gautami
|
||||||
|
189.3019 143.25 6.114899 h 14 ArialMT
|
||||||
|
195.4152 143.25 6.114899 e 14 ArialMT
|
||||||
|
201.5285 143.25 3.661423 r 14 ArialMT
|
||||||
|
205.189 143.25 6.114899 e 14 ArialMT
|
||||||
|
72 173.25 7.33358 A 14 ArialMT
|
||||||
|
79.3317 173.25 6.114891 n 14 ArialMT
|
||||||
|
85.44504 173.25 6.114891 d 14 ArialMT
|
||||||
|
91.55836 173.25 0 14 Gautami
|
||||||
|
94.61235 173.25 0 14 Gautami
|
||||||
|
94.61235 173.25 3.054756 t 14 ArialMT
|
||||||
|
97.66633 173.25 6.114899 h 14 ArialMT
|
||||||
|
103.7797 173.25 6.114899 e 14 ArialMT
|
||||||
|
109.893 173.25 6.114899 n 14 ArialMT
|
||||||
|
116.0063 173.25 0 14 Gautami
|
||||||
|
119.0603 173.25 0 14 Gautami
|
||||||
|
119.0603 173.25 6.114899 a 14 ArialMT
|
||||||
|
125.1736 173.25 6.114899 n 14 ArialMT
|
||||||
|
131.287 173.25 6.114899 o 14 ArialMT
|
||||||
|
137.4003 173.25 3.054749 t 14 ArialMT
|
||||||
|
140.4543 173.25 6.114899 h 14 ArialMT
|
||||||
|
146.5676 173.25 6.114899 e 14 ArialMT
|
||||||
|
152.6809 173.25 3.661423 r 14 ArialMT
|
||||||
|
156.3414 173.25 0 14 Gautami
|
||||||
|
159.3954 173.25 0 14 Gautami
|
||||||
|
159.3954 173.25 2.442749 l 14 ArialMT
|
||||||
|
161.8375 173.25 2.442734 i 14 ArialMT
|
||||||
|
164.2796 173.25 6.114899 n 14 ArialMT
|
||||||
|
170.393 173.25 6.114899 e 14 ArialMT
|
||||||
|
176.5063 173.25 0 14 Gautami
|
||||||
|
179.5603 173.25 0 14 Gautami
|
||||||
|
179.5603 173.25 6.114899 o 14 ArialMT
|
||||||
|
185.6736 173.25 3.054764 f 14 ArialMT
|
||||||
|
188.7276 173.25 0 14 Gautami
|
||||||
|
191.7816 173.25 0 14 Gautami
|
||||||
|
191.7816 173.25 3.054764 t 14 ArialMT
|
||||||
|
194.8355 173.25 6.114899 e 14 ArialMT
|
||||||
|
200.9489 173.25 5.497482 x 14 ArialMT
|
||||||
|
206.445 173.25 3.054764 t 14 ArialMT
|
||||||
|
209.499 173.25 3.054764 . 14 ArialMT";
|
||||||
|
|
||||||
|
// The literal's line breaks follow the source file's own line endings, which depend on
// how the file was checked out, so accept both CRLF and LF when splitting into rows.
return fromPdfBox.Split(new[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries)
    .Select(AssertablePositionData.Parse)
    .ToList();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static IReadOnlyList<AssertablePositionData> GetOtherPositionData1()
|
||||||
|
{
|
||||||
|
// These do not include the font information
|
||||||
|
const string fromOther = @"72 105 9.758476 T 0 ArialMT
|
||||||
|
81.77106 105 8.894608 h 0 ArialMT
|
||||||
|
90.66733 105 3.551445 i 0 ArialMT
|
||||||
|
94.22115 105 7.998749 s 0 ArialMT
|
||||||
|
102.2192 105 4.431305 0 ArialMT
|
||||||
|
102.2192 105 0 0 ArialMT
|
||||||
|
106.6634 105 3.551445 i 0 ArialMT
|
||||||
|
106.6634 105 0 0 ArialMT
|
||||||
|
110.2173 105 7.998749 s 0 ArialMT
|
||||||
|
118.2153 105 0 0 ArialMT
|
||||||
|
118.2153 105 4.431305 0 ArialMT
|
||||||
|
122.6595 105 4.431305 t 0 ArialMT
|
||||||
|
122.6595 105 0 0 ArialMT
|
||||||
|
127.1038 105 8.894608 h 0 ArialMT
|
||||||
|
136 105 8.894608 e 0 ArialMT
|
||||||
|
144.8963 105 4.431305 0 ArialMT
|
||||||
|
144.8963 105 0 0 ArialMT
|
||||||
|
149.3405 105 8.894608 d 0 ArialMT
|
||||||
|
149.3405 105 0 0 ArialMT
|
||||||
|
158.2368 105 8.894608 o 0 ArialMT
|
||||||
|
167.1331 105 7.998749 c 0 ArialMT
|
||||||
|
175.1311 105 8.894608 u 0 ArialMT
|
||||||
|
184.0274 105 13.32591 m 0 ArialMT
|
||||||
|
197.3523 105 8.894608 e 0 ArialMT
|
||||||
|
206.2485 105 8.894608 n 0 ArialMT
|
||||||
|
215.1448 105 4.431305 t 0 ArialMT
|
||||||
|
219.5891 105 4.431305 0 ArialMT
|
||||||
|
219.5891 105 0 0 ArialMT
|
||||||
|
224.0333 105 4.431305 t 0 ArialMT
|
||||||
|
224.0333 105 0 0 ArialMT
|
||||||
|
228.4775 105 3.551453 i 0 ArialMT
|
||||||
|
232.0313 105 4.431305 t 0 ArialMT
|
||||||
|
236.4756 105 3.551453 l 0 ArialMT
|
||||||
|
240.0294 105 8.894608 e 0 ArialMT
|
||||||
|
248.918 105 4.431305 0 ArialMT
|
||||||
|
72 128.25 3.045616 0 ArialMT
|
||||||
|
72 143.25 6.706947 T 0 ArialMT
|
||||||
|
78.71446 143.25 6.11322 h 0 ArialMT
|
||||||
|
84.8278 143.25 6.11322 e 0 ArialMT
|
||||||
|
90.94113 143.25 3.661331 r 0 ArialMT
|
||||||
|
94.60161 143.25 6.11322 e 0 ArialMT
|
||||||
|
100.7149 143.25 3.045616 0 ArialMT
|
||||||
|
100.7149 143.25 0 0 ArialMT
|
||||||
|
103.7689 143.25 2.440887 i 0 ArialMT
|
||||||
|
103.7689 143.25 0 0 ArialMT
|
||||||
|
106.211 143.25 5.497498 s 0 ArialMT
|
||||||
|
111.7071 143.25 3.045616 0 ArialMT
|
||||||
|
111.7071 143.25 0 0 ArialMT
|
||||||
|
114.7611 143.25 0 0 ArialMT
|
||||||
|
114.7611 143.25 5.497498 s 0 ArialMT
|
||||||
|
120.2572 143.25 6.11322 o 0 ArialMT
|
||||||
|
126.3705 143.25 9.158836 m 0 ArialMT
|
||||||
|
135.5271 143.25 6.11322 e 0 ArialMT
|
||||||
|
141.6404 143.25 0 0 ArialMT
|
||||||
|
141.6404 143.25 3.045609 0 ArialMT
|
||||||
|
144.6944 143.25 2.440887 l 0 ArialMT
|
||||||
|
144.6944 143.25 0 0 ArialMT
|
||||||
|
147.1365 143.25 6.11322 e 0 ArialMT
|
||||||
|
153.2499 143.25 6.11322 d 0 ArialMT
|
||||||
|
159.3632 143.25 6.11322 e 0 ArialMT
|
||||||
|
165.4765 143.25 0 0 ArialMT
|
||||||
|
165.4765 143.25 3.045609 0 ArialMT
|
||||||
|
168.5305 143.25 3.045609 t 0 ArialMT
|
||||||
|
168.5305 143.25 0 0 ArialMT
|
||||||
|
171.5845 143.25 6.11322 e 0 ArialMT
|
||||||
|
177.6978 143.25 5.497498 x 0 ArialMT
|
||||||
|
183.1939 143.25 3.045609 t 0 ArialMT
|
||||||
|
186.2479 143.25 0 0 ArialMT
|
||||||
|
186.2479 143.25 3.045609 0 ArialMT
|
||||||
|
189.3019 143.25 6.11322 h 0 ArialMT
|
||||||
|
189.3019 143.25 0 0 ArialMT
|
||||||
|
195.4152 143.25 6.11322 e 0 ArialMT
|
||||||
|
201.5285 143.25 3.661331 r 0 ArialMT
|
||||||
|
205.189 143.25 6.11322 e 0 ArialMT
|
||||||
|
211.3008 143.25 3.045609 0 ArialMT
|
||||||
|
72 158.25 3.045616 0 ArialMT
|
||||||
|
72 173.25 7.32267 A 0 ArialMT
|
||||||
|
79.3317 173.25 6.11322 n 0 ArialMT
|
||||||
|
85.44504 173.25 6.11322 d 0 ArialMT
|
||||||
|
91.55836 173.25 3.045616 0 ArialMT
|
||||||
|
91.55836 173.25 0 0 ArialMT
|
||||||
|
94.61235 173.25 0 0 ArialMT
|
||||||
|
94.61235 173.25 3.045616 t 0 ArialMT
|
||||||
|
97.66633 173.25 6.11322 h 0 ArialMT
|
||||||
|
103.7797 173.25 6.11322 e 0 ArialMT
|
||||||
|
109.893 173.25 6.11322 n 0 ArialMT
|
||||||
|
116.0063 173.25 0 0 ArialMT
|
||||||
|
116.0063 173.25 3.045616 0 ArialMT
|
||||||
|
119.0603 173.25 6.11322 a 0 ArialMT
|
||||||
|
119.0603 173.25 0 0 ArialMT
|
||||||
|
125.1736 173.25 6.11322 n 0 ArialMT
|
||||||
|
131.287 173.25 6.11322 o 0 ArialMT
|
||||||
|
137.4003 173.25 3.045609 t 0 ArialMT
|
||||||
|
140.4543 173.25 6.11322 h 0 ArialMT
|
||||||
|
146.5676 173.25 6.11322 e 0 ArialMT
|
||||||
|
152.6809 173.25 3.661331 r 0 ArialMT
|
||||||
|
156.3414 173.25 3.045609 0 ArialMT
|
||||||
|
156.3414 173.25 0 0 ArialMT
|
||||||
|
159.3954 173.25 2.440887 l 0 ArialMT
|
||||||
|
159.3954 173.25 0 0 ArialMT
|
||||||
|
161.8375 173.25 2.440887 i 0 ArialMT
|
||||||
|
164.2796 173.25 6.11322 n 0 ArialMT
|
||||||
|
170.393 173.25 6.11322 e 0 ArialMT
|
||||||
|
176.5063 173.25 3.045609 0 ArialMT
|
||||||
|
176.5063 173.25 0 0 ArialMT
|
||||||
|
179.5603 173.25 6.11322 o 0 ArialMT
|
||||||
|
179.5603 173.25 0 0 ArialMT
|
||||||
|
185.6736 173.25 3.045609 f 0 ArialMT
|
||||||
|
188.7276 173.25 0 0 ArialMT
|
||||||
|
188.7276 173.25 3.045609 0 ArialMT
|
||||||
|
191.7816 173.25 3.045609 t 0 ArialMT
|
||||||
|
191.7816 173.25 0 0 ArialMT
|
||||||
|
194.8355 173.25 6.11322 e 0 ArialMT
|
||||||
|
200.9489 173.25 5.497498 x 0 ArialMT
|
||||||
|
206.445 173.25 3.045609 t 0 ArialMT
|
||||||
|
209.499 173.25 3.045609 . 0 ArialMT
|
||||||
|
212.543 173.25 3.045609 0 ArialMT";
|
||||||
|
|
||||||
|
// The literal's line breaks follow the source file's own line endings, which depend on
// how the file was checked out, so accept both CRLF and LF when splitting into rows.
return fromOther.Split(new[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries)
    .Select(AssertablePositionData.Parse)
    .ToList();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -10,11 +10,21 @@
|
|||||||
/// </summary>
|
/// </summary>
|
||||||
public int Number { get; }
|
public int Number { get; }
|
||||||
|
|
||||||
public MediaBox MediaBox { get; }
|
internal MediaBox MediaBox { get; }
|
||||||
|
|
||||||
internal PageContent Content { get; }
|
internal PageContent Content { get; }
|
||||||
|
|
||||||
public IReadOnlyList<Letter> Text => Content?.Letters ?? new Letter[0];
|
public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0];
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Gets the width of the page in points.
|
||||||
|
/// </summary>
|
||||||
|
public decimal Width { get; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Gets the height of the page in points.
|
||||||
|
/// </summary>
|
||||||
|
public decimal Height { get; }
|
||||||
|
|
||||||
internal Page(int number, MediaBox mediaBox, PageContent content)
|
internal Page(int number, MediaBox mediaBox, PageContent content)
|
||||||
{
|
{
|
||||||
@ -26,6 +36,9 @@
|
|||||||
Number = number;
|
Number = number;
|
||||||
MediaBox = mediaBox;
|
MediaBox = mediaBox;
|
||||||
Content = content;
|
Content = content;
|
||||||
|
|
||||||
|
Width = mediaBox.Bounds.Width;
|
||||||
|
Height = mediaBox.Bounds.Height;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -75,7 +75,7 @@
|
|||||||
|
|
||||||
if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary))
|
if (!isFound || !locatedPages.TryGetValue(pageNumber, out targetPageDictionary))
|
||||||
{
|
{
|
||||||
throw new InvalidOperationException("Could not find the page with number: " + pageNumber);
|
// The single-string constructor treats its argument as the parameter name, so pass the
// parameter name, the offending value and the message separately.
throw new ArgumentOutOfRangeException(nameof(pageNumber), pageNumber, "Could not find the page with number: " + pageNumber);
|
||||||
}
|
}
|
||||||
|
|
||||||
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);
|
var page = pageFactory.Create(pageNumber, targetPageDictionary, new PageTreeMembers(), reader, isLenientParsing);
|
||||||
|
@ -1,941 +0,0 @@
|
|||||||
namespace UglyToad.Pdf.Parser
|
|
||||||
{
|
|
||||||
using System;
|
|
||||||
using System.IO;
|
|
||||||
using System.Text;
|
|
||||||
using Cos;
|
|
||||||
using IO;
|
|
||||||
using Util;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This class is used to contain parsing logic that will be used by both the
|
|
||||||
* PDFParser and the COSStreamParser.
|
|
||||||
*
|
|
||||||
* @author Ben Litchfield
|
|
||||||
*/
|
|
||||||
public abstract class BaseParser
|
|
||||||
{
|
|
||||||
private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L;
|
|
||||||
|
|
||||||
private static readonly long GENERATION_NUMBER_THRESHOLD = 65535;
|
|
||||||
|
|
||||||
static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString().Length;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Log instance.
|
|
||||||
*/
|
|
||||||
protected static readonly int E = 'e';
|
|
||||||
protected static readonly int N = 'n';
|
|
||||||
protected static readonly int D = 'd';
|
|
||||||
|
|
||||||
protected static readonly int S = 's';
|
|
||||||
protected static readonly int T = 't';
|
|
||||||
protected static readonly int R = 'r';
|
|
||||||
protected static readonly int A = 'a';
|
|
||||||
protected static readonly int M = 'm';
|
|
||||||
|
|
||||||
protected static readonly int O = 'o';
|
|
||||||
protected static readonly int B = 'b';
|
|
||||||
protected static readonly int J = 'j';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is a string constant that will be used for comparisons.
|
|
||||||
*/
|
|
||||||
public static readonly string DEF = "def";
|
|
||||||
/**
|
|
||||||
* This is a string constant that will be used for comparisons.
|
|
||||||
*/
|
|
||||||
protected static readonly string ENDOBJ_string = "endobj";
|
|
||||||
/**
|
|
||||||
* This is a string constant that will be used for comparisons.
|
|
||||||
*/
|
|
||||||
protected static readonly string ENDSTREAM_string = "endstream";
|
|
||||||
/**
|
|
||||||
* This is a string constant that will be used for comparisons.
|
|
||||||
*/
|
|
||||||
protected static readonly string STREAM_string = "stream";
|
|
||||||
/**
|
|
||||||
* This is a string constant that will be used for comparisons.
|
|
||||||
*/
|
|
||||||
private static readonly string TRUE = "true";
|
|
||||||
/**
|
|
||||||
* This is a string constant that will be used for comparisons.
|
|
||||||
*/
|
|
||||||
private static readonly string FALSE = "false";
|
|
||||||
/**
|
|
||||||
* This is a string constant that will be used for comparisons.
|
|
||||||
*/
|
|
||||||
private static readonly string NULL = "null";
|
|
||||||
|
|
||||||
/**
|
|
||||||
* ASCII code for line feed.
|
|
||||||
*/
|
|
||||||
protected static readonly byte ASCII_LF = 10;
|
|
||||||
/**
|
|
||||||
* ASCII code for carriage return.
|
|
||||||
*/
|
|
||||||
protected static readonly byte ASCII_CR = 13;
|
|
||||||
private static readonly byte ASCII_ZERO = 48;
|
|
||||||
private static readonly byte ASCII_NINE = 57;
|
|
||||||
private static readonly byte ASCII_SPACE = 32;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is the stream that will be read from.
|
|
||||||
*/
|
|
||||||
protected readonly SequentialSource seqSource;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is the document that will be parsed.
|
|
||||||
*/
|
|
||||||
protected COSDocument document;
|
|
||||||
|
|
||||||
/**
 * Creates a parser over the given source.
 *
 * @param pdfSource the source stored in seqSource, from which all reads are made.
 */
public BaseParser(SequentialSource pdfSource)
{
    seqSource = pdfSource;
}
|
|
||||||
|
|
||||||
/**
 * Tells whether the character is an ASCII hexadecimal digit (0-9, a-f, A-F).
 *
 * char.IsDigit is deliberately not used: it also accepts non-ASCII Unicode decimal
 * digits, which are not hexadecimal digits.
 *
 * @param ch the character to test.
 *
 * @return true if the character is 0-9, a-f or A-F.
 */
private static bool isHexDigit(char ch)
{
    return (ch >= '0' && ch <= '9')
        || (ch >= 'a' && ch <= 'f')
        || (ch >= 'A' && ch <= 'F');
}
|
|
||||||
|
|
||||||
/**
 * Consumes any run of spaces followed by one end-of-line marker (CR LF, LF or a lone CR).
 * Whatever other value was read last is pushed back onto the source.
 */
protected void skipWhiteSpaces()
{
    // PDF Ref 3.2.7: a stream must be followed by either a CRLF or an LF and nothing else.
    // Some files (see brother_scan_cover.pdf) put spaces after the stream keyword but
    // before the data, so those are swallowed first.
    int current;
    do
    {
        current = seqSource.read();
    }
    while (current == ASCII_SPACE);

    if (current == ASCII_LF)
    {
        return;
    }

    if (current == ASCII_CR)
    {
        current = seqSource.read();
        if (current == ASCII_LF)
        {
            return;
        }

        // A lone CR is invalid per the spec but occurs in real files, so it is tolerated
        // and the value read after it is pushed back below.
    }

    // Reaching here without a CR means the input is in error; parsing stays lenient and
    // the value is simply returned to the source.
    seqSource.unread(current);
}
|
|
||||||
|
|
||||||
/**
 * Looks ahead at the next three bytes to decide whether a literal string has really ended
 * even though its parentheses are unbalanced. The bytes are always pushed back afterwards.
 *
 * This works around a bug in document creators which crashed PDFBox. The first form seen
 * was: /Title ( (5) /Creator, where the opening and closing parentheses do not balance.
 * The second form was: /Title (c:\) /Producer.
 *
 * The string is treated as finished when the look-ahead is one of:
 *   1. CR + LF + '/'  (next line contains another COSObject)
 *   2. CR + LF + '>'  (CosDictionary ends in the next line)
 *   3. CR + '/'       (next line contains another COSObject)
 *   4. CR + '>'       (CosDictionary ends in the next line)
 *
 * @param bracesParameter the number of braces currently open.
 *
 * @return 0 if one of the four patterns matched, otherwise bracesParameter unchanged.
 */
private int checkForEndOfstring(int bracesParameter)
{
    var lookahead = new byte[3];
    int amountRead = seqSource.read(lookahead);

    int braces = bracesParameter;

    // The patterns are only checked when all three bytes were available.
    if (amountRead == 3 && lookahead[0] == ASCII_CR)
    {
        // The '/' or '>' in the third byte only counts when the second byte is LF.
        bool crLfThenDelimiter = lookahead[1] == ASCII_LF
                                 && (lookahead[2] == '/' || lookahead[2] == '>');
        bool crThenDelimiter = lookahead[1] == '/' || lookahead[1] == '>';

        if (crLfThenDelimiter || crThenDelimiter)
        {
            braces = 0;
        }
    }

    if (amountRead > 0)
    {
        seqSource.unread(lookahead, 0, amountRead);
    }

    return braces;
}
|
|
||||||
|
|
||||||
/**
 * This will parse a PDF string. A leading '<' delegates to parseCOSHexstring; a leading
 * '(' starts a literal string, which is read up to its balancing ')' with backslash
 * escapes decoded. The value that ended the loop is pushed back onto the source.
 *
 * The decoded content is collected as raw bytes, one byte per value read, and handed to
 * CosString without any text encoding step.
 *
 * @return The parsed PDF string.
 *
 * @throws IOException If the first character is neither '(' nor '<'.
 */
protected CosString parseCOSstring()
{
    char nextChar = (char)seqSource.read();
    if (nextChar == '<')
    {
        return parseCOSHexstring();
    }

    if (nextChar != '(')
    {
        throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
                              nextChar + "' " + seqSource);
    }

    using (var output = new MemoryStream())
    {
        // This is the number of braces read
        int braces = 1;
        int c = seqSource.read();
        while (braces > 0 && c != -1)
        {
            int nextc = -2; // not yet read

            if (c == ')')
            {
                braces--;
                braces = checkForEndOfstring(braces);
                if (braces != 0)
                {
                    output.WriteByte((byte)')');
                }
            }
            else if (c == '(')
            {
                braces++;
                output.WriteByte((byte)'(');
            }
            else if (c == '\\')
            {
                int escaped = seqSource.read();
                switch (escaped)
                {
                    case -1:
                        // Input ended straight after the backslash: nothing left to decode.
                        nextc = -1;
                        break;
                    case 'n':
                        output.WriteByte((byte)'\n');
                        break;
                    case 'r':
                        output.WriteByte((byte)'\r');
                        break;
                    case 't':
                        output.WriteByte((byte)'\t');
                        break;
                    case 'b':
                        output.WriteByte((byte)'\b');
                        break;
                    case 'f':
                        output.WriteByte((byte)'\f');
                        break;
                    case ')':
                        // PDFBox 276 /Title (c:\)
                        braces = checkForEndOfstring(braces);
                        if (braces != 0)
                        {
                            output.WriteByte((byte)')');
                        }
                        else
                        {
                            output.WriteByte((byte)'\\');
                        }
                        break;
                    case '(':
                    case '\\':
                        output.WriteByte(unchecked((byte)escaped));
                        break;
                    case '\n':
                    case '\r':
                        // This is a break in the line so ignore it and the newline and continue
                        c = seqSource.read();
                        while (isEOL(c) && c != -1)
                        {
                            c = seqSource.read();
                        }
                        nextc = c;
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    {
                        // Up to three octal digits form one byte value.
                        int value = escaped - '0';
                        c = seqSource.read();
                        if (c >= '0' && c <= '7')
                        {
                            value = (value * 8) + (c - '0');
                            c = seqSource.read();
                            if (c >= '0' && c <= '7')
                            {
                                value = (value * 8) + (c - '0');
                            }
                            else
                            {
                                nextc = c;
                            }
                        }
                        else
                        {
                            nextc = c;
                        }

                        // Three digits can exceed 255; only the low-order byte is kept.
                        output.WriteByte(unchecked((byte)value));
                        break;
                    }
                    default:
                        // dropping the backslash
                        // see 7.3.4.2 Literal strings for further information
                        output.WriteByte(unchecked((byte)escaped));
                        break;
                }
            }
            else
            {
                output.WriteByte(unchecked((byte)c));
            }

            if (nextc != -2)
            {
                c = nextc;
            }
            else
            {
                c = seqSource.read();
            }
        }

        if (c != -1)
        {
            seqSource.unread(c);
        }

        return new CosString(output.ToArray());
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Parses a PDF hexadecimal string with fail-fast semantics: collection of digits stops at the
/// first character that is neither a hex digit, the closing bracket, nor ignorable whitespace,
/// so that malformed input can be detected and the caller can skip to the next object.
/// The opening '&lt;' is assumed to have been read already.
/// </summary>
/// <returns>The string produced by <c>CosString.ParseHex</c> from the collected hex digits.</returns>
/// <exception cref="IOException">The input ended before the closing '&gt;' was found.</exception>
private CosString parseCOSHexstring()
{
    var hexDigits = new StringBuilder();

    while (true)
    {
        int current = seqSource.read();

        if (isHexDigit((char)current))
        {
            hexDigits.Append((char)current);
            continue;
        }

        if (current == '>')
        {
            // Regular end of the hex string.
            break;
        }

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        bool ignorable = current == ' ' || current == '\n' || current == '\t'
                         || current == '\r' || current == '\b' || current == '\f';
        if (ignorable)
        {
            continue;
        }

        // An invalid character was found: drop a dangling half of a hex pair, if any.
        if (hexDigits.Length % 2 != 0)
        {
            hexDigits.Length -= 1;
        }

        // Consume everything up to and including the closing bracket.
        current = seqSource.read();
        while (current != '>' && current >= 0)
        {
            current = seqSource.read();
        }

        // Malformed files may end before the bracket; report it rather than looping forever.
        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        break;
    }

    return CosString.ParseHex(hexDigits.ToString());
}
|
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
/// Determines whether a character terminates a PDF name.
/// </summary>
/// <param name="ch">The character to test.</param>
/// <returns>
/// <c>true</c> for space, CR, LF, tab (9), form feed, NUL, or one of the delimiters
/// '&gt;', '&lt;', '[', ']', '/', '(' and ')'; otherwise <c>false</c>.
/// </returns>
protected bool isEndOfName(int ch)
{
    // Whitespace-style terminators.
    if (ch == ASCII_SPACE || ch == ASCII_CR || ch == ASCII_LF || ch == 9 || ch == '\f' || ch == 0)
    {
        return true;
    }

    // Delimiter characters that start or end another token.
    switch (ch)
    {
        case '>':
        case '<':
        case '[':
        case ']':
        case '/':
        case '(':
        case ')':
            return true;
        default:
            return false;
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Returns true if a byte sequence is valid UTF-8.
/// </summary>
/// <param name="input">The bytes to validate. A null array is reported as invalid.</param>
/// <returns>
/// <c>true</c> if every byte belongs to a well-formed UTF-8 sequence (an empty array is valid);
/// otherwise <c>false</c>.
/// </returns>
private bool isValidUTF8(byte[] input)
{
    if (input == null)
    {
        return false;
    }

    try
    {
        // Encoding.UTF8 silently substitutes U+FFFD for malformed bytes, so it can never
        // signal invalid input. An encoding created with throwOnInvalidBytes = true raises
        // DecoderFallbackException for malformed or truncated sequences instead.
        var strictUtf8 = new UTF8Encoding(false, true);
        strictUtf8.GetCharCount(input, 0, input.Length);
        return true;
    }
    catch (DecoderFallbackException)
    {
        return false;
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
/// Parses a boolean object from the stream. The next character is peeked to decide between
/// the two keywords, then 4 bytes are read for <c>true</c> or 5 bytes for <c>false</c>.
/// </summary>
/// <returns>The parsed boolean object.</returns>
/// <exception cref="IOException">
/// The next character is neither 't' nor 'f', or the bytes read do not spell the expected keyword.
/// </exception>
protected CosBoolean parsebool()
{
    char c = (char)seqSource.peek();

    if (c == 't')
    {
        string truestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4));
        if (!truestring.Equals(TRUE))
        {
            throw new IOException("Error parsing bool: expected='true' actual='" + truestring
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.True;
    }

    if (c == 'f')
    {
        string falsestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5));
        if (!falsestring.Equals(FALSE))
        {
            // The message previously claimed expected='true' here, which misreported the failure.
            throw new IOException("Error parsing bool: expected='false' actual='" + falsestring
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.False;
    }

    throw new IOException("Error parsing bool expected='t or f' actual='" + c
                          + "' at offset " + seqSource.getPosition());
}
|
|
||||||
|
|
||||||
/// <summary>
/// Skips leading spaces and comments, then reads characters until one that terminates a PDF
/// name (see <c>isEndOfName</c>) or the end of the input is reached. The terminating character
/// is pushed back so it can be read again.
/// </summary>
/// <returns>The characters that were read; never null, possibly empty.</returns>
protected string readstring()
{
    SkipSpaces();
    var token = new StringBuilder();

    for (int current = seqSource.read(); ; current = seqSource.read())
    {
        if (current == -1)
        {
            // End of input: nothing to push back.
            return token.ToString();
        }

        if (isEndOfName((char)current))
        {
            seqSource.unread(current);
            return token.ToString();
        }

        token.Append((char)current);
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads one string and throws if it is not the expected value. Delegates to the two-argument
/// overload, passing <c>false</c> for <c>skipSpaces</c>.
/// </summary>
/// <param name="expectedstring">The string value that is expected.</param>
/// <exception cref="IOException">The characters read do not match the expected string.</exception>
protected void readExpectedstring(string expectedstring) => readExpectedstring(expectedstring, false);
|
|
||||||
|
|
||||||
/// <summary>
/// Reads the given pattern from <c>seqSource</c>, character by character, skipping spaces and
/// comments before and after it.
/// </summary>
/// <param name="expectedstring">The pattern that must appear next in the input.</param>
/// <param name="skipSpaces">
/// Not consulted by this implementation: surrounding spaces are always skipped,
/// whatever value is passed.
/// </param>
/// <exception cref="IOException">A character read from the input differs from the pattern.</exception>
protected void readExpectedstring(string expectedstring, bool skipSpaces)
{
    SkipSpaces();

    for (var index = 0; index < expectedstring.Length; index++)
    {
        char wanted = expectedstring[index];
        if (seqSource.read() == wanted)
        {
            continue;
        }

        throw new IOException("Expected string '" + expectedstring
                              + "' but missed at character '" + wanted + "' at offset "
                              + seqSource.getPosition());
    }

    SkipSpaces();
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads one character and throws if it is not the expected value.
/// </summary>
/// <param name="ec">The character that is expected.</param>
/// <exception cref="IOException">The character read differs from <paramref name="ec"/>.</exception>
protected void readExpectedChar(char ec)
{
    var actual = (char)seqSource.read();
    if (actual == ec)
    {
        return;
    }

    throw new IOException("expected='" + ec + "' actual='" + actual + "' at offset " + seqSource.getPosition());
}
|
|
||||||
|
|
||||||
/// <summary>
/// Skips leading spaces and comments, then reads characters until whitespace, a closing ']',
/// one of '[', '&lt;', '(' or '/', the end of the input, or until
/// <paramref name="length"/> characters have been collected.
/// The character that stopped the read is pushed back unless it was the end of the input.
/// </summary>
/// <param name="length">The maximum number of characters to read.</param>
/// <returns>The string that was read, between 0 and <paramref name="length"/> characters long.</returns>
protected string readstring(int length)
{
    SkipSpaces();

    int current = seqSource.read();

    // Size the buffer to the requested maximum rather than the default capacity.
    var token = new StringBuilder(length);

    while (true)
    {
        bool exhausted = current == -1 || token.Length >= length;
        bool delimiter = isWhitespace(current) || isClosing(current)
                         || current == '[' || current == '<' || current == '(' || current == '/';
        if (exhausted || delimiter)
        {
            break;
        }

        token.Append((char)current);
        current = seqSource.read();
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }

    return token.ToString();
}
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the next character in the stream closes a PDF array. The character is only
/// peeked, not consumed.
/// </summary>
/// <returns><c>true</c> if the next byte is ']'; otherwise <c>false</c>.</returns>
protected bool isClosing() => isClosing(seqSource.peek());
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the given character closes a PDF array.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns><c>true</c> if <paramref name="c"/> is ']'; otherwise <c>false</c>.</returns>
protected bool isClosing(int c) => c == ']';
|
|
||||||
|
|
||||||
/// <summary>
/// Reads characters until the first end-of-line marker or the end of the input.
/// The EOL marker may consist of one byte (CR or LF) or two bytes (CR followed by LF), which
/// matters to callers that want to unread the line. The marker itself is consumed but is not
/// part of the returned text.
/// </summary>
/// <returns>The characters between the current position and the end of the line.</returns>
/// <exception cref="IOException">The source is already at end-of-file when called.</exception>
protected string readLine()
{
    if (seqSource.isEOF())
    {
        throw new IOException("Error: End-of-File, expected line");
    }

    var line = new StringBuilder(11);

    // A lone CR or a lone LF both end the line.
    int current = seqSource.read();
    while (current != -1 && !isEOL(current))
    {
        line.Append((char)current);
        current = seqSource.read();
    }

    // CR immediately followed by LF counts as a single EOL: swallow the LF too.
    if (isCR(current) && isLF(seqSource.peek()))
    {
        seqSource.read();
    }

    return line.ToString();
}
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the next byte to be read is an end-of-line byte. The byte is only peeked,
/// not consumed.
/// </summary>
/// <returns><c>true</c> if the next byte is a line feed or a carriage return.</returns>
protected bool isEOL() => isEOL(seqSource.peek());
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the given value is an end-of-line byte.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns><c>true</c> if <paramref name="c"/> is a line feed or a carriage return.</returns>
protected bool isEOL(int c)
{
    if (isLF(c))
    {
        return true;
    }

    return isCR(c);
}
|
|
||||||
|
|
||||||
// True when the given value equals the ASCII_LF constant.
private bool isLF(int c) => c == ASCII_LF;
|
|
||||||
|
|
||||||
// True when the given value equals the ASCII_CR constant.
private bool isCR(int c) => c == ASCII_CR;
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the next byte in the stream is whitespace. The byte is only peeked,
/// not consumed.
/// </summary>
/// <returns><c>true</c> if the next byte in the stream is a whitespace character.</returns>
protected bool isWhitespace() => isWhitespace(seqSource.peek());
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether a character is whitespace, using the values listed in table 1 (page 12)
/// of ISO 32000-1:2008.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>
/// <c>true</c> for NUL (0), tab (9), form feed (12), line feed, carriage return and space.
/// </returns>
protected bool isWhitespace(int c)
{
    switch (c)
    {
        case 0:  // NUL
        case 9:  // horizontal tab
        case 12: // form feed
            return true;
    }

    return c == ASCII_LF || c == ASCII_CR || c == ASCII_SPACE;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the next byte in the stream is a space. The byte is only peeked,
/// not consumed.
/// </summary>
/// <returns><c>true</c> if the next byte in the stream is a space character.</returns>
protected bool isSpace() => isSpace(seqSource.peek());
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the given value is a space.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns><c>true</c> if <paramref name="c"/> equals the space character constant.</returns>
protected bool isSpace(int c) => c == ASCII_SPACE;
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the next byte in the stream is a digit. The byte is only peeked,
/// not consumed.
/// </summary>
/// <returns><c>true</c> if the next byte in the stream is a digit.</returns>
protected bool isDigit() => isDigit(seqSource.peek());
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the given value is a digit.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>
/// <c>true</c> if <paramref name="c"/> lies between the zero and nine character constants, inclusive.
/// </returns>
protected static bool isDigit(int c)
{
    if (c < ASCII_ZERO)
    {
        return false;
    }

    return c <= ASCII_NINE;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Skips all whitespace and comments at the current position. A comment starts with '%' and
/// runs up to the next end-of-line byte. The first character that is neither whitespace nor
/// part of a comment is pushed back so it can be read again.
/// </summary>
protected void SkipSpaces()
{
    const int commentMarker = 37; // the '%' character

    int current = seqSource.read();

    while (true)
    {
        if (current == commentMarker)
        {
            // Consume the comment body; stop on the EOL byte (handled as whitespace below) or EOF.
            do
            {
                current = seqSource.read();
            }
            while (current != -1 && !isEOL(current));
        }
        else if (isWhitespace(current))
        {
            current = seqSource.read();
        }
        else
        {
            break;
        }
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads a long from the stream and validates it as an object number.
/// </summary>
/// <returns>The object number that was read.</returns>
/// <exception cref="IOException">
/// The value is negative or not below <c>OBJECT_NUMBER_THRESHOLD</c>, or reading the number failed.
/// </exception>
protected long readObjectNumber()
{
    long objectNumber = readLong();

    bool inRange = objectNumber >= 0 && objectNumber < OBJECT_NUMBER_THRESHOLD;
    if (inRange)
    {
        return objectNumber;
    }

    throw new IOException("Object Number '" + objectNumber + "' has more than 10 digits or is negative");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads an integer from the stream and validates it as a generation number.
/// </summary>
/// <returns>The generation number that was read.</returns>
/// <exception cref="IOException">
/// The value is negative or greater than <c>GENERATION_NUMBER_THRESHOLD</c>, or reading the
/// number failed.
/// </exception>
protected int readGenerationNumber()
{
    int generation = readInt();

    bool inRange = generation >= 0 && generation <= GENERATION_NUMBER_THRESHOLD;
    if (inRange)
    {
        return generation;
    }

    throw new IOException("Generation Number '" + generation + "' has more than 5 digits");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Skips leading spaces and comments, reads the next numeric token and parses it as a
/// 32-bit integer.
/// </summary>
/// <returns>The integer that was read from the stream.</returns>
/// <exception cref="IOException">
/// The token is not a valid integer or does not fit in an <c>int</c>. The token is pushed back
/// onto the source before the exception is thrown.
/// </exception>
protected int readInt()
{
    SkipSpaces();

    StringBuilder intBuffer = readstringNumber();

    try
    {
        // PDF numbers are machine-readable, so parse them independently of the current culture.
        return int.Parse(intBuffer.ToString(), System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // int.Parse reports out-of-range digits as OverflowException rather than FormatException;
        // both mean "not a usable integer" and must surface as the documented IOException.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString()));
        throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e);
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
/// <summary>
/// Skips leading spaces and comments, reads the next numeric token and parses it as a
/// 64-bit integer.
/// </summary>
/// <returns>The long that was read from the stream.</returns>
/// <exception cref="IOException">
/// The token is not a valid integer or does not fit in a <c>long</c>. The token is pushed back
/// onto the source before the exception is thrown.
/// </exception>
protected long readLong()
{
    SkipSpaces();

    StringBuilder longBuffer = readstringNumber();

    try
    {
        // PDF numbers are machine-readable, so parse them independently of the current culture.
        return long.Parse(longBuffer.ToString(), System.Globalization.CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // long.Parse reports out-of-range digits as OverflowException rather than FormatException;
        // both mean "not a usable long" and must surface as the documented IOException.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString()));

        throw new IOException(
            $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{longBuffer}\'", e);
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads the raw token that <c>readInt</c> and <c>readLong</c> go on to parse. Characters are
/// collected until a space, LF, CR, '&lt;' (60), '[', '(', NUL or the end of the input is seen;
/// that terminator is pushed back unless it was the end of the input.
/// </summary>
/// <returns>The token to be parsed as an integer or long by the calling method.</returns>
/// <exception cref="IOException">The token grows beyond <c>MAX_LENGTH_LONG</c> characters.</exception>
protected StringBuilder readstringNumber()
{
    var token = new StringBuilder();
    int current;

    while (true)
    {
        current = seqSource.read();

        bool terminator = current == ASCII_SPACE
                          || current == ASCII_LF
                          || current == ASCII_CR
                          || current == 60   // '<', see sourceforge bug 1714707
                          || current == '['  // PDFBOX-1845
                          || current == '('  // PDFBOX-2579
                          || current == 0    // see sourceforge bug 853328
                          || current == -1;
        if (terminator)
        {
            break;
        }

        token.Append((char)current);

        if (token.Length > MAX_LENGTH_LONG)
        {
            throw new IOException("Number '" + token +
                                  "' is getting too long, stop reading at offset " + seqSource.getPosition());
        }
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }

    return token;
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -25,10 +25,15 @@
|
|||||||
private readonly ParsingCachingProviders cachingProviders;
|
private readonly ParsingCachingProviders cachingProviders;
|
||||||
|
|
||||||
[NotNull]
|
[NotNull]
|
||||||
public Catalog Catalog { get; }
|
internal Catalog Catalog { get; }
|
||||||
|
|
||||||
[NotNull]
|
[NotNull]
|
||||||
public Pages Pages { get; }
|
internal Pages Pages { get; }
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Get the number of pages in this document.
|
||||||
|
/// </summary>
|
||||||
|
public int NumberOfPages => Pages.Count;
|
||||||
|
|
||||||
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
|
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
|
||||||
bool isLenientParsing,
|
bool isLenientParsing,
|
||||||
@ -50,6 +55,16 @@
|
|||||||
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);
|
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null) => PdfDocumentFactory.Open(fileBytes, options);
|
||||||
public static PdfDocument Open(string filename, ParsingOptions options = null) => PdfDocumentFactory.Open(filename, options);
|
public static PdfDocument Open(string filename, ParsingOptions options = null) => PdfDocumentFactory.Open(filename, options);
|
||||||
|
|
||||||
|
/// <summary>
/// Gets the page with the specified page number by delegating to <c>Pages.GetPage</c>.
/// </summary>
/// <param name="pageNumber">The number of the page to return; numbering starts from 1.</param>
/// <returns>The page.</returns>
public Page GetPage(int pageNumber) => Pages.GetPage(pageNumber);
|
||||||
|
|
||||||
public void Dispose()
|
public void Dispose()
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
|
Loading…
Reference in New Issue
Block a user