mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
move catalog parsing to its own factory. parse document information if present and expose publicly. add test for itext generated document
This commit is contained in:
Binary file not shown.
@@ -8,7 +8,7 @@ namespace UglyToad.Pdf.Tests.Integration
|
|||||||
using Content;
|
using Content;
|
||||||
using Xunit;
|
using Xunit;
|
||||||
|
|
||||||
public class SinglePageSimpleTests
|
public class SinglePageSimpleGoogleChromeTests
|
||||||
{
|
{
|
||||||
private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
|
private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
|
||||||
{
|
{
|
||||||
39
src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleIText1.cs
Normal file
39
src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleIText1.cs
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
namespace UglyToad.Pdf.Tests.Integration
{
    using System;
    using System.IO;
    using Content;
    using Xunit;

    /// <summary>
    /// Integration tests which open the "Single Page Simple - from itext 1_1.pdf" sample document.
    /// </summary>
    public class SinglePageSimpleIText1Tests
    {
        // Builds the absolute path of the sample PDF: three levels up from the
        // application base directory, then into Integration/Documents.
        private static string GetFilename()
        {
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
            var absoluteFolder = Path.GetFullPath(relativeFolder);

            return Path.Combine(absoluteFolder, "Single Page Simple - from itext 1_1.pdf");
        }

        /// <summary>
        /// The document, opened from its raw bytes, reports exactly one page.
        /// </summary>
        [Fact]
        public void HasCorrectNumberOfPages()
        {
            var bytes = File.ReadAllBytes(GetFilename());

            using (var pdf = PdfDocument.Open(bytes))
            {
                Assert.Equal(1, pdf.NumberOfPages);
            }
        }

        /// <summary>
        /// The document, opened from its file path, has an A4 first page.
        /// </summary>
        [Fact]
        public void HasCorrectPageSize()
        {
            var path = GetFilename();

            using (var pdf = PdfDocument.Open(path))
            {
                var firstPage = pdf.GetPage(1);

                Assert.Equal(PageSize.A4, firstPage.Size);
            }
        }
    }
}
|
||||||
@@ -14,6 +14,7 @@
|
|||||||
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
|
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
|
||||||
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
|
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
|
||||||
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
|
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
|
||||||
|
<None Remove="Integration\Documents\Single Page Simple - from itext 1_1.pdf" />
|
||||||
<None Remove="Integration\Documents\Single Page Simple - from open office.pdf" />
|
<None Remove="Integration\Documents\Single Page Simple - from open office.pdf" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
@@ -36,6 +37,9 @@
|
|||||||
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
|
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</Content>
|
</Content>
|
||||||
|
<Content Include="Integration\Documents\Single Page Simple - from itext 1_1.pdf">
|
||||||
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
|
</Content>
|
||||||
<Content Include="Integration\Documents\Single Page Simple - from open office.pdf">
|
<Content Include="Integration\Documents\Single Page Simple - from open office.pdf">
|
||||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||||
</Content>
|
</Content>
|
||||||
|
|||||||
@@ -8,9 +8,13 @@
|
|||||||
{
|
{
|
||||||
private readonly PdfDictionary catalogDictionary;
|
private readonly PdfDictionary catalogDictionary;
|
||||||
|
|
||||||
internal Catalog(PdfDictionary catalogDictionary)
|
public PdfDictionary PagesDictionary { get; }
|
||||||
|
|
||||||
|
internal Catalog(PdfDictionary catalogDictionary, PdfDictionary pagesDictionary)
|
||||||
{
|
{
|
||||||
this.catalogDictionary = catalogDictionary ?? throw new ArgumentNullException(nameof(catalogDictionary));
|
this.catalogDictionary = catalogDictionary ?? throw new ArgumentNullException(nameof(catalogDictionary));
|
||||||
|
|
||||||
|
PagesDictionary = pagesDictionary ?? throw new ArgumentNullException(nameof(pagesDictionary));
|
||||||
}
|
}
|
||||||
|
|
||||||
public CosBase Get(CosName name)
|
public CosBase Get(CosName name)
|
||||||
|
|||||||
88
src/UglyToad.Pdf/Content/DocumentInformation.cs
Normal file
88
src/UglyToad.Pdf/Content/DocumentInformation.cs
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
namespace UglyToad.Pdf.Content
{
    using System.Text;
    using Util.JetBrains.Annotations;

    /// <summary>
    /// Metadata for the PDF document.
    /// </summary>
    public class DocumentInformation
    {
        /// <summary>
        /// An instance in which every entry is <see langword="null"/>.
        /// </summary>
        internal static DocumentInformation Default { get; }
            = new DocumentInformation(null, null, null, null, null, null);

        // The text returned by ToString, built once in the constructor.
        private readonly string representation;

        /// <summary>
        /// The title of this document if applicable.
        /// </summary>
        [CanBeNull]
        public string Title { get; }

        /// <summary>
        /// The name of the person who created this document if applicable.
        /// </summary>
        [CanBeNull]
        public string Author { get; }

        /// <summary>
        /// The subject of this document if applicable.
        /// </summary>
        [CanBeNull]
        public string Subject { get; }

        /// <summary>
        /// Any keywords associated with this document if applicable.
        /// </summary>
        [CanBeNull]
        public string Keywords { get; }

        /// <summary>
        /// The name of the application which created the original document before it was converted to PDF, if applicable.
        /// </summary>
        [CanBeNull]
        public string Creator { get; }

        /// <summary>
        /// The name of the application used to convert the original document to PDF if applicable.
        /// </summary>
        [CanBeNull]
        public string Producer { get; }

        /// <summary>
        /// Creates the document information from the individual entries; any entry may be <see langword="null"/>.
        /// </summary>
        /// <param name="title">Value for <see cref="Title"/>.</param>
        /// <param name="author">Value for <see cref="Author"/>.</param>
        /// <param name="subject">Value for <see cref="Subject"/>.</param>
        /// <param name="keywords">Value for <see cref="Keywords"/>.</param>
        /// <param name="creator">Value for <see cref="Creator"/>.</param>
        /// <param name="producer">Value for <see cref="Producer"/>.</param>
        internal DocumentInformation(string title, string author, string subject, string keywords, string creator, string producer)
        {
            Title = title;
            Author = author;
            Subject = subject;
            Keywords = keywords;
            Creator = creator;
            Producer = producer;

            var builder = new StringBuilder();

            AppendPart("Title", title, builder);
            AppendPart("Author", author, builder);
            AppendPart("Subject", subject, builder);
            AppendPart("Keywords", keywords, builder);
            AppendPart("Creator", creator, builder);
            AppendPart("Producer", producer, builder);

            // StringBuilder.ToString() never returns null (an empty builder yields ""),
            // so no null fallback is required here.
            representation = builder.ToString();
        }

        /// <summary>
        /// Returns the entries which are not <see langword="null"/> as consecutive "Name: value;" parts in the order
        /// Title, Author, Subject, Keywords, Creator, Producer. The result is empty when no entry is set.
        /// </summary>
        public override string ToString()
        {
            return representation;
        }

        // Appends "name: value;" to the builder, skipping entries whose value is null.
        private static void AppendPart(string name, string value, StringBuilder builder)
        {
            if (value == null)
            {
                return;
            }

            builder.Append(name).Append(": ").Append(value).Append(";");
        }
    }
}
|
||||||
@@ -31,25 +31,9 @@
|
|||||||
throw new ArgumentNullException(nameof(catalog));
|
throw new ArgumentNullException(nameof(catalog));
|
||||||
}
|
}
|
||||||
|
|
||||||
var pages = catalog.Get(CosName.PAGES) as CosObject;
|
rootPageDictionary = catalog.PagesDictionary;
|
||||||
|
|
||||||
if (pages == null)
|
Count = rootPageDictionary.GetIntOrDefault(CosName.COUNT);
|
||||||
{
|
|
||||||
throw new InvalidOperationException("No pages were present in the catalog for this PDF document");
|
|
||||||
}
|
|
||||||
|
|
||||||
var pagesObject = pdfObjectParser.Parse(pages.ToIndirectReference(), reader, isLenientParsing);
|
|
||||||
|
|
||||||
if (!(pagesObject is PdfDictionary catalogPageDictionary))
|
|
||||||
{
|
|
||||||
throw new InvalidOperationException("Could not find the root pages object: " + pages);
|
|
||||||
}
|
|
||||||
|
|
||||||
var count = catalogPageDictionary.GetIntOrDefault(CosName.COUNT);
|
|
||||||
|
|
||||||
rootPageDictionary = catalogPageDictionary;
|
|
||||||
|
|
||||||
Count = count;
|
|
||||||
|
|
||||||
this.log = log;
|
this.log = log;
|
||||||
this.catalog = catalog;
|
this.catalog = catalog;
|
||||||
|
|||||||
34
src/UglyToad.Pdf/Exceptions/PdfDocumentFormatException.cs
Normal file
34
src/UglyToad.Pdf/Exceptions/PdfDocumentFormatException.cs
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
namespace UglyToad.Pdf.Exceptions
{
    using System;
    using System.Runtime.Serialization;

    /// <summary>
    /// Thrown when the content of a PDF document does not have the expected format.
    /// </summary>
    [Serializable]
    public class PdfDocumentFormatException : Exception
    {
        /// <summary>
        /// Creates the exception without a message.
        /// </summary>
        public PdfDocumentFormatException()
        {
        }

        /// <summary>
        /// Creates the exception with a message.
        /// </summary>
        /// <param name="message">The text describing the problem.</param>
        public PdfDocumentFormatException(string message)
            : base(message)
        {
        }

        /// <summary>
        /// Creates the exception with a message and the exception which caused it.
        /// </summary>
        /// <param name="message">The text describing the problem.</param>
        /// <param name="inner">The underlying exception.</param>
        public PdfDocumentFormatException(string message, Exception inner)
            : base(message, inner)
        {
        }

        /// <summary>
        /// Creates the exception from serialized data.
        /// </summary>
        /// <param name="info">The serialized object data.</param>
        /// <param name="context">The source or destination of the serialized stream.</param>
        protected PdfDocumentFormatException(SerializationInfo info, StreamingContext context)
            : base(info, context)
        {
        }
    }
}
|
||||||
46
src/UglyToad.Pdf/Parser/CatalogFactory.cs
Normal file
46
src/UglyToad.Pdf/Parser/CatalogFactory.cs
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
namespace UglyToad.Pdf.Parser
{
    using System;
    using Content;
    using ContentStream;
    using Cos;
    using Exceptions;
    using IO;

    /// <summary>
    /// Creates the <see cref="Catalog"/> from the catalog dictionary, resolving the pages dictionary it refers to.
    /// </summary>
    internal class CatalogFactory
    {
        private readonly IPdfObjectParser pdfObjectParser;

        /// <summary>
        /// Creates the factory.
        /// </summary>
        /// <param name="pdfObjectParser">Used to parse the object referenced by the catalog's pages entry.</param>
        /// <exception cref="ArgumentNullException">The <paramref name="pdfObjectParser"/> is null.</exception>
        public CatalogFactory(IPdfObjectParser pdfObjectParser)
        {
            // Fail at construction rather than with a NullReferenceException when Create first uses the parser.
            this.pdfObjectParser = pdfObjectParser ?? throw new ArgumentNullException(nameof(pdfObjectParser));
        }

        /// <summary>
        /// Validates the catalog dictionary and builds the <see cref="Catalog"/> together with its pages dictionary.
        /// </summary>
        /// <param name="dictionary">The catalog dictionary.</param>
        /// <param name="reader">Passed through to the object parser.</param>
        /// <param name="isLenientParsing">Passed through to the object parser.</param>
        /// <returns>The catalog wrapping <paramref name="dictionary"/> and the parsed pages dictionary.</returns>
        /// <exception cref="ArgumentNullException">The <paramref name="dictionary"/> is null.</exception>
        /// <exception cref="PdfDocumentFormatException">
        /// The dictionary has a type name other than Catalog, has no pages entry, or the pages entry does not
        /// resolve to a dictionary.
        /// </exception>
        public Catalog Create(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
        {
            if (dictionary == null)
            {
                throw new ArgumentNullException(nameof(dictionary));
            }

            // A missing type entry is tolerated; only a type which is present and wrong is rejected.
            if (dictionary.TryGetName(CosName.TYPE, out var type) && !type.Equals(CosName.CATALOG))
            {
                throw new PdfDocumentFormatException($"The type of the catalog dictionary was not Catalog: {dictionary}.");
            }

            if (!dictionary.TryGetItemOfType(CosName.PAGES, out CosObject value))
            {
                throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}.");
            }

            var pages = pdfObjectParser.Parse(value.ToIndirectReference(), reader, isLenientParsing);

            if (!(pages is PdfDictionary pagesDictionary))
            {
                throw new PdfDocumentFormatException($"The pages entry in the catalog {value.ToIndirectReference()} did not link to a dictionary: {pages}.");
            }

            return new Catalog(dictionary, pagesDictionary);
        }
    }
}
|
||||||
47
src/UglyToad.Pdf/Parser/DocumentInformationFactory.cs
Normal file
47
src/UglyToad.Pdf/Parser/DocumentInformationFactory.cs
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
namespace UglyToad.Pdf.Parser
{
    using Content;
    using ContentStream;
    using Cos;
    using IO;

    /// <summary>
    /// Reads the optional document information dictionary and converts it to a <see cref="DocumentInformation"/>.
    /// </summary>
    internal class DocumentInformationFactory
    {
        /// <summary>
        /// Looks up the Info entry of <paramref name="rootDictionary"/>, parses the object it references and
        /// copies the Title, Author, Subject, Keywords, Creator and Producer entries.
        /// </summary>
        /// <param name="pdfObjectParser">Used to parse the object referenced by the Info entry.</param>
        /// <param name="rootDictionary">The dictionary which may contain the Info entry.</param>
        /// <param name="reader">Passed through to the object parser.</param>
        /// <param name="isLenientParsing">Passed through to the object parser.</param>
        /// <returns>
        /// The parsed information, or <see cref="DocumentInformation.Default"/> when there is no Info entry
        /// or it does not resolve to a dictionary.
        /// </returns>
        public DocumentInformation Create(IPdfObjectParser pdfObjectParser,
            PdfDictionary rootDictionary, IRandomAccessRead reader,
            bool isLenientParsing)
        {
            var hasReference = rootDictionary.TryGetItemOfType(CosName.INFO, out CosObject reference);

            if (!hasReference)
            {
                return DocumentInformation.Default;
            }

            var info = pdfObjectParser.Parse(reference.ToIndirectReference(), reader, isLenientParsing) as PdfDictionary;

            if (info == null)
            {
                return DocumentInformation.Default;
            }

            return new DocumentInformation(
                GetEntryOrDefault(info, CosName.TITLE),
                GetEntryOrDefault(info, CosName.AUTHOR),
                GetEntryOrDefault(info, CosName.SUBJECT),
                GetEntryOrDefault(info, CosName.KEYWORDS),
                GetEntryOrDefault(info, CosName.CREATOR),
                GetEntryOrDefault(info, CosName.PRODUCER));
        }

        // Returns the entry's text when the key holds a string, otherwise null.
        // NOTE(review): GetAscii() may not round-trip non-ASCII text (the PDF specification allows
        // PDFDocEncoding / UTF-16BE text strings here) - confirm against CosString.
        private static string GetEntryOrDefault(PdfDictionary infoDictionary, CosName key)
        {
            return infoDictionary.TryGetItemOfType(key, out CosString text)
                ? text.GetAscii()
                : null;
        }
    }
}
|
||||||
@@ -1,14 +1,13 @@
|
|||||||
namespace UglyToad.Pdf.Content
|
namespace UglyToad.Pdf.Parser
|
||||||
{
|
{
|
||||||
using System;
|
using System;
|
||||||
|
using Content;
|
||||||
using ContentStream;
|
using ContentStream;
|
||||||
using Cos;
|
using Cos;
|
||||||
using Filters;
|
using Filters;
|
||||||
using Geometry;
|
using Geometry;
|
||||||
using Graphics;
|
using Graphics;
|
||||||
using IO;
|
using IO;
|
||||||
using Parser;
|
|
||||||
using Util;
|
|
||||||
|
|
||||||
internal class PageFactory : IPageFactory
|
internal class PageFactory : IPageFactory
|
||||||
{
|
{
|
||||||
@@ -78,6 +78,8 @@
|
|||||||
var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
|
var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
|
||||||
|
|
||||||
var pageFactory = new PageFactory(resourceContainer, pdfObjectParser, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
|
var pageFactory = new PageFactory(resourceContainer, pdfObjectParser, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
|
||||||
|
var informationFactory = new DocumentInformationFactory();
|
||||||
|
var catalogFactory = new CatalogFactory(pdfObjectParser);
|
||||||
|
|
||||||
var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool,
|
var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool,
|
||||||
isLenientParsing);
|
isLenientParsing);
|
||||||
@@ -93,9 +95,13 @@
|
|||||||
rootDictionary.Set(CosName.TYPE, CosName.CATALOG);
|
rootDictionary.Set(CosName.TYPE, CosName.CATALOG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var information = informationFactory.Create(pdfObjectParser, crossReferenceTable.Dictionary, reader, isLenientParsing);
|
||||||
|
|
||||||
|
var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);
|
||||||
|
|
||||||
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
|
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
|
||||||
|
|
||||||
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, new Catalog(rootDictionary));
|
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,
|
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,
|
||||||
|
|||||||
@@ -7,7 +7,6 @@
|
|||||||
using Logging;
|
using Logging;
|
||||||
using Parser;
|
using Parser;
|
||||||
using Parser.Parts;
|
using Parser.Parts;
|
||||||
using Util;
|
|
||||||
using Util.JetBrains.Annotations;
|
using Util.JetBrains.Annotations;
|
||||||
|
|
||||||
public class PdfDocument : IDisposable
|
public class PdfDocument : IDisposable
|
||||||
@@ -30,6 +29,9 @@
|
|||||||
[NotNull]
|
[NotNull]
|
||||||
internal Pages Pages { get; }
|
internal Pages Pages { get; }
|
||||||
|
|
||||||
|
[NotNull]
|
||||||
|
public DocumentInformation Information { get; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Get the number of pages in this document.
|
/// Get the number of pages in this document.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
@@ -40,7 +42,8 @@
|
|||||||
ParsingCachingProviders cachingProviders,
|
ParsingCachingProviders cachingProviders,
|
||||||
IPageFactory pageFactory,
|
IPageFactory pageFactory,
|
||||||
IPdfObjectParser pdfObjectParser,
|
IPdfObjectParser pdfObjectParser,
|
||||||
Catalog catalog)
|
Catalog catalog,
|
||||||
|
DocumentInformation information)
|
||||||
{
|
{
|
||||||
this.log = log;
|
this.log = log;
|
||||||
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||||
@@ -48,6 +51,7 @@
|
|||||||
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
|
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
|
||||||
this.isLenientParsing = isLenientParsing;
|
this.isLenientParsing = isLenientParsing;
|
||||||
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
||||||
|
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||||
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
|
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
|
||||||
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing);
|
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user