mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
move catalog parsing to its own factory. parse document information if present and expose publicly. add test for itext generated document
This commit is contained in:
Binary file not shown.
@@ -8,7 +8,7 @@ namespace UglyToad.Pdf.Tests.Integration
|
||||
using Content;
|
||||
using Xunit;
|
||||
|
||||
public class SinglePageSimpleTests
|
||||
public class SinglePageSimpleGoogleChromeTests
|
||||
{
|
||||
private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
|
||||
{
|
39
src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleIText1.cs
Normal file
39
src/UglyToad.Pdf.Tests/Integration/SinglePageSimpleIText1.cs
Normal file
@@ -0,0 +1,39 @@
|
||||
namespace UglyToad.Pdf.Tests.Integration
|
||||
{
|
||||
using System;
|
||||
using System.IO;
|
||||
using Content;
|
||||
using Xunit;
|
||||
|
||||
/// <summary>
/// Integration tests which open the single page document "Single Page Simple - from itext 1_1.pdf".
/// </summary>
public class SinglePageSimpleIText1Tests
{
    /// <summary>
    /// Builds the full path of the test document, resolving the documents folder
    /// relative to the application base directory.
    /// </summary>
    private static string GetFilename()
    {
        var relativeFolder = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents");
        var absoluteFolder = Path.GetFullPath(relativeFolder);

        return Path.Combine(absoluteFolder, "Single Page Simple - from itext 1_1.pdf");
    }

    /// <summary>
    /// The document opened from its raw bytes reports exactly one page.
    /// </summary>
    [Fact]
    public void HasCorrectNumberOfPages()
    {
        var bytes = File.ReadAllBytes(GetFilename());

        using (var document = PdfDocument.Open(bytes))
        {
            var pageCount = document.NumberOfPages;

            Assert.Equal(1, pageCount);
        }
    }

    /// <summary>
    /// The first page of the document opened by file path is A4.
    /// </summary>
    [Fact]
    public void HasCorrectPageSize()
    {
        var path = GetFilename();

        using (var document = PdfDocument.Open(path))
        {
            Assert.Equal(PageSize.A4, document.GetPage(1).Size);
        }
    }
}
|
||||
}
|
@@ -14,6 +14,7 @@
|
||||
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
|
||||
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
|
||||
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
|
||||
<None Remove="Integration\Documents\Single Page Simple - from itext 1_1.pdf" />
|
||||
<None Remove="Integration\Documents\Single Page Simple - from open office.pdf" />
|
||||
</ItemGroup>
|
||||
|
||||
@@ -36,6 +37,9 @@
|
||||
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
<Content Include="Integration\Documents\Single Page Simple - from itext 1_1.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
<Content Include="Integration\Documents\Single Page Simple - from open office.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
|
@@ -8,9 +8,13 @@
|
||||
{
|
||||
private readonly PdfDictionary catalogDictionary;
|
||||
|
||||
internal Catalog(PdfDictionary catalogDictionary)
|
||||
public PdfDictionary PagesDictionary { get; }
|
||||
|
||||
internal Catalog(PdfDictionary catalogDictionary, PdfDictionary pagesDictionary)
|
||||
{
|
||||
this.catalogDictionary = catalogDictionary ?? throw new ArgumentNullException(nameof(catalogDictionary));
|
||||
|
||||
PagesDictionary = pagesDictionary ?? throw new ArgumentNullException(nameof(pagesDictionary));
|
||||
}
|
||||
|
||||
public CosBase Get(CosName name)
|
||||
|
88
src/UglyToad.Pdf/Content/DocumentInformation.cs
Normal file
88
src/UglyToad.Pdf/Content/DocumentInformation.cs
Normal file
@@ -0,0 +1,88 @@
|
||||
namespace UglyToad.Pdf.Content
|
||||
{
|
||||
using System.Text;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <summary>
/// Metadata for the PDF document.
/// </summary>
public class DocumentInformation
{
    /// <summary>
    /// An instance with every metadata entry unset (<see langword="null"/>).
    /// </summary>
    internal static DocumentInformation Default { get; }
        = new DocumentInformation(null, null, null, null, null, null);

    // Text returned by ToString, built once in the constructor from the non-null entries.
    private readonly string representation;

    /// <summary>
    /// The title of this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Title { get; }

    /// <summary>
    /// The name of the person who created this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Author { get; }

    /// <summary>
    /// The subject of this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Subject { get; }

    /// <summary>
    /// Any keywords associated with this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Keywords { get; }

    /// <summary>
    /// The name of the application which created the original document before it was converted to PDF if applicable.
    /// </summary>
    [CanBeNull]
    public string Creator { get; }

    /// <summary>
    /// The name of the application used to convert the original document to PDF if applicable.
    /// </summary>
    [CanBeNull]
    public string Producer { get; }

    /// <summary>
    /// Create a new <see cref="DocumentInformation"/>. Any argument may be <see langword="null"/>
    /// to indicate the corresponding entry is not present.
    /// </summary>
    internal DocumentInformation(string title, string author, string subject, string keywords, string creator, string producer)
    {
        Title = title;
        Author = author;
        Subject = subject;
        Keywords = keywords;
        Creator = creator;
        Producer = producer;

        var builder = new StringBuilder();

        AppendPart("Title", title, builder);
        AppendPart("Author", author, builder);
        AppendPart("Subject", subject, builder);
        AppendPart("Keywords", keywords, builder);
        AppendPart("Creator", creator, builder);
        AppendPart("Producer", producer, builder);

        // StringBuilder.ToString() never returns null (an empty builder yields ""),
        // so no null fallback is needed here.
        representation = builder.ToString();
    }

    /// <summary>
    /// Returns the non-null entries formatted as "Name: value;" in the order
    /// Title, Author, Subject, Keywords, Creator, Producer.
    /// The result is an empty string when every entry is null.
    /// </summary>
    public override string ToString()
    {
        return representation;
    }

    /// <summary>
    /// Appends "<paramref name="name"/>: <paramref name="value"/>;" to the builder,
    /// or nothing when <paramref name="value"/> is null.
    /// </summary>
    private static void AppendPart(string name, string value, StringBuilder builder)
    {
        if (value == null)
        {
            return;
        }

        builder.Append(name).Append(": ").Append(value).Append(";");
    }
}
|
||||
}
|
@@ -31,25 +31,9 @@
|
||||
throw new ArgumentNullException(nameof(catalog));
|
||||
}
|
||||
|
||||
var pages = catalog.Get(CosName.PAGES) as CosObject;
|
||||
rootPageDictionary = catalog.PagesDictionary;
|
||||
|
||||
if (pages == null)
|
||||
{
|
||||
throw new InvalidOperationException("No pages were present in the catalog for this PDF document");
|
||||
}
|
||||
|
||||
var pagesObject = pdfObjectParser.Parse(pages.ToIndirectReference(), reader, isLenientParsing);
|
||||
|
||||
if (!(pagesObject is PdfDictionary catalogPageDictionary))
|
||||
{
|
||||
throw new InvalidOperationException("Could not find the root pages object: " + pages);
|
||||
}
|
||||
|
||||
var count = catalogPageDictionary.GetIntOrDefault(CosName.COUNT);
|
||||
|
||||
rootPageDictionary = catalogPageDictionary;
|
||||
|
||||
Count = count;
|
||||
Count = rootPageDictionary.GetIntOrDefault(CosName.COUNT);
|
||||
|
||||
this.log = log;
|
||||
this.catalog = catalog;
|
||||
|
34
src/UglyToad.Pdf/Exceptions/PdfDocumentFormatException.cs
Normal file
34
src/UglyToad.Pdf/Exceptions/PdfDocumentFormatException.cs
Normal file
@@ -0,0 +1,34 @@
|
||||
namespace UglyToad.Pdf.Exceptions
|
||||
{
|
||||
using System;
|
||||
using System.Runtime.Serialization;
|
||||
|
||||
/// <summary>
/// Exception raised when the content of a PDF document is not in the expected format.
/// </summary>
[Serializable]
public class PdfDocumentFormatException : Exception
{
    // The four constructors below mirror the standard constructors of System.Exception.

    /// <summary>
    /// Create a new <see cref="PdfDocumentFormatException"/> with the default message.
    /// </summary>
    public PdfDocumentFormatException()
        : base()
    {
    }

    /// <summary>
    /// Create a new <see cref="PdfDocumentFormatException"/> with the given message.
    /// </summary>
    /// <param name="message">The message describing the error.</param>
    public PdfDocumentFormatException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Create a new <see cref="PdfDocumentFormatException"/> with the given message and inner exception.
    /// </summary>
    /// <param name="message">The message describing the error.</param>
    /// <param name="inner">The exception which caused this exception.</param>
    public PdfDocumentFormatException(string message, Exception inner)
        : base(message, inner)
    {
    }

    /// <summary>
    /// Create a new <see cref="PdfDocumentFormatException"/> from serialized data.
    /// </summary>
    /// <param name="info">The serialized object data.</param>
    /// <param name="context">The context of the serialization source or destination.</param>
    protected PdfDocumentFormatException(SerializationInfo info, StreamingContext context)
        : base(info, context)
    {
    }
}
|
||||
}
|
46
src/UglyToad.Pdf/Parser/CatalogFactory.cs
Normal file
46
src/UglyToad.Pdf/Parser/CatalogFactory.cs
Normal file
@@ -0,0 +1,46 @@
|
||||
namespace UglyToad.Pdf.Parser
|
||||
{
|
||||
using System;
|
||||
using Content;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using IO;
|
||||
|
||||
/// <summary>
/// Creates the <see cref="Catalog"/> for a document from its catalog dictionary,
/// resolving the pages dictionary the catalog refers to.
/// </summary>
internal class CatalogFactory
{
    private readonly IPdfObjectParser pdfObjectParser;

    /// <summary>
    /// Create a new <see cref="CatalogFactory"/>.
    /// </summary>
    /// <param name="pdfObjectParser">The parser used to load the object referenced by the pages entry.</param>
    /// <exception cref="ArgumentNullException"><paramref name="pdfObjectParser"/> is null.</exception>
    public CatalogFactory(IPdfObjectParser pdfObjectParser)
    {
        // Fail here rather than with a NullReferenceException on the first call to Create.
        this.pdfObjectParser = pdfObjectParser ?? throw new ArgumentNullException(nameof(pdfObjectParser));
    }

    /// <summary>
    /// Validate the catalog dictionary and build the <see cref="Catalog"/> from it.
    /// </summary>
    /// <param name="dictionary">The catalog dictionary.</param>
    /// <param name="reader">The reader passed through to the object parser.</param>
    /// <param name="isLenientParsing">The lenient parsing flag passed through to the object parser.</param>
    /// <returns>The catalog containing the catalog dictionary and the parsed pages dictionary.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// The dictionary declares a type other than Catalog, has no pages entry,
    /// or the pages entry does not resolve to a dictionary.
    /// </exception>
    public Catalog Create(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        if (dictionary == null)
        {
            throw new ArgumentNullException(nameof(dictionary));
        }

        // A missing type entry is tolerated; only a type which is present and wrong is an error.
        if (dictionary.TryGetName(CosName.TYPE, out var type) && !type.Equals(CosName.CATALOG))
        {
            throw new PdfDocumentFormatException($"The type of the catalog dictionary was not Catalog: {dictionary}.");
        }

        if (!dictionary.TryGetItemOfType(CosName.PAGES, out CosObject value))
        {
            throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}.");
        }

        // Resolve the reference once and reuse it for both parsing and the error message.
        var pagesReference = value.ToIndirectReference();

        var pages = pdfObjectParser.Parse(pagesReference, reader, isLenientParsing);

        if (!(pages is PdfDictionary pagesDictionary))
        {
            throw new PdfDocumentFormatException($"The pages entry in the catalog {pagesReference} did not link to a dictionary: {pages}.");
        }

        return new Catalog(dictionary, pagesDictionary);
    }
}
|
||||
}
|
47
src/UglyToad.Pdf/Parser/DocumentInformationFactory.cs
Normal file
47
src/UglyToad.Pdf/Parser/DocumentInformationFactory.cs
Normal file
@@ -0,0 +1,47 @@
|
||||
namespace UglyToad.Pdf.Parser
|
||||
{
|
||||
using Content;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using IO;
|
||||
|
||||
/// <summary>
/// Creates the <see cref="DocumentInformation"/> for a document from the object
/// referenced by the info entry of a dictionary, when that entry is present.
/// </summary>
internal class DocumentInformationFactory
{
    /// <summary>
    /// Read the document information referenced by the info entry of <paramref name="rootDictionary"/>.
    /// </summary>
    /// <param name="pdfObjectParser">The parser used to load the object referenced by the info entry.</param>
    /// <param name="rootDictionary">The dictionary which may contain the info entry.</param>
    /// <param name="reader">The reader passed through to the object parser.</param>
    /// <param name="isLenientParsing">The lenient parsing flag passed through to the object parser.</param>
    /// <returns>
    /// The parsed information, or <see cref="DocumentInformation.Default"/> when there is no info entry
    /// or the entry does not resolve to a dictionary.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="pdfObjectParser"/> or <paramref name="rootDictionary"/> is null.
    /// </exception>
    public DocumentInformation Create(IPdfObjectParser pdfObjectParser,
        PdfDictionary rootDictionary, IRandomAccessRead reader,
        bool isLenientParsing)
    {
        // Both arguments are dereferenced below; reject null explicitly instead of
        // failing with a NullReferenceException.
        if (pdfObjectParser == null)
        {
            throw new System.ArgumentNullException(nameof(pdfObjectParser));
        }

        if (rootDictionary == null)
        {
            throw new System.ArgumentNullException(nameof(rootDictionary));
        }

        // The information is optional: a missing entry is not an error.
        if (!rootDictionary.TryGetItemOfType(CosName.INFO, out CosObject infoBase))
        {
            return DocumentInformation.Default;
        }

        var infoParsed = pdfObjectParser.Parse(infoBase.ToIndirectReference(), reader, isLenientParsing);

        // An info entry which does not point at a dictionary is ignored rather than rejected.
        if (!(infoParsed is PdfDictionary infoDictionary))
        {
            return DocumentInformation.Default;
        }

        var title = GetEntryOrDefault(infoDictionary, CosName.TITLE);
        var author = GetEntryOrDefault(infoDictionary, CosName.AUTHOR);
        var subject = GetEntryOrDefault(infoDictionary, CosName.SUBJECT);
        var keywords = GetEntryOrDefault(infoDictionary, CosName.KEYWORDS);
        var creator = GetEntryOrDefault(infoDictionary, CosName.CREATOR);
        var producer = GetEntryOrDefault(infoDictionary, CosName.PRODUCER);

        return new DocumentInformation(title, author, subject,
            keywords, creator, producer);
    }

    /// <summary>
    /// Get the text of the string entry stored under <paramref name="key"/>,
    /// or null when the entry is missing or is not a string.
    /// </summary>
    private static string GetEntryOrDefault(PdfDictionary infoDictionary, CosName key)
    {
        if (infoDictionary.TryGetItemOfType(key, out CosString str))
        {
            // NOTE(review): GetAscii presumably decodes the bytes as ASCII; PDF text strings may
            // also be UTF-16 encoded, so non-ASCII metadata may not round-trip - confirm.
            return str.GetAscii();
        }

        return null;
    }
}
|
||||
}
|
@@ -1,14 +1,13 @@
|
||||
namespace UglyToad.Pdf.Content
|
||||
namespace UglyToad.Pdf.Parser
|
||||
{
|
||||
using System;
|
||||
using Content;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Filters;
|
||||
using Geometry;
|
||||
using Graphics;
|
||||
using IO;
|
||||
using Parser;
|
||||
using Util;
|
||||
|
||||
internal class PageFactory : IPageFactory
|
||||
{
|
@@ -78,6 +78,8 @@
|
||||
var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
|
||||
|
||||
var pageFactory = new PageFactory(resourceContainer, pdfObjectParser, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
|
||||
var informationFactory = new DocumentInformationFactory();
|
||||
var catalogFactory = new CatalogFactory(pdfObjectParser);
|
||||
|
||||
var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool,
|
||||
isLenientParsing);
|
||||
@@ -93,9 +95,13 @@
|
||||
rootDictionary.Set(CosName.TYPE, CosName.CATALOG);
|
||||
}
|
||||
|
||||
var information = informationFactory.Create(pdfObjectParser, crossReferenceTable.Dictionary, reader, isLenientParsing);
|
||||
|
||||
var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);
|
||||
|
||||
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
|
||||
|
||||
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, new Catalog(rootDictionary));
|
||||
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information);
|
||||
}
|
||||
|
||||
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,
|
||||
|
@@ -7,7 +7,6 @@
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Parser.Parts;
|
||||
using Util;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
public class PdfDocument : IDisposable
|
||||
@@ -30,6 +29,9 @@
|
||||
[NotNull]
|
||||
internal Pages Pages { get; }
|
||||
|
||||
[NotNull]
|
||||
public DocumentInformation Information { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Get the number of pages in this document.
|
||||
/// </summary>
|
||||
@@ -40,7 +42,8 @@
|
||||
ParsingCachingProviders cachingProviders,
|
||||
IPageFactory pageFactory,
|
||||
IPdfObjectParser pdfObjectParser,
|
||||
Catalog catalog)
|
||||
Catalog catalog,
|
||||
DocumentInformation information)
|
||||
{
|
||||
this.log = log;
|
||||
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||
@@ -48,6 +51,7 @@
|
||||
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
|
||||
this.isLenientParsing = isLenientParsing;
|
||||
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
||||
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
|
||||
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing);
|
||||
}
|
||||
|
Reference in New Issue
Block a user