Move catalog parsing to its own factory. Parse document information if present and expose it publicly. Add a test for an iText-generated document.

This commit is contained in:
Eliot Jones
2018-01-02 23:26:58 +00:00
parent 8b8f2941a5
commit 0ef33f5215
13 changed files with 281 additions and 26 deletions

View File

@@ -8,7 +8,7 @@ namespace UglyToad.Pdf.Tests.Integration
using Content;
using Xunit;
public class SinglePageSimpleTests
public class SinglePageSimpleGoogleChromeTests
{
private static readonly HashSet<string> IgnoredHiddenCharacters = new HashSet<string>
{

View File

@@ -0,0 +1,39 @@
namespace UglyToad.Pdf.Tests.Integration
{
using System;
using System.IO;
using Content;
using Xunit;
/// <summary>
/// Integration tests against the "Single Page Simple - from itext 1_1.pdf" sample document.
/// </summary>
public class SinglePageSimpleIText1Tests
{
    /// <summary>
    /// Builds the absolute path of the sample document, located in the
    /// Integration/Documents folder three levels above the application base directory.
    /// </summary>
    private static string GetFilename()
    {
        var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
        var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
        var absoluteFolder = Path.GetFullPath(relativeFolder);

        return Path.Combine(absoluteFolder, "Single Page Simple - from itext 1_1.pdf");
    }

    /// <summary>
    /// Opens the document from its raw bytes and checks that it reports exactly one page.
    /// </summary>
    [Fact]
    public void HasCorrectNumberOfPages()
    {
        var bytes = File.ReadAllBytes(GetFilename());

        using (var document = PdfDocument.Open(bytes))
        {
            Assert.Equal(1, document.NumberOfPages);
        }
    }

    /// <summary>
    /// Opens the document by file name and checks that the first page is reported as A4.
    /// </summary>
    [Fact]
    public void HasCorrectPageSize()
    {
        var path = GetFilename();

        using (var document = PdfDocument.Open(path))
        {
            var firstPage = document.GetPage(1);

            Assert.Equal(PageSize.A4, firstPage.Size);
        }
    }
}
}

View File

@@ -14,6 +14,7 @@
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from itext 1_1.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from open office.pdf" />
</ItemGroup>
@@ -36,6 +37,9 @@
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Simple - from itext 1_1.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Simple - from open office.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>

View File

@@ -8,9 +8,13 @@
{
private readonly PdfDictionary catalogDictionary;
internal Catalog(PdfDictionary catalogDictionary)
public PdfDictionary PagesDictionary { get; }
internal Catalog(PdfDictionary catalogDictionary, PdfDictionary pagesDictionary)
{
this.catalogDictionary = catalogDictionary ?? throw new ArgumentNullException(nameof(catalogDictionary));
PagesDictionary = pagesDictionary ?? throw new ArgumentNullException(nameof(pagesDictionary));
}
public CosBase Get(CosName name)

View File

@@ -0,0 +1,88 @@
namespace UglyToad.Pdf.Content
{
using System.Text;
using Util.JetBrains.Annotations;
/// <summary>
/// Metadata for the PDF document.
/// </summary>
public class DocumentInformation
{
    /// <summary>
    /// An instance carrying no metadata: every property is <see langword="null"/>.
    /// </summary>
    internal static DocumentInformation Default { get; }
        = new DocumentInformation(null, null, null, null, null, null);

    // Built once in the constructor and returned unchanged by ToString().
    private readonly string representation;

    /// <summary>
    /// The title of this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Title { get; }

    /// <summary>
    /// The name of the person who created this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Author { get; }

    /// <summary>
    /// The subject of this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Subject { get; }

    /// <summary>
    /// Any keywords associated with this document if applicable.
    /// </summary>
    [CanBeNull]
    public string Keywords { get; }

    /// <summary>
    /// The name of the application which created the original document before it was converted to PDF if applicable.
    /// </summary>
    [CanBeNull]
    public string Creator { get; }

    /// <summary>
    /// The name of the application used to convert the original document to PDF if applicable.
    /// </summary>
    [CanBeNull]
    public string Producer { get; }

    /// <summary>
    /// Creates the document information from the supplied values. Any value may be <see langword="null"/>.
    /// </summary>
    /// <param name="title">Value for <see cref="Title"/>.</param>
    /// <param name="author">Value for <see cref="Author"/>.</param>
    /// <param name="subject">Value for <see cref="Subject"/>.</param>
    /// <param name="keywords">Value for <see cref="Keywords"/>.</param>
    /// <param name="creator">Value for <see cref="Creator"/>.</param>
    /// <param name="producer">Value for <see cref="Producer"/>.</param>
    internal DocumentInformation(string title, string author, string subject, string keywords, string creator, string producer)
    {
        Title = title;
        Author = author;
        Subject = subject;
        Keywords = keywords;
        Creator = creator;
        Producer = producer;

        var builder = new StringBuilder();

        AppendPart("Title", title, builder);
        AppendPart("Author", author, builder);
        AppendPart("Subject", subject, builder);
        AppendPart("Keywords", keywords, builder);
        AppendPart("Creator", creator, builder);
        AppendPart("Producer", producer, builder);

        // StringBuilder.ToString() never returns null, so no fallback value is needed.
        representation = builder.ToString();
    }

    /// <summary>
    /// Returns the non-null values as consecutive <c>Name: value;</c> parts in the order
    /// Title, Author, Subject, Keywords, Creator, Producer.
    /// </summary>
    /// <returns>The formatted values, or an empty string when every value is <see langword="null"/>.</returns>
    public override string ToString()
    {
        return representation;
    }

    /// <summary>
    /// Appends <c>name: value;</c> to the builder, skipping the part entirely when the value is <see langword="null"/>.
    /// </summary>
    private static void AppendPart(string name, string value, StringBuilder builder)
    {
        if (value == null)
        {
            return;
        }

        builder.Append(name).Append(": ").Append(value).Append(';');
    }
}
}

View File

@@ -31,25 +31,9 @@
throw new ArgumentNullException(nameof(catalog));
}
var pages = catalog.Get(CosName.PAGES) as CosObject;
rootPageDictionary = catalog.PagesDictionary;
if (pages == null)
{
throw new InvalidOperationException("No pages were present in the catalog for this PDF document");
}
var pagesObject = pdfObjectParser.Parse(pages.ToIndirectReference(), reader, isLenientParsing);
if (!(pagesObject is PdfDictionary catalogPageDictionary))
{
throw new InvalidOperationException("Could not find the root pages object: " + pages);
}
var count = catalogPageDictionary.GetIntOrDefault(CosName.COUNT);
rootPageDictionary = catalogPageDictionary;
Count = count;
Count = rootPageDictionary.GetIntOrDefault(CosName.COUNT);
this.log = log;
this.catalog = catalog;

View File

@@ -0,0 +1,34 @@
namespace UglyToad.Pdf.Exceptions
{
using System;
using System.Runtime.Serialization;
/// <summary>
/// Indicates that the structure of a PDF document was not in the format the parser expected.
/// </summary>
[Serializable]
public class PdfDocumentFormatException : Exception
{
    /// <summary>
    /// Creates the exception without a message or an inner exception.
    /// </summary>
    public PdfDocumentFormatException()
    {
    }

    /// <summary>
    /// Creates the exception with a message describing the format problem.
    /// </summary>
    /// <param name="message">The message describing the error.</param>
    public PdfDocumentFormatException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Creates the exception with a message and the exception which caused it.
    /// </summary>
    /// <param name="message">The message describing the error.</param>
    /// <param name="inner">The exception which led to this one.</param>
    public PdfDocumentFormatException(string message, Exception inner)
        : base(message, inner)
    {
    }

    /// <summary>
    /// Serialization constructor; forwards both arguments to the base <see cref="Exception"/>.
    /// </summary>
    /// <param name="info">The serialized object data.</param>
    /// <param name="context">The source or destination context of the serialization.</param>
    protected PdfDocumentFormatException(SerializationInfo info, StreamingContext context)
        : base(info, context)
    {
    }
}
}

View File

@@ -0,0 +1,46 @@
namespace UglyToad.Pdf.Parser
{
using System;
using Content;
using ContentStream;
using Cos;
using Exceptions;
using IO;
/// <summary>
/// Builds a <see cref="Catalog"/> from a catalog dictionary, resolving the pages dictionary it refers to.
/// </summary>
internal class CatalogFactory
{
    private readonly IPdfObjectParser pdfObjectParser;

    /// <summary>
    /// Creates the factory.
    /// </summary>
    /// <param name="pdfObjectParser">The parser used to resolve the pages reference.</param>
    /// <exception cref="ArgumentNullException"><paramref name="pdfObjectParser"/> is <see langword="null"/>.</exception>
    public CatalogFactory(IPdfObjectParser pdfObjectParser)
    {
        // Fail at construction rather than with a NullReferenceException on the first Create call.
        this.pdfObjectParser = pdfObjectParser ?? throw new ArgumentNullException(nameof(pdfObjectParser));
    }

    /// <summary>
    /// Validates the catalog dictionary and parses the pages dictionary it points to.
    /// </summary>
    /// <param name="dictionary">The catalog dictionary.</param>
    /// <param name="reader">The reader passed through to the object parser.</param>
    /// <param name="isLenientParsing">The leniency flag passed through to the object parser.</param>
    /// <returns>A catalog wrapping <paramref name="dictionary"/> and its parsed pages dictionary.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is <see langword="null"/>.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// The dictionary has a type other than Catalog, has no pages reference,
    /// or the pages reference does not resolve to a dictionary.
    /// </exception>
    public Catalog Create(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        if (dictionary == null)
        {
            throw new ArgumentNullException(nameof(dictionary));
        }

        // A missing type entry is tolerated; only a type which is present but not Catalog is rejected.
        if (dictionary.TryGetName(CosName.TYPE, out var type) && !type.Equals(CosName.CATALOG))
        {
            throw new PdfDocumentFormatException($"The type of the catalog dictionary was not Catalog: {dictionary}.");
        }

        if (!dictionary.TryGetItemOfType(CosName.PAGES, out CosObject value))
        {
            throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}.");
        }

        var reference = value.ToIndirectReference();

        var pages = pdfObjectParser.Parse(reference, reader, isLenientParsing);

        if (!(pages is PdfDictionary pagesDictionary))
        {
            throw new PdfDocumentFormatException($"The pages entry in the catalog {reference} did not link to a dictionary: {pages}.");
        }

        return new Catalog(dictionary, pagesDictionary);
    }
}
}

View File

@@ -0,0 +1,47 @@
namespace UglyToad.Pdf.Parser
{
using Content;
using ContentStream;
using Cos;
using IO;
/// <summary>
/// Builds the <see cref="DocumentInformation"/> for a document from the information
/// dictionary referenced by the Info entry, when one is present.
/// </summary>
internal class DocumentInformationFactory
{
    /// <summary>
    /// Reads the Info entry of <paramref name="rootDictionary"/> and converts the dictionary it refers to
    /// into a <see cref="DocumentInformation"/>.
    /// </summary>
    /// <param name="pdfObjectParser">The parser used to resolve the Info reference.</param>
    /// <param name="rootDictionary">The dictionary searched for the Info entry.</param>
    /// <param name="reader">The reader passed through to the object parser.</param>
    /// <param name="isLenientParsing">The leniency flag passed through to the object parser.</param>
    /// <returns>
    /// The parsed information, or <see cref="DocumentInformation.Default"/> when there is no Info reference
    /// or it does not resolve to a dictionary.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="rootDictionary"/> is <see langword="null"/>, or <paramref name="pdfObjectParser"/>
    /// is <see langword="null"/> when an Info reference has to be resolved.
    /// </exception>
    public DocumentInformation Create(IPdfObjectParser pdfObjectParser,
        PdfDictionary rootDictionary, IRandomAccessRead reader,
        bool isLenientParsing)
    {
        if (rootDictionary == null)
        {
            throw new System.ArgumentNullException(nameof(rootDictionary));
        }

        if (!rootDictionary.TryGetItemOfType(CosName.INFO, out CosObject infoBase))
        {
            return DocumentInformation.Default;
        }

        // Checked only once the parser is actually needed, so calls which never reach
        // this point keep returning the default information as before.
        if (pdfObjectParser == null)
        {
            throw new System.ArgumentNullException(nameof(pdfObjectParser));
        }

        var infoParsed = pdfObjectParser.Parse(infoBase.ToIndirectReference(), reader, isLenientParsing);

        if (!(infoParsed is PdfDictionary infoDictionary))
        {
            return DocumentInformation.Default;
        }

        var title = GetEntryOrDefault(infoDictionary, CosName.TITLE);
        var author = GetEntryOrDefault(infoDictionary, CosName.AUTHOR);
        var subject = GetEntryOrDefault(infoDictionary, CosName.SUBJECT);
        var keywords = GetEntryOrDefault(infoDictionary, CosName.KEYWORDS);
        var creator = GetEntryOrDefault(infoDictionary, CosName.CREATOR);
        var producer = GetEntryOrDefault(infoDictionary, CosName.PRODUCER);

        return new DocumentInformation(title, author, subject,
            keywords, creator, producer);
    }

    /// <summary>
    /// Returns the string stored under <paramref name="key"/>, or <see langword="null"/> when
    /// the entry is missing or is not a string.
    /// </summary>
    private static string GetEntryOrDefault(PdfDictionary infoDictionary, CosName key)
    {
        // NOTE(review): GetAscii presumably decodes the bytes as ASCII; PDF text strings may also be
        // UTF-16 or PDFDocEncoding, so non-ASCII metadata may need different handling - confirm.
        if (infoDictionary.TryGetItemOfType(key, out CosString str))
        {
            return str.GetAscii();
        }

        return null;
    }
}
}

View File

@@ -1,14 +1,13 @@
namespace UglyToad.Pdf.Content
namespace UglyToad.Pdf.Parser
{
using System;
using Content;
using ContentStream;
using Cos;
using Filters;
using Geometry;
using Graphics;
using IO;
using Parser;
using Util;
internal class PageFactory : IPageFactory
{

View File

@@ -78,6 +78,8 @@
var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
var pageFactory = new PageFactory(resourceContainer, pdfObjectParser, filterProvider, new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
var informationFactory = new DocumentInformationFactory();
var catalogFactory = new CatalogFactory(pdfObjectParser);
var root = ParseTrailer(reader, crossReferenceTable, dynamicParser, bruteForceSearcher, pool,
isLenientParsing);
@@ -93,9 +95,13 @@
rootDictionary.Set(CosName.TYPE, CosName.CATALOG);
}
var information = informationFactory.Create(pdfObjectParser, crossReferenceTable.Dictionary, reader, isLenientParsing);
var catalog = catalogFactory.Create(rootDictionary, reader, isLenientParsing);
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, new Catalog(rootDictionary));
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information);
}
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,

View File

@@ -7,7 +7,6 @@
using Logging;
using Parser;
using Parser.Parts;
using Util;
using Util.JetBrains.Annotations;
public class PdfDocument : IDisposable
@@ -30,6 +29,9 @@
[NotNull]
internal Pages Pages { get; }
[NotNull]
public DocumentInformation Information { get; }
/// <summary>
/// Get the number of pages in this document.
/// </summary>
@@ -40,7 +42,8 @@
ParsingCachingProviders cachingProviders,
IPageFactory pageFactory,
IPdfObjectParser pdfObjectParser,
Catalog catalog)
Catalog catalog,
DocumentInformation information)
{
this.log = log;
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
@@ -48,6 +51,7 @@
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
this.isLenientParsing = isLenientParsing;
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
Information = information ?? throw new ArgumentNullException(nameof(information));
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing);
}