start passing the pdf scanner in to read the type 1 files

This commit is contained in:
Eliot Jones
2018-01-14 15:33:22 +00:00
parent 1fb6ec41d1
commit 615ee88a46
10 changed files with 148 additions and 17 deletions

View File

@@ -10,9 +10,9 @@
// Smoke test: opens a PDF document and reads its first page. There are no assertions,
// so the test can only fail if one of the calls throws.
// NOTE(review): the path is a hard-coded absolute location under a user's Downloads folder,
// so this presumably only passes on a machine where that file exists - confirm, and
// consider switching to a test resource shipped with the project.
[Fact]
public void Tests()
{
//using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\ICML03-081.pdf"))
using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\ICML03-081.pdf"))
{
//var page = document.GetPage(1);
var page = document.GetPage(1);
}
}
}

View File

@@ -10,6 +10,7 @@
using Logging;
using Parser;
using Parser.Parts;
using Tokenization.Scanner;
internal class Pages
{
@@ -19,13 +20,14 @@
private readonly IPageFactory pageFactory;
private readonly IRandomAccessRead reader;
private readonly bool isLenientParsing;
private readonly IPdfObjectScanner pdfScanner;
private readonly PdfDictionary rootPageDictionary;
private readonly Dictionary<int, PdfDictionary> locatedPages = new Dictionary<int, PdfDictionary>();
public int Count { get; }
internal Pages(ILog log, Catalog catalog, IPdfObjectParser pdfObjectParser, IPageFactory pageFactory,
IRandomAccessRead reader, bool isLenientParsing)
internal Pages(ILog log, Catalog catalog, IPdfObjectParser pdfObjectParser, IPageFactory pageFactory,
IRandomAccessRead reader, bool isLenientParsing, IPdfObjectScanner pdfScanner)
{
if (catalog == null)
{
@@ -42,9 +44,9 @@
this.pageFactory = pageFactory;
this.reader = reader;
this.isLenientParsing = isLenientParsing;
this.pdfScanner = pdfScanner;
}
public Page GetPage(int pageNumber)
{
if (locatedPages.TryGetValue(pageNumber, out PdfDictionary targetPageDictionary))

View File

@@ -3,12 +3,14 @@
using System;
using Cos;
using Filters;
using Tokenization.Tokens;
internal class PdfRawStream : CosBase
{
private static readonly object Lock = new object();
private readonly byte[] streamBytes;
private readonly StreamToken stream;
private byte[] decodedBytes;
@@ -24,6 +26,11 @@
Dictionary = streamDictionary ?? throw new ArgumentNullException(nameof(streamDictionary));
}
/// <summary>
/// Creates a raw stream backed by a tokenized stream object.
/// </summary>
/// <param name="stream">The stream token to decode from; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="stream"/> is null.</exception>
public PdfRawStream(StreamToken stream)
{
    // Decode checks this field for null to choose the token-based path, so a null token
    // would silently fall through to the dictionary-based path. Fail fast instead, in the
    // same way the other constructor guards its stream dictionary.
    this.stream = stream ?? throw new ArgumentNullException(nameof(stream));
}
public byte[] Decode(IFilterProvider filterProvider)
{
lock (Lock)
@@ -33,6 +40,20 @@
return decodedBytes;
}
if (stream != null)
{
var tokenFilters = filterProvider.GetFilters(stream.StreamDictionary);
if (tokenFilters.Count > 0)
{
throw new NotImplementedException("Need to change everything to use this method.");
}
decodedBytes = stream.Data;
return decodedBytes;
}
var filters = filterProvider.GetFilters(Dictionary);
var transform = streamBytes;

View File

@@ -2,9 +2,12 @@
{
using System.Collections.Generic;
using ContentStream;
using Tokenization.Tokens;
internal interface IFilterProvider
{
IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary);
IReadOnlyList<IFilter> GetFilters(PdfDictionary streamDictionary);
IReadOnlyList<IFilter> GetAllFilters();

View File

@@ -5,7 +5,9 @@
using System.Linq;
using ContentStream;
using Cos;
using Exceptions;
using Logging;
using Tokenization.Tokens;
internal class MemoryFilterProvider : IFilterProvider
{
@@ -31,6 +33,30 @@
};
}
/// <summary>
/// Resolves the filters named by the filter entry of a stream dictionary token.
/// </summary>
/// <param name="dictionary">The stream dictionary to inspect; must not be null.</param>
/// <returns>
/// An empty list when the dictionary has no filter entry, otherwise one filter per name
/// in the order the names appear.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="dictionary"/> is null.</exception>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the filter entry is neither a name nor an array, or when an array element is not a name.
/// </exception>
public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
{
    if (dictionary == null)
    {
        throw new ArgumentNullException(nameof(dictionary));
    }

    if (!dictionary.TryGetByName(CosName.FILTER, out var token))
    {
        return new IFilter[0];
    }

    switch (token)
    {
        case ArrayToken filters:
            return filters.Data.Select(x =>
            {
                // A malformed document may put something other than a name in the array;
                // report it as a format problem rather than letting a blind cast fail.
                if (!(x is NameToken filterName))
                {
                    throw new PdfDocumentFormatException($"The filter array for the stream contained an invalid entry. Expected name, instead got: {x}.");
                }

                return GetFilterStrict(filterName.Data);
            }).ToList();
        case NameToken name:
            return new[] { GetFilterStrict(name.Data) };
        default:
            throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
    }
}
public IReadOnlyList<IFilter> GetFilters(PdfDictionary streamDictionary)
{
if (streamDictionary == null)

View File

@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Fonts.Parser.Handlers
{
using System;
using Cmap;
using ContentStream;
using Cos;
@@ -10,6 +11,8 @@
using Parts;
using PdfPig.Parser;
using Simple;
using Tokenization.Scanner;
using Tokenization.Tokens;
internal class Type1FontHandler : IFontHandler
{
@@ -18,15 +21,18 @@
private readonly IFilterProvider filterProvider;
private readonly FontDescriptorFactory fontDescriptorFactory;
private readonly IEncodingReader encodingReader;
private readonly IPdfObjectScanner scanner;
/// <summary>
/// Creates a Type 1 font handler from its collaborators.
/// </summary>
/// <param name="pdfObjectParser">Parser stored for use when reading font dictionary entries.</param>
/// <param name="cMapCache">Cache of character maps.</param>
/// <param name="filterProvider">Supplies the filters used when decoding stream data.</param>
/// <param name="fontDescriptorFactory">Factory used to build the font descriptor.</param>
/// <param name="encodingReader">Reader for the font encoding.</param>
/// <param name="scanner">Object scanner used to fetch the embedded font file stream by reference.</param>
public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider,
    FontDescriptorFactory fontDescriptorFactory, IEncodingReader encodingReader,
    IPdfObjectScanner scanner)
{
    this.pdfObjectParser = pdfObjectParser;
    this.cMapCache = cMapCache;
    this.filterProvider = filterProvider;
    this.fontDescriptorFactory = fontDescriptorFactory;
    this.encodingReader = encodingReader;
    this.scanner = scanner;
}
public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
@@ -53,7 +59,9 @@
var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
ParseType1Font(descriptor, isLenientParsing);
var name = FontDictionaryAccessHelper.GetName(pdfObjectParser, dictionary, descriptor, reader, isLenientParsing);
CMap toUnicodeCMap = null;
@@ -73,5 +81,41 @@
return new Type1Font(name, firstCharacter, lastCharacter, widths, descriptor, encoding, toUnicodeCMap);
}
/// <summary>
/// Fetches and decodes the embedded font file referenced by the descriptor, if there is one.
/// The decoded bytes are not consumed yet.
/// </summary>
/// <param name="descriptor">The font descriptor, which may be null or lack a font file.</param>
/// <param name="isLenientParsing">When true, any failure while reading the font file is ignored.</param>
private void ParseType1Font(FontDescriptor descriptor, bool isLenientParsing)
{
    // Nothing to read without a font file entry or with an object number of zero.
    if (descriptor?.FontFile == null || descriptor.FontFile.ObjectKey.ObjectNumber == 0)
    {
        return;
    }

    try
    {
        var fontObject = scanner.Get(descriptor.FontFile.ObjectKey);

        // Only stream objects can carry the font program; anything else is skipped.
        if (fontObject.Data is StreamToken fontStream)
        {
            var decoded = new PdfRawStream(fontStream).Decode(filterProvider);

            // TODO: parse
        }
    }
    catch
    {
        if (isLenientParsing)
        {
            return;
        }

        throw;
    }
}
}
}

View File

@@ -18,6 +18,7 @@
using Logging;
using Parts;
using Tokenization.Scanner;
using Tokenization.Tokens;
using Util;
internal static class PdfDocumentFactory
@@ -83,12 +84,13 @@
var cMapCache = new CMapCache(new CMapParser());
var pdfScanner = new PdfTokenScanner(inputBytes, new ObjectLocationProvider(crossReferenceTable, pool, bruteForceSearcher));
var fontFactory = new FontFactory(log, new Type0FontHandler(cidFontFactory,
cMapCache,
filterProvider,
pdfObjectParser),
new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader),
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, pdfScanner),
new Type3FontHandler(pdfObjectParser, cMapCache, filterProvider, encodingReader));
var dynamicParser = container.Get<DynamicParser>();
@@ -118,9 +120,9 @@
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
var pdfScanner = new PdfTokenScanner(inputBytes, null);
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information);
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information,
pdfScanner);
}
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,

View File

@@ -6,6 +6,7 @@
using IO;
using Logging;
using Parser;
using Tokenization.Scanner;
using Util.JetBrains.Annotations;
/// <inheritdoc />
@@ -26,6 +27,8 @@
[NotNull]
private readonly ParsingCachingProviders cachingProviders;
private readonly IPdfObjectScanner pdfScanner;
[NotNull]
internal Catalog Catalog { get; }
@@ -49,12 +52,12 @@
public int NumberOfPages => Pages.Count;
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
bool isLenientParsing,
bool isLenientParsing,
ParsingCachingProviders cachingProviders,
IPageFactory pageFactory,
IPdfObjectParser pdfObjectParser,
Catalog catalog,
DocumentInformation information)
DocumentInformation information, IPdfObjectScanner pdfScanner)
{
this.log = log;
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
@@ -62,9 +65,10 @@
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
this.isLenientParsing = isLenientParsing;
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
this.pdfScanner = pdfScanner;
Information = information ?? throw new ArgumentNullException(nameof(information));
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing);
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing, pdfScanner);
}
/// <summary>

View File

@@ -23,18 +23,24 @@
/// <summary>
/// Creates a location provider and seeds its offset lookup from the cross reference table.
/// </summary>
/// <param name="crossReferenceTable">Source of the initial object offsets.</param>
/// <param name="pool">Object pool kept for later use.</param>
/// <param name="searcher">Brute force searcher kept for later use.</param>
public ObjectLocationProvider(CrossReferenceTable crossReferenceTable, CosObjectPool pool, BruteForceSearcher searcher)
{
    this.crossReferenceTable = crossReferenceTable;
    this.pool = pool;
    this.searcher = searcher;

    // Copy every offset from the table into the local lookup.
    foreach (var entry in crossReferenceTable.ObjectOffsets)
    {
        offsets[entry.Key] = entry.Value;
    }
}
/// <summary>
/// Looks up the offset recorded for the given indirect reference.
/// </summary>
/// <param name="reference">The reference of the object to locate.</param>
/// <param name="offset">Receives the recorded offset when the lookup succeeds.</param>
/// <returns>True when an offset is recorded for the reference, otherwise false.</returns>
public bool TryGetOffset(IndirectReference reference, out long offset)
{
    // The placeholder throw that preceded this lookup made it unreachable; it is removed.
    return offsets.TryGetValue(reference, out offset);
}
/// <summary>
/// Records the offset for the given indirect reference, replacing any existing entry.
/// </summary>
/// <param name="reference">The reference of the object whose location is being recorded.</param>
/// <param name="offset">The offset to store for the reference.</param>
public void UpdateOffset(IndirectReference reference, long offset)
{
    // The placeholder throw that preceded this assignment made it unreachable; it is removed.
    offsets[reference] = offset;
}
}
}

View File

@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
@@ -9,7 +10,12 @@
using Parser.Parts;
using Tokens;
internal class PdfTokenScanner : ISeekableTokenScanner
/// <summary>
/// A seekable token scanner which can also retrieve an object by its indirect reference.
/// </summary>
internal interface IPdfObjectScanner : ISeekableTokenScanner
{
/// <summary>
/// Gets the object token identified by the given indirect reference.
/// </summary>
/// <param name="reference">The reference of the object to retrieve.</param>
/// <returns>The object token for the reference.</returns>
ObjectToken Get(IndirectReference reference);
}
internal class PdfTokenScanner : IPdfObjectScanner
{
private readonly IInputBytes inputBytes;
private readonly IObjectLocationProvider objectLocationProvider;
@@ -449,5 +455,22 @@
{
coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
}
/// <summary>
/// Seeks to the recorded offset of the referenced object and reads it.
/// </summary>
/// <param name="reference">The indirect reference of the object to read.</param>
/// <returns>The object token read at the recorded offset.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no offset is known for the reference, when no token can be read at the offset,
/// or when the token read there is not an object token.
/// </exception>
public ObjectToken Get(IndirectReference reference)
{
    if (!objectLocationProvider.TryGetOffset(reference, out var offset))
    {
        throw new InvalidOperationException($"Could not find the object with reference: {reference}.");
    }

    Seek(offset);

    if (!MoveNext())
    {
        throw new InvalidOperationException($"Could not parse the object with reference: {reference}.");
    }

    // A successful MoveNext does not guarantee an object token was read (for example when the
    // recorded offset is wrong), so check the type rather than relying on a blind cast.
    if (!(CurrentToken is ObjectToken objectToken))
    {
        throw new InvalidOperationException($"Expected an object token for reference: {reference}, instead got: {CurrentToken}.");
    }

    return objectToken;
}
}
}