mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-14 19:05:01 +08:00
start passing the pdf scanner in to read the type 1 files
This commit is contained in:
@@ -10,9 +10,9 @@
|
||||
[Fact]
|
||||
public void Tests()
|
||||
{
|
||||
//using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\ICML03-081.pdf"))
|
||||
using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\ICML03-081.pdf"))
|
||||
{
|
||||
//var page = document.GetPage(1);
|
||||
var page = document.GetPage(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -10,6 +10,7 @@
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Parser.Parts;
|
||||
using Tokenization.Scanner;
|
||||
|
||||
internal class Pages
|
||||
{
|
||||
@@ -19,13 +20,14 @@
|
||||
private readonly IPageFactory pageFactory;
|
||||
private readonly IRandomAccessRead reader;
|
||||
private readonly bool isLenientParsing;
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
private readonly PdfDictionary rootPageDictionary;
|
||||
private readonly Dictionary<int, PdfDictionary> locatedPages = new Dictionary<int, PdfDictionary>();
|
||||
|
||||
public int Count { get; }
|
||||
|
||||
internal Pages(ILog log, Catalog catalog, IPdfObjectParser pdfObjectParser, IPageFactory pageFactory,
|
||||
IRandomAccessRead reader, bool isLenientParsing)
|
||||
internal Pages(ILog log, Catalog catalog, IPdfObjectParser pdfObjectParser, IPageFactory pageFactory,
|
||||
IRandomAccessRead reader, bool isLenientParsing, IPdfObjectScanner pdfScanner)
|
||||
{
|
||||
if (catalog == null)
|
||||
{
|
||||
@@ -42,9 +44,9 @@
|
||||
this.pageFactory = pageFactory;
|
||||
this.reader = reader;
|
||||
this.isLenientParsing = isLenientParsing;
|
||||
this.pdfScanner = pdfScanner;
|
||||
}
|
||||
|
||||
|
||||
public Page GetPage(int pageNumber)
|
||||
{
|
||||
if (locatedPages.TryGetValue(pageNumber, out PdfDictionary targetPageDictionary))
|
||||
|
@@ -3,12 +3,14 @@
|
||||
using System;
|
||||
using Cos;
|
||||
using Filters;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class PdfRawStream : CosBase
|
||||
{
|
||||
private static readonly object Lock = new object();
|
||||
|
||||
private readonly byte[] streamBytes;
|
||||
private readonly StreamToken stream;
|
||||
|
||||
private byte[] decodedBytes;
|
||||
|
||||
@@ -24,6 +26,11 @@
|
||||
Dictionary = streamDictionary ?? throw new ArgumentNullException(nameof(streamDictionary));
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a raw stream backed by an already tokenized <see cref="StreamToken"/>.
/// </summary>
/// <param name="stream">The token providing the stream data and stream dictionary. Must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="stream"/> is null.</exception>
public PdfRawStream(StreamToken stream)
{
    // Decode chooses the token based path by testing this field against null, so a null
    // token would silently select the byte array path whose fields this constructor never sets.
    this.stream = stream ?? throw new ArgumentNullException(nameof(stream));
}
|
||||
|
||||
public byte[] Decode(IFilterProvider filterProvider)
|
||||
{
|
||||
lock (Lock)
|
||||
@@ -33,6 +40,20 @@
|
||||
return decodedBytes;
|
||||
}
|
||||
|
||||
if (stream != null)
|
||||
{
|
||||
var tokenFilters = filterProvider.GetFilters(stream.StreamDictionary);
|
||||
|
||||
if (tokenFilters.Count > 0)
|
||||
{
|
||||
throw new NotImplementedException("Need to change everything to use this method.");
|
||||
}
|
||||
|
||||
decodedBytes = stream.Data;
|
||||
|
||||
return decodedBytes;
|
||||
}
|
||||
|
||||
var filters = filterProvider.GetFilters(Dictionary);
|
||||
|
||||
var transform = streamBytes;
|
||||
|
@@ -2,9 +2,12 @@
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal interface IFilterProvider
|
||||
{
|
||||
IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary);
|
||||
|
||||
IReadOnlyList<IFilter> GetFilters(PdfDictionary streamDictionary);
|
||||
|
||||
IReadOnlyList<IFilter> GetAllFilters();
|
||||
|
@@ -5,7 +5,9 @@
|
||||
using System.Linq;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Exceptions;
|
||||
using Logging;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class MemoryFilterProvider : IFilterProvider
|
||||
{
|
||||
@@ -31,6 +33,30 @@
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Resolves the filters named by the filter entry of a tokenized stream dictionary.
/// </summary>
/// <param name="dictionary">The stream dictionary to inspect.</param>
/// <returns>
/// The filters in the order they are listed, or an empty list when the dictionary has no filter entry.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="dictionary"/> is null.</exception>
/// <exception cref="PdfDocumentFormatException">
/// Thrown when the filter entry is neither a name nor an array, or when an array element is not a name.
/// </exception>
public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
{
    if (dictionary == null)
    {
        throw new ArgumentNullException(nameof(dictionary));
    }

    if (!dictionary.TryGetByName(CosName.FILTER, out var token))
    {
        return new IFilter[0];
    }

    switch (token)
    {
        case ArrayToken filters:
            {
                var result = new List<IFilter>();

                foreach (var element in filters.Data)
                {
                    // Report a malformed array as a document format problem, like the default branch,
                    // instead of letting an unchecked cast raise InvalidCastException.
                    if (!(element is NameToken filterName))
                    {
                        throw new PdfDocumentFormatException($"The filter array for the stream contained an invalid entry. Expected name, instead got: {element}.");
                    }

                    result.Add(GetFilterStrict(filterName.Data));
                }

                return result;
            }
        case NameToken name:
            return new[] { GetFilterStrict(name.Data) };
        default:
            throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
    }
}
|
||||
|
||||
public IReadOnlyList<IFilter> GetFilters(PdfDictionary streamDictionary)
|
||||
{
|
||||
if (streamDictionary == null)
|
||||
|
@@ -1,5 +1,6 @@
|
||||
namespace UglyToad.PdfPig.Fonts.Parser.Handlers
|
||||
{
|
||||
using System;
|
||||
using Cmap;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
@@ -10,6 +11,8 @@
|
||||
using Parts;
|
||||
using PdfPig.Parser;
|
||||
using Simple;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class Type1FontHandler : IFontHandler
|
||||
{
|
||||
@@ -18,15 +21,18 @@
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly FontDescriptorFactory fontDescriptorFactory;
|
||||
private readonly IEncodingReader encodingReader;
|
||||
private readonly IPdfObjectScanner scanner;
|
||||
|
||||
public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider,
|
||||
FontDescriptorFactory fontDescriptorFactory, IEncodingReader encodingReader)
|
||||
FontDescriptorFactory fontDescriptorFactory, IEncodingReader encodingReader,
|
||||
IPdfObjectScanner scanner)
|
||||
{
|
||||
this.pdfObjectParser = pdfObjectParser;
|
||||
this.cMapCache = cMapCache;
|
||||
this.filterProvider = filterProvider;
|
||||
this.fontDescriptorFactory = fontDescriptorFactory;
|
||||
this.encodingReader = encodingReader;
|
||||
this.scanner = scanner;
|
||||
}
|
||||
|
||||
public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
|
||||
@@ -53,7 +59,9 @@
|
||||
var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);
|
||||
|
||||
var descriptor = FontDictionaryAccessHelper.GetFontDescriptor(pdfObjectParser, fontDescriptorFactory, dictionary, reader, isLenientParsing);
|
||||
|
||||
|
||||
ParseType1Font(descriptor, isLenientParsing);
|
||||
|
||||
var name = FontDictionaryAccessHelper.GetName(pdfObjectParser, dictionary, descriptor, reader, isLenientParsing);
|
||||
|
||||
CMap toUnicodeCMap = null;
|
||||
@@ -73,5 +81,41 @@
|
||||
|
||||
return new Type1Font(name, firstCharacter, lastCharacter, widths, descriptor, encoding, toUnicodeCMap);
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads and decodes the stream referenced by the descriptor's font file, when there is one.
/// The decoded bytes are not consumed yet.
/// </summary>
/// <param name="descriptor">The font descriptor, may be null.</param>
/// <param name="isLenientParsing">When true, any failure while reading the stream is ignored.</param>
private void ParseType1Font(FontDescriptor descriptor, bool isLenientParsing)
{
    // Nothing to read without a font file, or when its key has object number zero.
    if (descriptor?.FontFile == null || descriptor.FontFile.ObjectKey.ObjectNumber == 0)
    {
        return;
    }

    try
    {
        if (!(scanner.Get(descriptor.FontFile.ObjectKey).Data is StreamToken fontStream))
        {
            return;
        }

        var fontBytes = new PdfRawStream(fontStream).Decode(filterProvider);

        // TODO: parse
    }
    catch when (isLenientParsing)
    {
        // Lenient parsing treats the font file as optional: the failure is dropped.
        // In strict mode the filter does not match and the exception propagates unchanged.
    }
}
|
||||
}
|
||||
}
|
||||
|
@@ -18,6 +18,7 @@
|
||||
using Logging;
|
||||
using Parts;
|
||||
using Tokenization.Scanner;
|
||||
using Tokenization.Tokens;
|
||||
using Util;
|
||||
|
||||
internal static class PdfDocumentFactory
|
||||
@@ -83,12 +84,13 @@
|
||||
|
||||
var cMapCache = new CMapCache(new CMapParser());
|
||||
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, new ObjectLocationProvider(crossReferenceTable, pool, bruteForceSearcher));
|
||||
var fontFactory = new FontFactory(log, new Type0FontHandler(cidFontFactory,
|
||||
cMapCache,
|
||||
filterProvider,
|
||||
pdfObjectParser),
|
||||
new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
|
||||
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader),
|
||||
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader, pdfScanner),
|
||||
new Type3FontHandler(pdfObjectParser, cMapCache, filterProvider, encodingReader));
|
||||
|
||||
var dynamicParser = container.Get<DynamicParser>();
|
||||
@@ -118,9 +120,9 @@
|
||||
|
||||
var caching = new ParsingCachingProviders(pool, bruteForceSearcher, resourceContainer);
|
||||
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, null);
|
||||
|
||||
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information);
|
||||
|
||||
return new PdfDocument(log, reader, version, crossReferenceTable, isLenientParsing, caching, pageFactory, pdfObjectParser, catalog, information,
|
||||
pdfScanner);
|
||||
}
|
||||
|
||||
private static CosBase ParseTrailer(IRandomAccessRead reader, CrossReferenceTable crossReferenceTable,
|
||||
|
@@ -6,6 +6,7 @@
|
||||
using IO;
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Tokenization.Scanner;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <inheritdoc />
|
||||
@@ -26,6 +27,8 @@
|
||||
[NotNull]
|
||||
private readonly ParsingCachingProviders cachingProviders;
|
||||
|
||||
private readonly IPdfObjectScanner pdfScanner;
|
||||
|
||||
[NotNull]
|
||||
internal Catalog Catalog { get; }
|
||||
|
||||
@@ -49,12 +52,12 @@
|
||||
public int NumberOfPages => Pages.Count;
|
||||
|
||||
internal PdfDocument(ILog log, IRandomAccessRead reader, HeaderVersion version, CrossReferenceTable crossReferenceTable,
|
||||
bool isLenientParsing,
|
||||
bool isLenientParsing,
|
||||
ParsingCachingProviders cachingProviders,
|
||||
IPageFactory pageFactory,
|
||||
IPdfObjectParser pdfObjectParser,
|
||||
Catalog catalog,
|
||||
DocumentInformation information)
|
||||
DocumentInformation information, IPdfObjectScanner pdfScanner)
|
||||
{
|
||||
this.log = log;
|
||||
this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
|
||||
@@ -62,9 +65,10 @@
|
||||
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
|
||||
this.isLenientParsing = isLenientParsing;
|
||||
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
||||
this.pdfScanner = pdfScanner;
|
||||
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||
Catalog = catalog ?? throw new ArgumentNullException(nameof(catalog));
|
||||
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing);
|
||||
Pages = new Pages(log, Catalog, pdfObjectParser, pageFactory, reader, isLenientParsing, pdfScanner);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@@ -23,18 +23,24 @@
|
||||
/// <summary>
/// Creates a location provider whose offsets are seeded from the cross reference table.
/// </summary>
/// <param name="crossReferenceTable">The table whose object offsets are copied. Must not be null.</param>
/// <param name="pool">The object pool, stored for later use.</param>
/// <param name="searcher">The brute force searcher, stored for later use.</param>
/// <exception cref="System.ArgumentNullException">Thrown when <paramref name="crossReferenceTable"/> is null.</exception>
public ObjectLocationProvider(CrossReferenceTable crossReferenceTable, CosObjectPool pool, BruteForceSearcher searcher)
{
    // Fail with a named argument instead of a NullReferenceException from the loop below.
    this.crossReferenceTable = crossReferenceTable ?? throw new System.ArgumentNullException(nameof(crossReferenceTable));
    this.pool = pool;
    this.searcher = searcher;

    // Copy the known offsets so that later updates do not touch the table itself.
    foreach (var offset in crossReferenceTable.ObjectOffsets)
    {
        offsets[offset.Key] = offset.Value;
    }
}
|
||||
|
||||
/// <summary>
/// Looks up the recorded offset for an indirect reference.
/// </summary>
/// <param name="reference">The reference to look up.</param>
/// <param name="offset">The recorded offset when the lookup succeeds.</param>
/// <returns>True when an offset is recorded for the reference, otherwise false.</returns>
public bool TryGetOffset(IndirectReference reference, out long offset)
{
    // The leftover NotImplementedException placeholder made this lookup unreachable.
    return offsets.TryGetValue(reference, out offset);
}
|
||||
|
||||
/// <summary>
/// Records the offset for an indirect reference, replacing any previously recorded value.
/// </summary>
/// <param name="reference">The reference whose offset is being recorded.</param>
/// <param name="offset">The offset to store.</param>
public void UpdateOffset(IndirectReference reference, long offset)
{
    // The leftover NotImplementedException placeholder made this assignment unreachable.
    offsets[reference] = offset;
}
|
||||
}
|
||||
}
|
@@ -1,5 +1,6 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
@@ -9,7 +10,12 @@
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
|
||||
internal class PdfTokenScanner : ISeekableTokenScanner
|
||||
/// <summary>
/// A seekable token scanner which can also fetch an object by its indirect reference.
/// </summary>
internal interface IPdfObjectScanner : ISeekableTokenScanner
|
||||
{
|
||||
/// <summary>
/// Gets the object token identified by the indirect reference.
/// </summary>
/// <param name="reference">The reference of the object to read.</param>
/// <returns>The object token for the reference.</returns>
ObjectToken Get(IndirectReference reference);
|
||||
}
|
||||
|
||||
internal class PdfTokenScanner : IPdfObjectScanner
|
||||
{
|
||||
private readonly IInputBytes inputBytes;
|
||||
private readonly IObjectLocationProvider objectLocationProvider;
|
||||
@@ -449,5 +455,22 @@
|
||||
{
|
||||
coreTokenScanner.DeregisterCustomTokenizer(tokenizer);
|
||||
}
|
||||
|
||||
/// <summary>
/// Seeks to the offset recorded for the reference and reads the token found there.
/// </summary>
/// <param name="reference">The reference of the object to read.</param>
/// <returns>The object token read at the recorded offset.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no offset is known for the reference, when no token can be read at the offset,
/// or when the token read is not an object token.
/// </exception>
public ObjectToken Get(IndirectReference reference)
{
    if (!objectLocationProvider.TryGetOffset(reference, out var offset))
    {
        throw new InvalidOperationException($"Could not find the object with reference: {reference}.");
    }

    // Note: this moves the scanner, the previous position is not restored afterwards.
    Seek(offset);

    if (!MoveNext())
    {
        throw new InvalidOperationException($"Could not parse the object with reference: {reference}.");
    }

    // A wrong offset can land on a non-object token; report it the same way as the other
    // failures rather than through an InvalidCastException from an unchecked cast.
    if (!(CurrentToken is ObjectToken objectToken))
    {
        throw new InvalidOperationException($"The token read for reference: {reference} was not an object, instead got: {CurrentToken}.");
    }

    return objectToken;
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user