Merge branch 'master' of https://github.com/mvantzet/PdfPig into ITokenWriter

This commit is contained in:
mvantzet
2022-12-20 11:08:13 +01:00
20 changed files with 230 additions and 51 deletions

View File

@@ -10,7 +10,8 @@
public static class AdvancedTextExtraction
{
public static void Run(string filePath)
{
{
#if YET_TO_BE_DONE
var sb = new StringBuilder();
using (var document = PdfDocument.Open(filePath))
@@ -86,6 +87,7 @@
}
Console.WriteLine(sb.ToString());
#endif
}
}
}

View File

@@ -45,9 +45,14 @@
},
{7,
("Advance text extraction using layout analysis algorithms",
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
}
};
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
},
{
8,
("Extract Words with newline detection (example with algorithm). Issue 512",
() => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "OPEN.RABBIT.ENGLISH.LOP.pdf")))
}
};
var choices = string.Join(Environment.NewLine, examples.Select(x => $"{x.Key}: {x.Value.name}"));

View File

@@ -0,0 +1,34 @@
namespace UglyToad.PdfPig.Fonts
{
using System;
using System.Runtime.Serialization;
/// <summary>
/// Thrown when a PDF contains an invalid compressed data stream.
/// </summary>
/// <remarks>
/// Marked <see cref="SerializableAttribute"/> and provides the standard set of exception
/// constructors, including the protected serialization constructor.
/// </remarks>
[Serializable]
public class CorruptCompressedDataException : Exception
{
/// <summary>
/// Create a new <see cref="CorruptCompressedDataException"/> with no message.
/// </summary>
public CorruptCompressedDataException()
{
}
/// <summary>
/// Create a new <see cref="CorruptCompressedDataException"/> with the given message.
/// </summary>
/// <param name="message">The message passed to the base <see cref="Exception"/>.</param>
public CorruptCompressedDataException(string message) : base(message)
{
}
/// <summary>
/// Create a new <see cref="CorruptCompressedDataException"/> wrapping another exception.
/// </summary>
/// <param name="message">The message passed to the base <see cref="Exception"/>.</param>
/// <param name="inner">The exception passed to the base <see cref="Exception"/> as the inner exception.</param>
public CorruptCompressedDataException(string message, Exception inner) : base(message, inner)
{
}
/// <summary>
/// Serialization constructor; forwards both arguments to the base <see cref="Exception"/>.
/// </summary>
/// <param name="info">The serialization information passed to the base constructor.</param>
/// <param name="context">The streaming context passed to the base constructor.</param>
protected CorruptCompressedDataException(
SerializationInfo info,
StreamingContext context) : base(info, context)
{
}
}
}

View File

@@ -0,0 +1,54 @@
namespace UglyToad.PdfPig.Tests.Integration;
using System.Linq;
using Xunit;
/// <summary>
/// Integration tests that open the "FICTIF_TABLE_INDEX.pdf" document and check its
/// page count and the text found on each page.
/// </summary>
public class IndexedPageSummaryFileTests
{
    // Page count asserted by the page-count tests below.
    private const int ExpectedPageCount = 14;

    // Resolves the path of the document under test through the shared integration helper.
    private static string GetFilename() => IntegrationHelpers.GetDocumentPath("FICTIF_TABLE_INDEX.pdf");

    [Fact]
    public void HasCorrectNumberOfPages()
    {
        using var document = PdfDocument.Open(GetFilename());

        Assert.Equal(ExpectedPageCount, document.NumberOfPages);
    }

    [Fact]
    public void GetPagesWorks()
    {
        using var document = PdfDocument.Open(GetFilename());

        // Enumerate the pages rather than reading the NumberOfPages property.
        var enumeratedPages = document.GetPages().Count();

        Assert.Equal(ExpectedPageCount, enumeratedPages);
    }

    [Theory]
    [InlineData("M. HERNANDEZ DANIEL", 1)]
    [InlineData("M. HERNANDEZ DANIEL", 2)]
    [InlineData("Mme ALIBERT CHLOE AA", 3)]
    [InlineData("Mme ALIBERT CHLOE AA", 4)]
    [InlineData("M. SIMPSON BART AAA", 5)]
    [InlineData("M. SIMPSON BART AAA", 6)]
    [InlineData("M. BOND JAMES A", 7)]
    [InlineData("M. BOND JAMES A", 8)]
    [InlineData("M. DE BALZAC HONORE", 9)]
    [InlineData("M. DE BALZAC HONORE", 10)]
    [InlineData("M. STALLONE SILVESTER", 11)]
    [InlineData("M. STALLONE SILVESTER", 12)]
    [InlineData("M. SCOTT MICHAEL", 13)]
    [InlineData("M. SCOTT MICHAEL", 14)]
    public void CheckSpecificNamesPresence_InIndexedPageNumbersFile(string searchedName, int pageNumber)
    {
        using var document = PdfDocument.Open(GetFilename());

        // Each listed name must appear in the text of the given page.
        var pageText = document.GetPage(pageNumber).Text;

        Assert.Contains(searchedName, pageText);
    }
}

View File

@@ -51,7 +51,7 @@
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 7 : 9, result.OffsetInFile);
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 7 : 9, result.OffsetInFile);
}
[Fact]
@@ -66,38 +66,42 @@
[Fact]
public void HeaderPrecededByJunkNonLenientDoesNotThrow()
{
var scanner = StringBytesTestConverter.Scanner(@"one
%PDF-1.2");
{
var input = @"one
%PDF-1.2";
var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, false, log);
Assert.Equal(1.2m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
}
[Fact]
public void HeaderPrecededByJunkLenientReads()
{
var scanner = StringBytesTestConverter.Scanner(@"one
%PDF-1.7");
{
var input = @"one
%PDF-1.7";
var scanner = StringBytesTestConverter.Scanner(input);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.7m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 12 : 13, result.OffsetInFile);
Assert.Equal(TestEnvironment.IsSingleByteNewLine(input) ? 12 : 13, result.OffsetInFile);
}
[Fact]
public void HeaderPrecededByJunkDoesNotThrow()
{
var scanner = StringBytesTestConverter.Scanner(@"one two
three %PDF-1.6");
{
var s = @"one two
three %PDF-1.6";
var scanner = StringBytesTestConverter.Scanner(s);
var result = FileHeaderParser.Parse(scanner.scanner, scanner.bytes, true, log);
Assert.Equal(1.6m, result.Version);
Assert.Equal(TestEnvironment.IsUnixPlatform ? 14 : 15, result.OffsetInFile);
Assert.Equal(TestEnvironment.IsSingleByteNewLine(s) ? 14 : 15, result.OffsetInFile);
}
[Fact]

View File

@@ -4,6 +4,7 @@
/// <summary>
/// Newline-related helpers used by tests whose expected offsets depend on line endings.
/// </summary>
public static class TestEnvironment
{
    /// <summary>
    /// <c>true</c> when <see cref="Environment.NewLine"/> is a single character long.
    /// </summary>
    public static readonly bool IsUnixPlatform = Environment.NewLine.Length == 1;

    /// <summary>
    /// Reports whether <paramref name="s"/> contains no carriage return (<c>'\r'</c>) character.
    /// </summary>
    /// <param name="s">The text to inspect.</param>
    /// <returns><c>true</c> if no <c>'\r'</c> occurs in <paramref name="s"/>; otherwise <c>false</c>.</returns>
    public static bool IsSingleByteNewLine(string s)
    {
        // IndexOf yields -1 when the character is absent.
        return s.IndexOf('\r') == -1;
    }
}
}

View File

@@ -29,7 +29,12 @@
/// <summary>
/// The page tree for this document containing all pages, page numbers and their dictionaries.
/// </summary>
public PageTreeNode PageTree { get; }
public PageTreeNode PageTree { get; }
/// <summary>
/// Number of discovered pages, read from the count of <c>pagesByNumber</c>.
/// This is <see langword="null"/> when <c>pagesByNumber</c> itself is <see langword="null"/>.
/// </summary>
public int? NumberOfDiscoveredPages => pagesByNumber?.Count;
/// <summary>
/// Create a new <see cref="CatalogDictionary"/>.

View File

@@ -6,7 +6,7 @@
internal interface IResourceStore
{
void LoadResourceDictionary(DictionaryToken resourceDictionary);
void LoadResourceDictionary(DictionaryToken resourceDictionary, InternalParsingOptions parsingOptions);
/// <summary>
/// Remove any named resources and associated state for the last resource dictionary loaded.

View File

@@ -44,7 +44,7 @@
/// <summary>
/// Create a <see cref="PageRotationDegrees"/>.
/// </summary>
/// <param name="rotation">Rotation in degrees clockwise.</param>
/// <param name="rotation">Rotation in degrees clockwise, must be a multiple of 90.</param>
public PageRotationDegrees(int rotation)
{
if (rotation < 0)

View File

@@ -21,6 +21,13 @@
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
var CountOfPagesByPagesTree = catalog.PageTree.Children.Count;
var numberOfDiscoveredPages = catalog.NumberOfDiscoveredPages;
if (numberOfDiscoveredPages is null == false && Count != numberOfDiscoveredPages)
{
//log.Warning($"Dictionary Page Count {Count} different to discovered pages {numberOfDiscoveredPages}. Using {numberOfDiscoveredPages}.");
Count = numberOfDiscoveredPages.Value;
}
}
public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions)

View File

@@ -33,7 +33,7 @@
this.fontFactory = fontFactory;
}
public void LoadResourceDictionary(DictionaryToken resourceDictionary)
public void LoadResourceDictionary(DictionaryToken resourceDictionary, InternalParsingOptions parsingOptions)
{
lastLoadedFont = (null, null);
@@ -43,7 +43,7 @@
{
var fontDictionary = DirectObjectFinder.Get<DictionaryToken>(fontBase, scanner);
LoadFontDictionary(fontDictionary);
LoadFontDictionary(fontDictionary, parsingOptions);
}
if (resourceDictionary.TryGet(NameToken.Xobject, out var xobjectBase))
@@ -132,7 +132,7 @@
currentResourceState.Pop();
}
private void LoadFontDictionary(DictionaryToken fontDictionary)
private void LoadFontDictionary(DictionaryToken fontDictionary, InternalParsingOptions parsingOptions)
{
lastLoadedFont = (null, null);
@@ -157,7 +157,18 @@
continue;
}
loadedFonts[reference] = fontFactory.Get(fontObject);
try
{
loadedFonts[reference] = fontFactory.Get(fontObject);
}
catch
{
if (!parsingOptions.SkipMissingFonts)
{
throw;
}
}
}
else if (pair.Value is DictionaryToken fd)
{

View File

@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.Filters
{
using Fonts;
using System;
using System.Collections.Generic;
using System.IO;
@@ -79,10 +80,17 @@
memoryStream.ReadByte();
memoryStream.ReadByte();
using (var deflate = new DeflateStream(memoryStream, CompressionMode.Decompress))
try
{
deflate.CopyTo(output);
return output.ToArray();
using (var deflate = new DeflateStream(memoryStream, CompressionMode.Decompress))
{
deflate.CopyTo(output);
return output.ToArray();
}
}
catch (InvalidDataException ex)
{
throw new CorruptCompressedDataException("Invalid Flate compressed stream encountered", ex);
}
}
}

View File

@@ -479,7 +479,7 @@
var hasResources = formStream.StreamDictionary.TryGet<DictionaryToken>(NameToken.Resources, pdfScanner, out var formResources);
if (hasResources)
{
resourceStore.LoadResourceDictionary(formResources);
resourceStore.LoadResourceDictionary(formResources, parsingOptions);
}
// 1. Save current state.

View File

@@ -81,11 +81,13 @@
pageNumber.Increment();
return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray<PageTreeNode>.Instance);
}
//If we got here, we have to iterate till we manage to exit
}
//If we got here, we have to iterate till we manage to exit
HashSet<int> visitedTokens = new HashSet<int>(); // As we visit each token add to this list (the hashcode of the indirect reference)
var toProcess =
new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference,
@@ -102,8 +104,16 @@
do
{
var current = toProcess.Dequeue();
var current = toProcess.Dequeue();
var currentReferenceHash = current.reference.GetHashCode();
if (visitedTokens.Contains(currentReferenceHash))
{
continue; // don't revisit token already processed. break infinite loop. Issue #512
}
else
{
visitedTokens.Add(currentReferenceHash);
}
if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids))
{
if (!isLenientParsing)

View File

@@ -63,13 +63,13 @@
{
var resource = pageTreeMembers.ParentResources.Dequeue();
resourceStore.LoadResourceDictionary(resource);
resourceStore.LoadResourceDictionary(resource, parsingOptions);
stackDepth++;
}
if (dictionary.TryGet(NameToken.Resources, pdfScanner, out DictionaryToken resources))
{
resourceStore.LoadResourceDictionary(resources);
resourceStore.LoadResourceDictionary(resources, parsingOptions);
stackDepth++;
}

View File

@@ -176,7 +176,7 @@
const string searchTerm = "%%EOF";
var minimumEndOffset = bytes.Length - searchTerm.Length;
var minimumEndOffset = bytes.Length - searchTerm.Length + 1; // Issue #512 - Unable to open PDF - BruteForceScan starts from earlier of two EOF marker due to min end offset off by 1
bytes.Seek(minimumEndOffset);

View File

@@ -559,6 +559,11 @@ namespace UglyToad.PdfPig.Writer
pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize);
}
if (page.Value.rotation.HasValue)
{
pageDictionary[NameToken.Rotate] = new NumericToken(page.Value.rotation.Value);
}
// Adobe Acrobat errors if content streams ref'd by multiple pages, turn off
// dedup if on to avoid issues
var prev = context.AttemptDeduplication;

View File

@@ -46,6 +46,8 @@
private int imageKey = 1;
internal int? rotation;
internal IReadOnlyDictionary<string, IToken> Resources => pageDictionary.GetOrCreateDict(NameToken.Resources);
/// <summary>
@@ -131,7 +133,7 @@
/// <param name="from">The first point on the line.</param>
/// <param name="to">The last point on the line.</param>
/// <param name="lineWidth">The width of the line in user space units.</param>
public void DrawLine(PdfPoint from, PdfPoint to, decimal lineWidth = 1)
public PdfPageBuilder DrawLine(PdfPoint from, PdfPoint to, decimal lineWidth = 1)
{
if (lineWidth != 1)
{
@@ -146,6 +148,8 @@
{
currentStream.Add(new SetLineWidth(1));
}
return this;
}
/// <summary>
@@ -156,7 +160,7 @@
/// <param name="height">The height of the rectangle.</param>
/// <param name="lineWidth">The width of the line border of the rectangle.</param>
/// <param name="fill">Whether to fill with the color set by <see cref="SetTextAndFillColor"/>.</param>
public void DrawRectangle(PdfPoint position, decimal width, decimal height, decimal lineWidth = 1, bool fill = false)
public PdfPageBuilder DrawRectangle(PdfPoint position, decimal width, decimal height, decimal lineWidth = 1, bool fill = false)
{
if (lineWidth != 1)
{
@@ -178,6 +182,17 @@
{
currentStream.Add(new SetLineWidth(lineWidth));
}
return this;
}
/// <summary>
/// Set the number of degrees by which the page is rotated clockwise when displayed or printed.
/// </summary>
/// <param name="degrees">The rotation to apply; its <c>Value</c> is stored in <c>rotation</c> on this builder.</param>
/// <returns>This <see cref="PdfPageBuilder"/> instance, so that calls can be chained.</returns>
public PdfPageBuilder SetRotation(PageRotationDegrees degrees)
{
rotation = degrees.Value;
return this;
}
/// <summary>
@@ -188,7 +203,7 @@
/// <param name="point3">Position of the third corner of the triangle.</param>
/// <param name="lineWidth">The width of the line border of the triangle.</param>
/// <param name="fill">Whether to fill with the color set by <see cref="SetTextAndFillColor"/>.</param>
public void DrawTriangle(PdfPoint point1, PdfPoint point2, PdfPoint point3, decimal lineWidth = 1, bool fill = false)
public PdfPageBuilder DrawTriangle(PdfPoint point1, PdfPoint point2, PdfPoint point3, decimal lineWidth = 1, bool fill = false)
{
if (lineWidth != 1)
{
@@ -213,6 +228,8 @@
{
currentStream.Add(new SetLineWidth(lineWidth));
}
return this;
}
/// <summary>
@@ -222,9 +239,11 @@
/// <param name="diameter">The diameter of the circle.</param>
/// <param name="lineWidth">The width of the line border of the circle.</param>
/// <param name="fill">Whether to fill with the color set by <see cref="SetTextAndFillColor"/>.</param>
public void DrawCircle(PdfPoint center, decimal diameter, decimal lineWidth = 1, bool fill = false)
public PdfPageBuilder DrawCircle(PdfPoint center, decimal diameter, decimal lineWidth = 1, bool fill = false)
{
DrawEllipsis(center, diameter, diameter, lineWidth, fill);
return this;
}
/// <summary>
@@ -235,7 +254,7 @@
/// <param name="height">The height of the ellipsis.</param>
/// <param name="lineWidth">The width of the line border of the ellipsis.</param>
/// <param name="fill">Whether to fill with the color set by <see cref="SetTextAndFillColor"/>.</param>
public void DrawEllipsis(PdfPoint center, decimal width, decimal height, decimal lineWidth = 1, bool fill = false)
public PdfPageBuilder DrawEllipsis(PdfPoint center, decimal width, decimal height, decimal lineWidth = 1, bool fill = false)
{
width /= 2;
height /= 2;
@@ -283,6 +302,8 @@
{
currentStream.Add(new SetLineWidth(lineWidth));
}
return this;
}
/// <summary>
@@ -291,10 +312,12 @@
/// <param name="r">Red - 0 to 255</param>
/// <param name="g">Green - 0 to 255</param>
/// <param name="b">Blue - 0 to 255</param>
public void SetStrokeColor(byte r, byte g, byte b)
public PdfPageBuilder SetStrokeColor(byte r, byte g, byte b)
{
currentStream.Add(Push.Value);
currentStream.Add(new SetStrokeColorDeviceRgb(RgbToDecimal(r), RgbToDecimal(g), RgbToDecimal(b)));
return this;
}
/// <summary>
@@ -303,11 +326,13 @@
/// <param name="r">Red - 0 to 1</param>
/// <param name="g">Green - 0 to 1</param>
/// <param name="b">Blue - 0 to 1</param>
internal void SetStrokeColorExact(decimal r, decimal g, decimal b)
internal PdfPageBuilder SetStrokeColorExact(decimal r, decimal g, decimal b)
{
currentStream.Add(Push.Value);
currentStream.Add(new SetStrokeColorDeviceRgb(CheckRgbDecimal(r, nameof(r)),
CheckRgbDecimal(g, nameof(g)), CheckRgbDecimal(b, nameof(b))));
return this;
}
/// <summary>
@@ -316,18 +341,22 @@
/// <param name="r">Red - 0 to 255</param>
/// <param name="g">Green - 0 to 255</param>
/// <param name="b">Blue - 0 to 255</param>
public void SetTextAndFillColor(byte r, byte g, byte b)
public PdfPageBuilder SetTextAndFillColor(byte r, byte g, byte b)
{
currentStream.Add(Push.Value);
currentStream.Add(new SetNonStrokeColorDeviceRgb(RgbToDecimal(r), RgbToDecimal(g), RgbToDecimal(b)));
return this;
}
/// <summary>
/// Restores the stroke, text and fill color to default (black).
/// </summary>
public void ResetColor()
public PdfPageBuilder ResetColor()
{
currentStream.Add(Pop.Value);
return this;
}
/// <summary>
@@ -451,9 +480,11 @@
/// To insert invisible text, for example output of OCR, use <c>TextRenderingMode.Neither</c>.
/// </summary>
/// <param name="mode">Text rendering mode to set.</param>
public void SetTextRenderingMode(TextRenderingMode mode)
public PdfPageBuilder SetTextRenderingMode(TextRenderingMode mode)
{
currentStream.Add(new SetTextRenderingMode(mode));
return this;
}
private NameToken GetAddedFont(PdfDocumentBuilder.AddedFont font)
@@ -690,7 +721,7 @@
/// Copy a page from unknown source to this page
/// </summary>
/// <param name="srcPage">Page to be copied</param>
public void CopyFrom(Page srcPage)
public PdfPageBuilder CopyFrom(Page srcPage)
{
if (currentStream.Operations.Count > 0)
{
@@ -704,7 +735,7 @@
// If the page doesn't have resources, then we copy the entire content stream, since no operation would collide
// with the ones already written
destinationStream.Operations.AddRange(srcPage.Operations);
return;
return this;
}
// TODO: How should we handle any other token in the page dictionary (Eg. LastModified, MediaBox, CropBox, BleedBox, TrimBox, ArtBox,
@@ -828,6 +859,8 @@
}
destinationStream.Operations.AddRange(operations);
return this;
}
private List<Letter> DrawLetters(NameToken name, string text, IWritingFont font, TransformationMatrix fontMatrix, decimal fontSize, TransformationMatrix textMatrix)