mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 09:37:44 +08:00
Allow reading order detectors to support any class that has a bounding box/PdfRectangle
- Introduce IBoundingBox interface
This commit is contained in:
parent
14e7024545
commit
0dd80c3c79
@ -219,7 +219,7 @@
|
||||
private AltoDocument.AltoIllustration ToAltoIllustration(IPdfImage pdfImage, double height)
|
||||
{
|
||||
illustrationCount++;
|
||||
var rectangle = pdfImage.Bounds;
|
||||
var rectangle = pdfImage.BoundingBox;
|
||||
|
||||
return new AltoDocument.AltoIllustration
|
||||
{
|
||||
|
||||
@ -273,7 +273,7 @@
|
||||
private string GetCode(IPdfImage pdfImage, double pageHeight, int level)
|
||||
{
|
||||
imageCount++;
|
||||
var bbox = pdfImage.Bounds;
|
||||
var bbox = pdfImage.BoundingBox;
|
||||
return GetIndent(level) + "<span class='ocr_image' id='image_" + pageCount + "_"
|
||||
+ imageCount + "' title='" + GetCode(bbox, pageHeight) + "' />";
|
||||
}
|
||||
|
||||
@ -0,0 +1,301 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.Export
|
||||
{
|
||||
using Content;
|
||||
using Core;
|
||||
using DocumentLayoutAnalysis;
|
||||
using Graphics.Colors;
|
||||
using PAGE;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Xml;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
/// <summary>
/// PAGE-XML 2019-07-15 (XML) exporter for the general case.
/// <para>
/// This is a rewrite of <see cref="PageXmlTextExporter"/> meant to be simple and to handle a general mix of text,
/// image and custom implementer defined blocks.
/// </para>
/// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para>
/// </summary>
public class PageXmlGeneralExporter
{
    // Multiplier applied to the page dimensions and to every exported coordinate.
    private readonly double scale;

    // Indentation string handed to the XML writer.
    private readonly string indentChar;

    // Counter backing the element ids; it keeps growing across calls to Get on the same instance.
    private int nextId;

    /// <summary>
    /// PAGE-XML 2019-07-15 (XML) exporter for general case
    /// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para>
    /// </summary>
    /// <param name="scale">Factor by which the page size and all point coordinates are multiplied.</param>
    /// <param name="indent">Characters used to indent the generated XML.</param>
    public PageXmlGeneralExporter(double scale = 1.0, string indent = "\t")
    {
        this.scale = scale;
        indentChar = indent;
    }

    /// <summary>
    /// Get the PAGE-XML (XML) string of the page layout using the <see cref="IBoundingBox"/>es as the page layout.
    /// </summary>
    /// <param name="page">The page; its height and width define the exported page size.</param>
    /// <param name="blocks">Blocks to be exported, in the order they should appear in the reading order.</param>
    /// <returns>The serialized PAGE-XML document.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="page"/> or <paramref name="blocks"/> is <c>null</c>.
    /// </exception>
    public string Get(Page page, IEnumerable<IBoundingBox> blocks)
    {
        if (page == null)
        {
            throw new ArgumentNullException(nameof(page));
        }

        if (blocks == null)
        {
            throw new ArgumentNullException(nameof(blocks));
        }

        // Read the clock once so that Created and LastChange carry the same value.
        DateTime now = DateTime.UtcNow;

        PageXmlDocument pageXmlDocument = new PageXmlDocument()
        {
            Metadata = new PageXmlDocument.PageXmlMetadata()
            {
                Created = now,
                LastChange = now,
                Creator = "PdfPig",
                Comments = "",
            },
            PcGtsId = "pc-" + page.GetHashCode()
        };

        pageXmlDocument.Page = CreatePage(page.Height, page.Width, blocks);

        return Serialize(pageXmlDocument);
    }

    /// <summary>
    /// Builds the PAGE-XML page: one region per block plus a reading order that lists the regions
    /// in the order the blocks were supplied.
    /// </summary>
    /// <param name="pageHeight">Unscaled page height.</param>
    /// <param name="pageWidth">Unscaled page width.</param>
    /// <param name="blocks">Blocks to convert to regions.</param>
    private PageXmlDocument.PageXmlPage CreatePage(double pageHeight, double pageWidth, IEnumerable<IBoundingBox> blocks)
    {
        var pageXmlPage = new PageXmlDocument.PageXmlPage()
        {
            ImageFilename = "unknown",
            ImageHeight = (int)Math.Round(pageHeight * scale),
            ImageWidth = (int)Math.Round(pageWidth * scale),
        };

        // Converters may return null for blocks they decline to export; those are dropped here.
        var regions = blocks
            .Select(b => ToRegion(b, pageWidth, pageHeight))
            .Where(x => x != null).ToList();
        pageXmlPage.Items = regions.ToArray();

        var regionsOrder = regions.Select(x => x.Id);

        var orderedRegions = GetOrderRegions(regionsOrder).ToArray();
        pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
        {
            Item = new PageXmlDocument.PageXmlOrderedGroup()
            {
                Items = orderedRegions,
                Id = "g" + NextId()
            }
        };

        return pageXmlPage;
    }

    /// <summary>
    /// Wraps each region id in an indexed region reference; indices start at 1 and follow the input order.
    /// </summary>
    /// <param name="idOrder">Region ids in reading order.</param>
    private IEnumerable<PageXmlDocument.PageXmlRegionRefIndexed> GetOrderRegions(IEnumerable<string> idOrder)
    {
        var index = 1;
        foreach (var item in idOrder)
        {
            yield return new PageXmlDocument.PageXmlRegionRefIndexed()
            {
                RegionRef = item,
                Index = index++
            };
        }
    }

    /// <summary>
    /// Converts a block to a region: <see cref="TextBlock"/>s become full text regions, <see cref="IPdfImage"/>s
    /// become image regions, and anything else becomes a simple text region whose text is <c>block.ToString()</c>.
    /// </summary>
    private PageXmlDocument.PageXmlRegion ToRegion(IBoundingBox block, double pageWidth, double pageHeight)
    {
        if (block is TextBlock textblock)
        {
            return ToPageXmlTextRegion(textblock, pageWidth, pageHeight);
        }

        if (block is IPdfImage imageBlock)
        {
            return ToImageRegion(imageBlock.BoundingBox, pageWidth, pageHeight);
        }

        // Default case
        return ToPageXmlSimpleTextRegion(block.BoundingBox, block.ToString(), pageWidth, pageHeight);
    }

    /// <summary>
    /// Creates an image region covering <paramref name="box"/>.
    /// </summary>
    private PageXmlDocument.PageXmlImageRegion ToImageRegion(PdfRectangle box, double pageWidth, double pageHeight)
    {
        return new PageXmlDocument.PageXmlImageRegion()
        {
            Coords = ToCoords(box, pageWidth, pageHeight),
            Id = "r" + NextId(),
        };
    }

    /// <summary>
    /// Creates a table region covering <paramref name="box"/>.
    /// </summary>
    private PageXmlDocument.PageXmlTableRegion ToTableRegion(PdfRectangle box, double pageWidth, double pageHeight)
    {
        return new PageXmlDocument.PageXmlTableRegion()
        {
            Coords = ToCoords(box, pageWidth, pageHeight),
            Id = "r" + NextId(),
        };
    }

    /// <summary>
    /// Creates a custom region covering <paramref name="box"/> whose type is <paramref name="text"/>.
    /// </summary>
    /// <returns>The region, or <c>null</c> when the box's top-left and bottom-right corners are equal.</returns>
    private PageXmlDocument.PageXmlCustomRegion ToCustomRegion(PdfRectangle box, string text, double pageWidth, double pageHeight)
    {
        if (box.TopLeft.Equals(box.BottomRight))
        {
            return null;
        }

        return new PageXmlDocument.PageXmlCustomRegion()
        {
            Coords = ToCoords(box, pageWidth, pageHeight),
            Id = "r" + NextId(),
            Type = text
        };
    }

    /// <summary>
    /// Creates a paragraph text region that carries only a bounding box and a text equivalent (no text lines).
    /// </summary>
    private PageXmlDocument.PageXmlTextRegion ToPageXmlSimpleTextRegion(PdfRectangle box, string text, double pageWidth, double pageHeight)
    {
        string regionId = "r" + NextId();

        return new PageXmlDocument.PageXmlTextRegion()
        {
            Coords = ToCoords(box, pageWidth, pageHeight),
            Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
            TextLines = new PageXmlDocument.PageXmlTextLine[0],
            TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = text } },
            Id = regionId
        };
    }

    /// <summary>
    /// Creates a paragraph text region for a <see cref="TextBlock"/>, including all of its lines, words and glyphs.
    /// </summary>
    private PageXmlDocument.PageXmlTextRegion ToPageXmlTextRegion(TextBlock textBlock, double pageWidth, double pageHeight)
    {
        // The region id is drawn before the children are converted, so it precedes their ids.
        string regionId = "r" + NextId();

        return new PageXmlDocument.PageXmlTextRegion()
        {
            Coords = ToCoords(textBlock.BoundingBox, pageWidth, pageHeight),
            Type = PageXmlDocument.PageXmlTextSimpleType.Paragraph,
            TextLines = textBlock.TextLines.Select(l => ToPageXmlTextLine(l, pageWidth, pageHeight)).ToArray(),
            TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textBlock.Text } },
            Id = regionId
        };
    }

    /// <summary>
    /// Converts a <see cref="TextLine"/> and its words to a PAGE-XML text line.
    /// </summary>
    private PageXmlDocument.PageXmlTextLine ToPageXmlTextLine(TextLine textLine, double pageWidth, double pageHeight)
    {
        return new PageXmlDocument.PageXmlTextLine()
        {
            Coords = ToCoords(textLine.BoundingBox, pageWidth, pageHeight),
            Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
            Words = textLine.Words.Select(w => ToPageXmlWord(w, pageWidth, pageHeight)).ToArray(),
            TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = textLine.Text } },
            Id = "l" + NextId()
        };
    }

    /// <summary>
    /// Converts a <see cref="Word"/> and its letters to a PAGE-XML word.
    /// </summary>
    private PageXmlDocument.PageXmlWord ToPageXmlWord(Word word, double pageWidth, double pageHeight)
    {
        return new PageXmlDocument.PageXmlWord()
        {
            Coords = ToCoords(word.BoundingBox, pageWidth, pageHeight),
            Glyphs = word.Letters.Select(l => ToPageXmlGlyph(l, pageWidth, pageHeight)).ToArray(),
            TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = word.Text } },
            Id = "w" + NextId()
        };
    }

    /// <summary>
    /// Converts a <see cref="Letter"/> to a PAGE-XML glyph, carrying over its font size, font name and colour.
    /// </summary>
    private PageXmlDocument.PageXmlGlyph ToPageXmlGlyph(Letter letter, double pageWidth, double pageHeight)
    {
        return new PageXmlDocument.PageXmlGlyph()
        {
            Coords = ToCoords(letter.GlyphRectangle, pageWidth, pageHeight),
            Ligature = false,
            Production = PageXmlDocument.PageXmlProductionSimpleType.Printed,
            TextStyle = new PageXmlDocument.PageXmlTextStyle()
            {
                FontSize = (float)letter.FontSize,
                FontFamily = letter.FontName,
                TextColourRgb = ToRgbEncoded(letter.Color),
            },
            TextEquivs = new[] { new PageXmlDocument.PageXmlTextEquiv() { Unicode = letter.Value } },
            Id = "c" + NextId()
        };
    }

    /// <summary>
    /// Formats a point as "x,y" in scaled coordinates, with y measured from the top of the page
    /// (<c>pageHeight - point.Y</c>) and both values kept at least one unit away from the page borders.
    /// </summary>
    /// <param name="point">The point to convert.</param>
    /// <param name="pageWidth">Unscaled page width.</param>
    /// <param name="pageHeight">Unscaled page height.</param>
    private string PointToString(PdfPoint point, double pageWidth, double pageHeight)
    {
        double x = Math.Round(point.X * scale);
        double y = Math.Round((pageHeight - point.Y) * scale);

        // x and y are in scaled units, so the far borders have to be scaled too. Clamping against the
        // unscaled page size would wrongly cap every coordinate as soon as scale is greater than 1.
        double maxX = (pageWidth * scale) - 1;
        double maxY = (pageHeight * scale) - 1;

        // move away from borders
        x = x > 1 ? x : 1;
        y = y > 1 ? y : 1;

        x = x < maxX ? x : maxX;
        y = y < maxY ? y : maxY;

        // The output is machine readable, so it must not depend on the current culture.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        return x.ToString("0", invariant) + "," + y.ToString("0", invariant);
    }

    /// <summary>
    /// Formats a sequence of points as a space separated list of "x,y" pairs.
    /// </summary>
    private string ToPoints(IEnumerable<PdfPoint> points, double pageWidth, double pageHeight)
    {
        return string.Join(" ", points.Select(p => PointToString(p, pageWidth, pageHeight)));
    }

    /// <summary>
    /// Formats the corners of a rectangle in the order bottom-left, top-left, top-right, bottom-right.
    /// </summary>
    private string ToPoints(PdfRectangle pdfRectangle, double pageWidth, double pageHeight)
    {
        return ToPoints(
            new[] { pdfRectangle.BottomLeft, pdfRectangle.TopLeft, pdfRectangle.TopRight, pdfRectangle.BottomRight },
            pageWidth, pageHeight);
    }

    /// <summary>
    /// Wraps the formatted corner points of a rectangle in a PAGE-XML coords element.
    /// </summary>
    private PageXmlDocument.PageXmlCoords ToCoords(PdfRectangle pdfRectangle, double pageWidth, double pageHeight)
    {
        return new PageXmlDocument.PageXmlCoords()
        {
            Points = ToPoints(pdfRectangle, pageWidth, pageHeight)
        };
    }

    /// <summary>
    /// PageXml Text colour in RGB encoded format
    /// <para>(red value) + (256 x green value) + (65536 x blue value).</para>
    /// </summary>
    private string ToRgbEncoded(IColor color)
    {
        var rgb = color.ToRGBValues();
        int red = (int)Math.Round(255f * (float)rgb.r);
        int green = 256 * (int)Math.Round(255f * (float)rgb.g);
        int blue = 65536 * (int)Math.Round(255f * (float)rgb.b);
        int sum = red + green + blue;

        // as per below, red and blue order might be inverted... var colorWin = System.Drawing.Color.FromArgb(sum);
        return sum.ToString(System.Globalization.CultureInfo.InvariantCulture);
    }

    /// <summary>
    /// Serializes the document to an indented UTF-8 XML string.
    /// </summary>
    private string Serialize(PageXmlDocument pageXmlDocument)
    {
        // Use UTF-8 without a byte order mark. With Encoding.UTF8 the writer puts a BOM at the start of
        // the stream, which then decodes to a stray U+FEFF character at the start of the returned string.
        var encoding = new System.Text.UTF8Encoding(false);

        XmlSerializer serializer = new XmlSerializer(typeof(PageXmlDocument));
        var settings = new XmlWriterSettings()
        {
            Encoding = encoding,
            Indent = true,
            IndentChars = indentChar,
        };

        using (var memoryStream = new MemoryStream())
        {
            // Dispose the writer before reading the stream so everything buffered has been written.
            using (var xmlWriter = XmlWriter.Create(memoryStream, settings))
            {
                serializer.Serialize(xmlWriter, pageXmlDocument);
            }

            return encoding.GetString(memoryStream.ToArray());
        }
    }

    /// <summary>
    /// Returns the current id counter value and then increments it.
    /// </summary>
    private int NextId()
    {
        return nextId++;
    }
}
|
||||
}
|
||||
@ -273,7 +273,7 @@
|
||||
private PageXmlDocument.PageXmlImageRegion ToPageXmlImageRegion(IPdfImage pdfImage, PageXmlData data, double pageWidth, double pageHeight)
|
||||
{
|
||||
data.RegionsCount++;
|
||||
var bbox = pdfImage.Bounds;
|
||||
var bbox = pdfImage.BoundingBox;
|
||||
return new PageXmlDocument.PageXmlImageRegion()
|
||||
{
|
||||
Coords = ToCoords(bbox, pageWidth, pageHeight),
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
/// Pass-through reading order detector: the blocks are handed back exactly as they were supplied.
/// </summary>
public class DefaultReadingOrderDetector : IReadingOrderDetector
{
    /// <summary>
    /// Shared instance of the <see cref="DefaultReadingOrderDetector"/>.
    /// <para>This detector performs no ordering.</para>
    /// </summary>
    public static DefaultReadingOrderDetector Instance { get; } = new DefaultReadingOrderDetector();

    /// <summary>
    /// Returns the supplied blocks unchanged; nothing is reordered.
    /// </summary>
    /// <param name="textBlocks">The <see cref="TextBlock"/>s to pass through.</param>
    /// <returns>The same <paramref name="textBlocks"/> instance that was passed in.</returns>
    public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks) => textBlocks;
}
|
||||
}
|
||||
@ -1,17 +1,17 @@
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using UglyToad.PdfPig.Content;
|
||||
|
||||
/// <summary>
|
||||
/// Reading order detector determines the page's blocks reading order.
|
||||
/// <para>Note: Make sure you use <see cref="TextBlock.SetReadingOrder(int)"/> to set each <see cref="TextBlock"/> reading order when implementing <see cref="IReadingOrderDetector.Get{TBlock}(IEnumerable{TBlock})"/>.</para>
|
||||
/// </summary>
|
||||
public interface IReadingOrderDetector
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the blocks in reading order and sets the <see cref="TextBlock.ReadingOrder"/>.
|
||||
/// Gets the blocks in reading order. The result is the correctly ordered enumerable.
|
||||
/// </summary>
|
||||
/// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
|
||||
IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks);
|
||||
/// <param name="blocks">The objects implementing <see cref="IBoundingBox"/>s to order.</param>
|
||||
IEnumerable<TBlock> Get<TBlock>(IEnumerable<TBlock> blocks) where TBlock : IBoundingBox;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Content;
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm that retrieve the blocks' reading order using rendering order (TextSequence).
|
||||
@ -17,14 +18,28 @@
|
||||
/// <summary>
|
||||
/// Gets the blocks in reading order and sets the <see cref="TextBlock.ReadingOrder"/>.
|
||||
/// </summary>
|
||||
/// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
|
||||
public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
|
||||
/// <param name="blocks">The <see cref="TextBlock"/>s to order.</param>
|
||||
/// <returns>The blocks in rendering order, or the original sequence unchanged if the block type is not a <see cref="TextBlock"/>.</returns>
|
||||
public IEnumerable<TBlock> Get<TBlock>(IEnumerable<TBlock> blocks)
    where TBlock : IBoundingBox
{
    // Rendering order relies on TextSequence, which is only reachable through TextBlock, so any other
    // block type is returned as supplied. IsAssignableFrom is used instead of strict type equality so
    // that types derived from TextBlock are ordered as well rather than silently passed through.
    if (!typeof(TextBlock).IsAssignableFrom(typeof(TBlock)))
    {
        return blocks;
    }

    return OrderByRending(blocks);
}
|
||||
|
||||
private IEnumerable<TBlock> OrderByRending<TBlock>(IEnumerable<TBlock> blocks)
|
||||
where TBlock : IBoundingBox
|
||||
{
|
||||
int readingOrder = 0;
|
||||
|
||||
foreach (var block in textBlocks.OrderBy(b => AvgTextSequence(b)))
|
||||
foreach (var block in blocks.OrderBy(b => AvgTextSequence(b as TextBlock)))
|
||||
{
|
||||
block.SetReadingOrder(readingOrder++);
|
||||
var txtBlock = block as TextBlock;
|
||||
txtBlock.SetReadingOrder(readingOrder++);
|
||||
yield return block;
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,7 +2,9 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Collections.ObjectModel;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Content;
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm that retrieve the blocks' reading order using spatial reasoning (Allen’s interval relations) and possibly the rendering order (TextSequence).
|
||||
@ -59,7 +61,7 @@
|
||||
/// </summary>
|
||||
public double T { get; }
|
||||
|
||||
private Func<TextBlock, TextBlock, double, bool> getBeforeInMethod;
|
||||
private Func<IBoundingBox, IBoundingBox, double, bool> getBeforeInMethod;
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm that retrieve the blocks' reading order using spatial reasoning (Allen’s interval relations) and possibly the rendering order (TextSequence).
|
||||
@ -75,53 +77,60 @@
|
||||
this.SpatialReasoningRule = spatialReasoningRule;
|
||||
this.UseRenderingOrder = useRenderingOrder;
|
||||
|
||||
getBeforeInMethod = GetBeforeInMethod();
|
||||
}
|
||||
|
||||
private Func<IBoundingBox, IBoundingBox, double, bool> GetBeforeInMethod()
|
||||
{
|
||||
switch (SpatialReasoningRule)
|
||||
{
|
||||
case SpatialReasoningRules.ColumnWise:
|
||||
if (UseRenderingOrder)
|
||||
{
|
||||
getBeforeInMethod = (TextBlock a, TextBlock b, double t) => GetBeforeInReadingVertical(a, b, t) || GetBeforeInRendering(a, b);
|
||||
// Important note: GetBeforeInRendering returns false if either block is not a TextBlock, meaning its result is ignored
|
||||
return (IBoundingBox a, IBoundingBox b, double t) => GetBeforeInReadingVertical(a, b, t)
|
||||
|| GetBeforeInRendering(a, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
getBeforeInMethod = GetBeforeInReadingVertical;
|
||||
return GetBeforeInReadingVertical;
|
||||
}
|
||||
break;
|
||||
|
||||
case SpatialReasoningRules.RowWise:
|
||||
if (UseRenderingOrder)
|
||||
{
|
||||
getBeforeInMethod = (TextBlock a, TextBlock b, double t) => GetBeforeInReadingHorizontal(a, b, t) || GetBeforeInRendering(a, b);
|
||||
return (IBoundingBox a, IBoundingBox b, double t) => GetBeforeInReadingHorizontal(a, b, t)
|
||||
|| GetBeforeInRendering(a, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
getBeforeInMethod = GetBeforeInReadingHorizontal;
|
||||
return GetBeforeInReadingHorizontal;
|
||||
}
|
||||
break;
|
||||
|
||||
case SpatialReasoningRules.Basic:
|
||||
default:
|
||||
if (UseRenderingOrder)
|
||||
{
|
||||
getBeforeInMethod = (TextBlock a, TextBlock b, double t) => GetBeforeInReading(a, b, t) || GetBeforeInRendering(a, b);
|
||||
return (IBoundingBox a, IBoundingBox b, double t) => GetBeforeInReading(a, b, t)
|
||||
|| GetBeforeInRendering(a, b);
|
||||
}
|
||||
else
|
||||
{
|
||||
getBeforeInMethod = GetBeforeInReading;
|
||||
return GetBeforeInReading;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the blocks in reading order and sets the <see cref="TextBlock.ReadingOrder"/>.
|
||||
/// Gets the blocks ordered in reading order.
|
||||
/// If blocks are of type <see cref="TextBlock"/> it will also set the <see cref="TextBlock.ReadingOrder"/>.
|
||||
/// </summary>
|
||||
/// <param name="textBlocks">The <see cref="TextBlock"/>s to order.</param>
|
||||
public IEnumerable<TextBlock> Get(IReadOnlyList<TextBlock> textBlocks)
|
||||
/// <param name="inBlocks">The blocks to order.</param>
|
||||
public IEnumerable<TBlock> Get<TBlock>(IEnumerable<TBlock> inBlocks)
|
||||
where TBlock : IBoundingBox
|
||||
{
|
||||
IReadOnlyList<TBlock> blocks = new ReadOnlyCollection<TBlock>(inBlocks.ToList());
|
||||
int readingOrder = 0;
|
||||
|
||||
var graph = BuildGraph(textBlocks, T);
|
||||
var graph = BuildGraph(blocks, T);
|
||||
|
||||
while (graph.Count > 0)
|
||||
{
|
||||
@ -135,14 +144,18 @@
|
||||
g.Value.Remove(index);
|
||||
}
|
||||
|
||||
var block = textBlocks[index];
|
||||
block.SetReadingOrder(readingOrder++);
|
||||
var block = blocks[index];
|
||||
if(block is TextBlock textBlock)
|
||||
{
|
||||
textBlock.SetReadingOrder(readingOrder++);
|
||||
}
|
||||
|
||||
yield return block;
|
||||
}
|
||||
}
|
||||
|
||||
private Dictionary<int, List<int>> BuildGraph(IReadOnlyList<TextBlock> textBlocks, double T)
|
||||
private Dictionary<int, List<int>> BuildGraph<TBlock>(IReadOnlyList<TBlock> blocks, double T)
|
||||
where TBlock : IBoundingBox
|
||||
{
|
||||
// We incorporate both relations into a single partial ordering of blocks by specifying a
|
||||
// directed graph with an edge between every pair of blocks for which at least one of the
|
||||
@ -150,18 +163,18 @@
|
||||
|
||||
var graph = new Dictionary<int, List<int>>();
|
||||
|
||||
for (int i = 0; i < textBlocks.Count; i++)
|
||||
for (int i = 0; i < blocks.Count; i++)
|
||||
{
|
||||
graph.Add(i, new List<int>());
|
||||
}
|
||||
|
||||
for (int i = 0; i < textBlocks.Count; i++)
|
||||
for (int i = 0; i < blocks.Count; i++)
|
||||
{
|
||||
var a = textBlocks[i];
|
||||
for (int j = 0; j < textBlocks.Count; j++)
|
||||
var a = blocks[i];
|
||||
for (int j = 0; j < blocks.Count; j++)
|
||||
{
|
||||
if (i == j) continue;
|
||||
var b = textBlocks[j];
|
||||
var b = blocks[j];
|
||||
|
||||
if (getBeforeInMethod(a, b, T))
|
||||
{
|
||||
@ -173,11 +186,22 @@
|
||||
return graph;
|
||||
}
|
||||
|
||||
private static bool GetBeforeInRendering(TextBlock a, TextBlock b)
|
||||
/// <summary>
|
||||
/// Gets the "before in rendering order" relation. This only works on <see cref="TextBlock"/>s.
|
||||
/// </summary>
|
||||
/// <param name="alpha"></param>
|
||||
/// <param name="bravo"></param>
|
||||
/// <returns>Text Before in rendering. False if type is not <see cref="TextBlock"/></returns>
|
||||
private static bool GetBeforeInRendering(IBoundingBox alpha, IBoundingBox bravo)
|
||||
{
|
||||
var avgTextSequenceA = a.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average();
|
||||
var avgTextSequenceB = b.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average();
|
||||
return avgTextSequenceA < avgTextSequenceB;
|
||||
if (alpha is TextBlock a && bravo is TextBlock b)
|
||||
{
|
||||
var avgTextSequenceA = a.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average();
|
||||
var avgTextSequenceB = b.TextLines.SelectMany(tl => tl.Words).SelectMany(w => w.Letters).Select(l => l.TextSequence).Average();
|
||||
return avgTextSequenceA < avgTextSequenceB;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@ -186,17 +210,17 @@
|
||||
/// <param name="a"></param>
|
||||
/// <param name="b"></param>
|
||||
/// <param name="T">The tolerance parameter T.</param>
|
||||
private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
|
||||
private static bool GetBeforeInReading(IBoundingBox a, IBoundingBox b, double T)
|
||||
{
|
||||
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
|
||||
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
|
||||
|
||||
return xRelation == IntervalRelations.Precedes ||
|
||||
yRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps ||
|
||||
yRelation == IntervalRelations.Overlaps;
|
||||
yRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps ||
|
||||
yRelation == IntervalRelations.Overlaps;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@ -205,7 +229,7 @@
|
||||
/// <param name="a"></param>
|
||||
/// <param name="b"></param>
|
||||
/// <param name="T">The tolerance parameter T.</param>
|
||||
private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
|
||||
private static bool GetBeforeInReadingVertical(IBoundingBox a, IBoundingBox b, double T)
|
||||
{
|
||||
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
|
||||
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
|
||||
@ -213,20 +237,20 @@
|
||||
return xRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
(xRelation == IntervalRelations.Overlaps && (yRelation == IntervalRelations.Precedes ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Overlaps)) ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Overlaps)) ||
|
||||
((yRelation == IntervalRelations.Precedes || yRelation == IntervalRelations.Meets || yRelation == IntervalRelations.Overlaps) &&
|
||||
(xRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps ||
|
||||
xRelation == IntervalRelations.Starts ||
|
||||
xRelation == IntervalRelations.FinishesI ||
|
||||
xRelation == IntervalRelations.Equals ||
|
||||
xRelation == IntervalRelations.During ||
|
||||
xRelation == IntervalRelations.DuringI ||
|
||||
xRelation == IntervalRelations.Finishes ||
|
||||
xRelation == IntervalRelations.StartsI ||
|
||||
xRelation == IntervalRelations.OverlapsI));
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps ||
|
||||
xRelation == IntervalRelations.Starts ||
|
||||
xRelation == IntervalRelations.FinishesI ||
|
||||
xRelation == IntervalRelations.Equals ||
|
||||
xRelation == IntervalRelations.During ||
|
||||
xRelation == IntervalRelations.DuringI ||
|
||||
xRelation == IntervalRelations.Finishes ||
|
||||
xRelation == IntervalRelations.StartsI ||
|
||||
xRelation == IntervalRelations.OverlapsI));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@ -235,29 +259,28 @@
|
||||
/// <param name="a"></param>
|
||||
/// <param name="b"></param>
|
||||
/// <param name="T">The tolerance parameter T.</param>
|
||||
private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
|
||||
private static bool GetBeforeInReadingHorizontal(IBoundingBox a, IBoundingBox b, double T)
|
||||
{
|
||||
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
|
||||
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
|
||||
|
||||
return yRelation == IntervalRelations.Precedes ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
(yRelation == IntervalRelations.Overlaps && (xRelation == IntervalRelations.Precedes ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps)) ||
|
||||
xRelation == IntervalRelations.Meets ||
|
||||
xRelation == IntervalRelations.Overlaps)) ||
|
||||
((xRelation == IntervalRelations.Precedes || xRelation == IntervalRelations.Meets || xRelation == IntervalRelations.Overlaps) &&
|
||||
(yRelation == IntervalRelations.Precedes ||
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Overlaps ||
|
||||
yRelation == IntervalRelations.Starts ||
|
||||
yRelation == IntervalRelations.FinishesI ||
|
||||
yRelation == IntervalRelations.Equals ||
|
||||
yRelation == IntervalRelations.During ||
|
||||
yRelation == IntervalRelations.DuringI ||
|
||||
yRelation == IntervalRelations.Finishes ||
|
||||
yRelation == IntervalRelations.StartsI ||
|
||||
yRelation == IntervalRelations.OverlapsI));
|
||||
yRelation == IntervalRelations.Meets ||
|
||||
yRelation == IntervalRelations.Overlaps ||
|
||||
yRelation == IntervalRelations.Starts ||
|
||||
yRelation == IntervalRelations.FinishesI ||
|
||||
yRelation == IntervalRelations.Equals ||
|
||||
yRelation == IntervalRelations.During ||
|
||||
yRelation == IntervalRelations.DuringI ||
|
||||
yRelation == IntervalRelations.Finishes ||
|
||||
yRelation == IntervalRelations.StartsI ||
|
||||
yRelation == IntervalRelations.OverlapsI));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -9,7 +9,7 @@
|
||||
/// <summary>
|
||||
/// A block of text.
|
||||
/// </summary>
|
||||
public class TextBlock
|
||||
public class TextBlock: IBoundingBox
|
||||
{
|
||||
/// <summary>
|
||||
/// The separator used between lines in the block.
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
/// <summary>
|
||||
/// A line of text.
|
||||
/// </summary>
|
||||
public class TextLine
|
||||
public class TextLine : IBoundingBox
|
||||
{
|
||||
/// <summary>
|
||||
/// The separator used between words in the line.
|
||||
|
||||
@ -51,7 +51,7 @@
|
||||
|
||||
if (images?.Any() == true)
|
||||
{
|
||||
bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds));
|
||||
bboxes.AddRange(images.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0).Select(o => o.BoundingBox));
|
||||
}
|
||||
|
||||
return GetWhitespaces(bboxes,
|
||||
|
||||
@ -29,6 +29,32 @@
|
||||
Assert.Equal(100, ordered[1].BoundingBox.Left);
|
||||
}
|
||||
|
||||
// Bare-bones IBoundingBox implementation used to show the detector does not need a TextBlock.
private sealed class MyTestBlock : IBoundingBox
{
    public PdfRectangle BoundingBox { get; set; }
}
|
||||
|
||||
[Fact]
public void WorksWithAnyTypeThatImplementsIBoundingBox()
{
    var leftBlock = new MyTestBlock
    {
        BoundingBox = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10))
    };
    var rightBlock = new MyTestBlock
    {
        BoundingBox = new PdfRectangle(new PdfPoint(100, 0), new PdfPoint(110, 10))
    };

    // The blocks are handed over right-then-left on purpose; the detector has to swap them.
    var unorderedBlocks = new List<MyTestBlock> { rightBlock, leftBlock };

    var detector = new UnsupervisedReadingOrderDetector(5, UnsupervisedReadingOrderDetector.SpatialReasoningRules.RowWise);

    var ordered = detector.Get(unorderedBlocks).ToList();

    Assert.Equal(0, ordered[0].BoundingBox.Left);
    Assert.Equal(100, ordered[1].BoundingBox.Left);
}
|
||||
|
||||
[Fact]
|
||||
public void DocumentTest()
|
||||
|
||||
@ -26,29 +26,29 @@
|
||||
{
|
||||
var page = document.GetPage(1);
|
||||
|
||||
var images = page.GetImages().OrderBy(x => x.Bounds.Width).ToList();
|
||||
var images = page.GetImages().OrderBy(x => x.BoundingBox.Width).ToList();
|
||||
|
||||
var pdfPigSquare = images[0];
|
||||
|
||||
Assert.Equal(148.3d, pdfPigSquare.Bounds.Width, doubleComparer);
|
||||
Assert.Equal(148.3d, pdfPigSquare.Bounds.Height, doubleComparer);
|
||||
Assert.Equal(60.1d, pdfPigSquare.Bounds.Left, doubleComparer);
|
||||
Assert.Equal(765.8d, pdfPigSquare.Bounds.Top, doubleComparer);
|
||||
Assert.Equal(148.3d, pdfPigSquare.BoundingBox.Width, doubleComparer);
|
||||
Assert.Equal(148.3d, pdfPigSquare.BoundingBox.Height, doubleComparer);
|
||||
Assert.Equal(60.1d, pdfPigSquare.BoundingBox.Left, doubleComparer);
|
||||
Assert.Equal(765.8d, pdfPigSquare.BoundingBox.Top, doubleComparer);
|
||||
|
||||
|
||||
var pdfPigSquished = images[1];
|
||||
|
||||
Assert.Equal(206.8d, pdfPigSquished.Bounds.Width, doubleComparer);
|
||||
Assert.Equal(83.2d, pdfPigSquished.Bounds.Height, doubleComparer);
|
||||
Assert.Equal(309.8d, pdfPigSquished.Bounds.Left, doubleComparer);
|
||||
Assert.Equal(552.1d, pdfPigSquished.Bounds.Top, doubleComparer);
|
||||
Assert.Equal(206.8d, pdfPigSquished.BoundingBox.Width, doubleComparer);
|
||||
Assert.Equal(83.2d, pdfPigSquished.BoundingBox.Height, doubleComparer);
|
||||
Assert.Equal(309.8d, pdfPigSquished.BoundingBox.Left, doubleComparer);
|
||||
Assert.Equal(552.1d, pdfPigSquished.BoundingBox.Top, doubleComparer);
|
||||
|
||||
var birthdayPigs = images[2];
|
||||
|
||||
Assert.Equal(391d, birthdayPigs.Bounds.Width, doubleComparer);
|
||||
Assert.Equal(267.1d, birthdayPigs.Bounds.Height, doubleComparer);
|
||||
Assert.Equal(102.2d, birthdayPigs.Bounds.Left, doubleComparer);
|
||||
Assert.Equal(426.3d, birthdayPigs.Bounds.Top, doubleComparer);
|
||||
Assert.Equal(391d, birthdayPigs.BoundingBox.Width, doubleComparer);
|
||||
Assert.Equal(267.1d, birthdayPigs.BoundingBox.Height, doubleComparer);
|
||||
Assert.Equal(102.2d, birthdayPigs.BoundingBox.Left, doubleComparer);
|
||||
Assert.Equal(426.3d, birthdayPigs.BoundingBox.Top, doubleComparer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -80,6 +80,7 @@
|
||||
"UglyToad.PdfPig.Content.InlineImage",
|
||||
"UglyToad.PdfPig.Content.IPageFactory`1",
|
||||
"UglyToad.PdfPig.Content.IPdfImage",
|
||||
"UglyToad.PdfPig.Content.IBoundingBox",
|
||||
"UglyToad.PdfPig.Content.IResourceStore",
|
||||
"UglyToad.PdfPig.Content.Letter",
|
||||
"UglyToad.PdfPig.Content.MarkedContentElement",
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
|
||||
public class TestPdfImage : IPdfImage
|
||||
{
|
||||
public PdfRectangle Bounds { get; set; }
|
||||
public PdfRectangle BoundingBox { get; set; }
|
||||
|
||||
public int WidthInSamples { get; set; }
|
||||
|
||||
|
||||
@ -563,8 +563,8 @@
|
||||
|
||||
Assert.NotNull(image);
|
||||
|
||||
Assert.Equal(expectedBounds.BottomLeft, image.Bounds.BottomLeft);
|
||||
Assert.Equal(expectedBounds.TopRight, image.Bounds.TopRight);
|
||||
Assert.Equal(expectedBounds.BottomLeft, image.BoundingBox.BottomLeft);
|
||||
Assert.Equal(expectedBounds.TopRight, image.BoundingBox.TopRight);
|
||||
|
||||
Assert.Equal(imageBytes, image.RawMemory.ToArray());
|
||||
}
|
||||
@ -609,10 +609,10 @@
|
||||
Assert.Equal(2, page1Images.Count);
|
||||
|
||||
var image1 = page1Images[0];
|
||||
Assert.Equal(expectedBounds1, image1.Bounds);
|
||||
Assert.Equal(expectedBounds1, image1.BoundingBox);
|
||||
|
||||
var image2 = page1Images[1];
|
||||
Assert.Equal(expectedBounds2, image2.Bounds);
|
||||
Assert.Equal(expectedBounds2, image2.BoundingBox);
|
||||
|
||||
var page2Doc = document.GetPage(2);
|
||||
|
||||
@ -620,7 +620,7 @@
|
||||
|
||||
Assert.NotNull(image3);
|
||||
|
||||
Assert.Equal(expectedBounds3, image3.Bounds);
|
||||
Assert.Equal(expectedBounds3, image3.BoundingBox);
|
||||
|
||||
Assert.Equal(imageBytes, image1.RawMemory.ToArray());
|
||||
Assert.Equal(imageBytes, image2.RawMemory.ToArray());
|
||||
@ -696,8 +696,8 @@
|
||||
|
||||
Assert.NotNull(image);
|
||||
|
||||
Assert.Equal(expectedBounds.BottomLeft, image.Bounds.BottomLeft);
|
||||
Assert.Equal(expectedBounds.TopRight, image.Bounds.TopRight);
|
||||
Assert.Equal(expectedBounds.BottomLeft, image.BoundingBox.BottomLeft);
|
||||
Assert.Equal(expectedBounds.TopRight, image.BoundingBox.TopRight);
|
||||
|
||||
Assert.True(image.TryGetPng(out var png));
|
||||
Assert.NotNull(png);
|
||||
|
||||
15
src/UglyToad.PdfPig/Content/IBoundingBox.cs
Normal file
15
src/UglyToad.PdfPig/Content/IBoundingBox.cs
Normal file
@ -0,0 +1,15 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using UglyToad.PdfPig.Core;
|
||||
|
||||
/// <summary>
|
||||
/// Interface for classes with a bounding box
|
||||
/// </summary>
|
||||
public interface IBoundingBox
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the Bounding Box: The rectangle completely containing this object
|
||||
/// </summary>
|
||||
PdfRectangle BoundingBox { get; }
|
||||
}
|
||||
}
|
||||
@ -12,12 +12,8 @@
|
||||
/// <summary>
|
||||
/// An image in a PDF document, may be an <see cref="InlineImage"/> or a PostScript image XObject (<see cref="XObjectImage"/>).
|
||||
/// </summary>
|
||||
public interface IPdfImage
|
||||
public interface IPdfImage: IBoundingBox
|
||||
{
|
||||
/// <summary>
|
||||
/// The placement rectangle of the image in PDF coordinates.
|
||||
/// </summary>
|
||||
PdfRectangle Bounds { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The width of the image in samples.
|
||||
|
||||
@ -19,7 +19,7 @@
|
||||
private readonly Lazy<ReadOnlyMemory<byte>>? memoryFactory;
|
||||
|
||||
/// <inheritdoc />
|
||||
public PdfRectangle Bounds { get; }
|
||||
public PdfRectangle BoundingBox { get; }
|
||||
|
||||
/// <inheritdoc />
|
||||
public int WidthInSamples { get; }
|
||||
@ -69,7 +69,7 @@
|
||||
DictionaryToken streamDictionary,
|
||||
ColorSpaceDetails colorSpaceDetails)
|
||||
{
|
||||
Bounds = bounds;
|
||||
BoundingBox = bounds;
|
||||
WidthInSamples = widthInSamples;
|
||||
HeightInSamples = heightInSamples;
|
||||
Decode = decode;
|
||||
@ -124,7 +124,7 @@
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
return $"Inline Image (w {Bounds.Width}, h {Bounds.Height})";
|
||||
return $"Inline Image (w {BoundingBox.Width}, h {BoundingBox.Height})";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,171 +1,176 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using Core;
|
||||
using Graphics.Colors;
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using Core;
|
||||
using Graphics.Colors;
|
||||
using PdfFonts;
|
||||
|
||||
/// <summary>
|
||||
/// A glyph or combination of glyphs (characters) drawn by a PDF content stream.
|
||||
/// </summary>
|
||||
public class Letter
|
||||
{
|
||||
/// <summary>
|
||||
/// The text for this letter or unicode character.
|
||||
/// </summary>
|
||||
public string Value { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Text orientation of the letter.
|
||||
/// </summary>
|
||||
public TextOrientation TextOrientation { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The placement position of the character in PDF space. See <see cref="StartBaseLine"/>
|
||||
/// </summary>
|
||||
public PdfPoint Location => StartBaseLine;
|
||||
|
||||
/// <summary>
|
||||
/// The placement position of the character in PDF space (the start point of the baseline). See <see cref="Location"/>
|
||||
/// </summary>
|
||||
public PdfPoint StartBaseLine { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The end point of the baseline.
|
||||
/// </summary>
|
||||
public PdfPoint EndBaseLine { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The width occupied by the character within the PDF content.
|
||||
/// </summary>
|
||||
public double Width { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Position of the bounding box for the glyph, this is the box surrounding the visible glyph as it appears on the page.
|
||||
/// For example letters with descenders, p, j, etc., will have a box extending below the <see cref="Location"/> they are placed at.
|
||||
/// The width of the glyph may also be more or less than the <see cref="Width"/> allocated for the character in the PDF content.
|
||||
/// </summary>
|
||||
public PdfRectangle GlyphRectangle { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Size as defined in the PDF file. This is not equivalent to font size in points but is relative to other font sizes on the page.
|
||||
/// </summary>
|
||||
public double FontSize { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The name of the font.
|
||||
/// </summary>
|
||||
public string? FontName => Font?.Name;
|
||||
|
||||
/// <summary>
|
||||
/// Details about the font for this letter.
|
||||
/// </summary>
|
||||
public FontDetails Font { get; }
|
||||
/// <summary>
|
||||
/// A glyph or combination of glyphs (characters) drawn by a PDF content stream.
|
||||
/// </summary>
|
||||
public class Letter : IBoundingBox
|
||||
{
|
||||
/// <summary>
|
||||
/// The text for this letter or unicode character.
|
||||
/// </summary>
|
||||
public string Value { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Text rendering mode that indicates whether we should draw this letter's strokes,
|
||||
/// fill, both, neither (in case of hidden text), etc.
|
||||
/// If it calls for stroking the <see cref="StrokeColor" /> is used.
|
||||
/// If it calls for filling, the <see cref="FillColor"/> is used.
|
||||
/// In modes that perform both filling and stroking, the effect is as if each glyph outline were filled and then stroked in separate operations.
|
||||
/// </summary>
|
||||
public TextRenderingMode RenderingMode { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The primary color of the letter, which is either the <see cref="StrokeColor"/> in case
|
||||
/// <see cref="RenderingMode"/> is <see cref="TextRenderingMode.Stroke"/>, or otherwise
|
||||
/// it is the <see cref="FillColor"/>.
|
||||
/// </summary>
|
||||
public IColor Color { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Stroking color
|
||||
/// </summary>
|
||||
public IColor StrokeColor { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Non-stroking (fill) color
|
||||
/// </summary>
|
||||
public IColor FillColor { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The size of the font in points.
|
||||
/// </summary>
|
||||
public double PointSize { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Sequence number of the ShowText operation that printed this letter.
|
||||
/// </summary>
|
||||
public int TextSequence { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Create a new letter to represent some text drawn by the Tj operator.
|
||||
/// </summary>
|
||||
public Letter(string value, PdfRectangle glyphRectangle,
|
||||
PdfPoint startBaseLine,
|
||||
PdfPoint endBaseLine,
|
||||
double width,
|
||||
double fontSize,
|
||||
FontDetails font,
|
||||
TextRenderingMode renderingMode,
|
||||
IColor strokeColor,
|
||||
IColor fillColor,
|
||||
double pointSize,
|
||||
int textSequence)
|
||||
{
|
||||
Value = value;
|
||||
GlyphRectangle = glyphRectangle;
|
||||
StartBaseLine = startBaseLine;
|
||||
EndBaseLine = endBaseLine;
|
||||
Width = width;
|
||||
FontSize = fontSize;
|
||||
Font = font;
|
||||
RenderingMode = renderingMode;
|
||||
if (renderingMode == TextRenderingMode.Stroke)
|
||||
{
|
||||
Color = StrokeColor = strokeColor ?? GrayColor.Black;
|
||||
FillColor = fillColor;
|
||||
}
|
||||
else
|
||||
{
|
||||
Color = FillColor = fillColor ?? GrayColor.Black;
|
||||
StrokeColor = strokeColor;
|
||||
}
|
||||
PointSize = pointSize;
|
||||
TextSequence = textSequence;
|
||||
TextOrientation = GetTextOrientation();
|
||||
}
|
||||
|
||||
private TextOrientation GetTextOrientation()
|
||||
{
|
||||
if (System.Math.Abs(StartBaseLine.Y - EndBaseLine.Y) < 10e-5)
|
||||
{
|
||||
if (StartBaseLine.X > EndBaseLine.X)
|
||||
{
|
||||
return TextOrientation.Rotate180;
|
||||
}
|
||||
|
||||
return TextOrientation.Horizontal;
|
||||
}
|
||||
|
||||
if (System.Math.Abs(StartBaseLine.X - EndBaseLine.X) < 10e-5)
|
||||
{
|
||||
if (StartBaseLine.Y > EndBaseLine.Y)
|
||||
{
|
||||
return TextOrientation.Rotate90;
|
||||
}
|
||||
|
||||
return TextOrientation.Rotate270;
|
||||
}
|
||||
|
||||
return TextOrientation.Other;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Produces a string representation of the letter and its position.
|
||||
/// </summary>
|
||||
public override string ToString()
|
||||
{
|
||||
return $"{Value} {Location} {FontName} {PointSize}";
|
||||
}
|
||||
}
|
||||
}
|
||||
/// <summary>
|
||||
/// Text orientation of the letter.
|
||||
/// </summary>
|
||||
public TextOrientation TextOrientation { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The placement position of the character in PDF space. See <see cref="StartBaseLine"/>
|
||||
/// </summary>
|
||||
public PdfPoint Location => StartBaseLine;
|
||||
|
||||
/// <summary>
|
||||
/// The placement position of the character in PDF space (the start point of the baseline). See <see cref="Location"/>
|
||||
/// </summary>
|
||||
public PdfPoint StartBaseLine { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The end point of the baseline.
|
||||
/// </summary>
|
||||
public PdfPoint EndBaseLine { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The width occupied by the character within the PDF content.
|
||||
/// </summary>
|
||||
public double Width { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Position of the bounding box for the glyph, this is the box surrounding the visible glyph as it appears on the page.
|
||||
/// For example letters with descenders, p, j, etc., will have a box extending below the <see cref="Location"/> they are placed at.
|
||||
/// The width of the glyph may also be more or less than the <see cref="Width"/> allocated for the character in the PDF content.
|
||||
/// </summary>
|
||||
public PdfRectangle GlyphRectangle { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the Bounding Box: The rectangle completely containing this object
|
||||
/// </summary>
|
||||
public PdfRectangle BoundingBox => GlyphRectangle;
|
||||
|
||||
/// <summary>
|
||||
/// Size as defined in the PDF file. This is not equivalent to font size in points but is relative to other font sizes on the page.
|
||||
/// </summary>
|
||||
public double FontSize { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The name of the font.
|
||||
/// </summary>
|
||||
public string? FontName => Font?.Name;
|
||||
|
||||
/// <summary>
|
||||
/// Details about the font for this letter.
|
||||
/// </summary>
|
||||
public FontDetails Font { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Text rendering mode that indicates whether we should draw this letter's strokes,
|
||||
/// fill, both, neither (in case of hidden text), etc.
|
||||
/// If it calls for stroking the <see cref="StrokeColor" /> is used.
|
||||
/// If it calls for filling, the <see cref="FillColor"/> is used.
|
||||
/// In modes that perform both filling and stroking, the effect is as if each glyph outline were filled and then stroked in separate operations.
|
||||
/// </summary>
|
||||
public TextRenderingMode RenderingMode { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The primary color of the letter, which is either the <see cref="StrokeColor"/> in case
|
||||
/// <see cref="RenderingMode"/> is <see cref="TextRenderingMode.Stroke"/>, or otherwise
|
||||
/// it is the <see cref="FillColor"/>.
|
||||
/// </summary>
|
||||
public IColor Color { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Stroking color
|
||||
/// </summary>
|
||||
public IColor StrokeColor { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Non-stroking (fill) color
|
||||
/// </summary>
|
||||
public IColor FillColor { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The size of the font in points.
|
||||
/// </summary>
|
||||
public double PointSize { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Sequence number of the ShowText operation that printed this letter.
|
||||
/// </summary>
|
||||
public int TextSequence { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Create a new letter to represent some text drawn by the Tj operator.
|
||||
/// </summary>
|
||||
public Letter(string value, PdfRectangle glyphRectangle,
|
||||
PdfPoint startBaseLine,
|
||||
PdfPoint endBaseLine,
|
||||
double width,
|
||||
double fontSize,
|
||||
FontDetails font,
|
||||
TextRenderingMode renderingMode,
|
||||
IColor strokeColor,
|
||||
IColor fillColor,
|
||||
double pointSize,
|
||||
int textSequence)
|
||||
{
|
||||
Value = value;
|
||||
GlyphRectangle = glyphRectangle;
|
||||
StartBaseLine = startBaseLine;
|
||||
EndBaseLine = endBaseLine;
|
||||
Width = width;
|
||||
FontSize = fontSize;
|
||||
Font = font;
|
||||
RenderingMode = renderingMode;
|
||||
if (renderingMode == TextRenderingMode.Stroke)
|
||||
{
|
||||
Color = StrokeColor = strokeColor ?? GrayColor.Black;
|
||||
FillColor = fillColor;
|
||||
}
|
||||
else
|
||||
{
|
||||
Color = FillColor = fillColor ?? GrayColor.Black;
|
||||
StrokeColor = strokeColor;
|
||||
}
|
||||
PointSize = pointSize;
|
||||
TextSequence = textSequence;
|
||||
TextOrientation = GetTextOrientation();
|
||||
}
|
||||
|
||||
private TextOrientation GetTextOrientation()
|
||||
{
|
||||
if (System.Math.Abs(StartBaseLine.Y - EndBaseLine.Y) < 10e-5)
|
||||
{
|
||||
if (StartBaseLine.X > EndBaseLine.X)
|
||||
{
|
||||
return TextOrientation.Rotate180;
|
||||
}
|
||||
|
||||
return TextOrientation.Horizontal;
|
||||
}
|
||||
|
||||
if (System.Math.Abs(StartBaseLine.X - EndBaseLine.X) < 10e-5)
|
||||
{
|
||||
if (StartBaseLine.Y > EndBaseLine.Y)
|
||||
{
|
||||
return TextOrientation.Rotate90;
|
||||
}
|
||||
|
||||
return TextOrientation.Rotate270;
|
||||
}
|
||||
|
||||
return TextOrientation.Other;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Produces a string representation of the letter and its position.
|
||||
/// </summary>
|
||||
public override string ToString()
|
||||
{
|
||||
return $"{Value} {Location} {FontName} {PointSize}";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
/// <summary>
|
||||
/// A word.
|
||||
/// </summary>
|
||||
public class Word
|
||||
public class Word : IBoundingBox
|
||||
{
|
||||
/// <summary>
|
||||
/// The text of the word.
|
||||
|
||||
@ -19,7 +19,7 @@
|
||||
private readonly Lazy<ReadOnlyMemory<byte>>? memoryFactory;
|
||||
|
||||
/// <inheritdoc />
|
||||
public PdfRectangle Bounds { get; }
|
||||
public PdfRectangle BoundingBox { get; }
|
||||
|
||||
/// <inheritdoc />
|
||||
public int WidthInSamples { get; }
|
||||
@ -81,7 +81,7 @@
|
||||
Lazy<ReadOnlyMemory<byte>>? bytes,
|
||||
ColorSpaceDetails? colorSpaceDetails)
|
||||
{
|
||||
Bounds = bounds;
|
||||
BoundingBox = bounds;
|
||||
WidthInSamples = widthInSamples;
|
||||
HeightInSamples = heightInSamples;
|
||||
BitsPerComponent = bitsPerComponent;
|
||||
@ -116,7 +116,7 @@
|
||||
/// <inheritdoc />
|
||||
public override string ToString()
|
||||
{
|
||||
return $"XObject Image (w {Bounds.Width}, h {Bounds.Height}): {ImageDictionary}";
|
||||
return $"XObject Image (w {BoundingBox.Width}, h {BoundingBox.Height}): {ImageDictionary}";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user