merge pull request #72 from uglytoad/fix-export-formatting

fix export formatting
This commit is contained in:
Eliot Jones
2019-10-17 11:28:06 +01:00
committed by GitHub
54 changed files with 2906 additions and 5143 deletions

View File

@@ -4,6 +4,11 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
using DocumentLayoutAnalysis;
using Export;
using Xunit; using Xunit;
public class PigProductionHandbookTests public class PigProductionHandbookTests
@@ -32,7 +37,7 @@
var page = document.GetPage(1); var page = document.GetPage(1);
// Pinkish. // Pinkish.
var (r, g , b) = page.Letters[0].Color.ToRGBValues(); var (r, g, b) = page.Letters[0].Color.ToRGBValues();
Assert.Equal(1, r); Assert.Equal(1, r);
Assert.Equal(0.914m, g); Assert.Equal(0.914m, g);
@@ -98,7 +103,7 @@
[Fact] [Fact]
public void Page4HasCorrectWords() public void Page4HasCorrectWords()
{ {
var expected = WordsPage4.Split(new[] {"\r", "\r\n", "\n", " "}, StringSplitOptions.RemoveEmptyEntries); var expected = WordsPage4.Split(new[] { "\r", "\r\n", "\n", " " }, StringSplitOptions.RemoveEmptyEntries);
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff)) using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
{ {
var page = document.GetPage(4); var page = document.GetPage(4);
@@ -129,6 +134,41 @@
} }
} }
[Fact]
public void CanExportAltoXmlFormat()
{
    // Export page 4 through the ALTO exporter and check the result can be loaded as an XML document.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var altoExporter = new AltoXmlTextExporter(new NearestNeighbourWordExtractor(), new DocstrumBoundingBoxes());

        var altoXml = altoExporter.Get(pdf.GetPage(4), true);

        Assert.NotNull(altoXml);

        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);

        using (var stream = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(stream))
        {
            // Loading fails with an exception if the exported text is not well-formed XML.
            Assert.NotNull(XDocument.Load(reader));
        }
    }
}
[Fact]
public void CanExportAltoXmlFormatPage16()
{
    // Regression case: page 16 contains an unprintable string and a single line of text,
    // which causes problems for Docstrum.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var altoExporter = new AltoXmlTextExporter(new NearestNeighbourWordExtractor(), new DocstrumBoundingBoxes());

        var altoXml = altoExporter.Get(pdf.GetPage(16), true);

        Assert.NotNull(altoXml);

        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);

        using (var stream = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(stream))
        {
            // Loading fails with an exception if the exported text is not well-formed XML.
            Assert.NotNull(XDocument.Load(reader));
        }
    }
}
[Fact] [Fact]
public void LettersHaveCorrectPosition() public void LettersHaveCorrectPosition()
{ {

View File

@@ -77,7 +77,7 @@
"UglyToad.PdfPig.CrossReference.CrossReferenceType", "UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary", "UglyToad.PdfPig.CrossReference.TrailerDictionary",
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances", "UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
"UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB", "UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBoundingBoxes",
"UglyToad.PdfPig.DocumentLayoutAnalysis.DefaultPageSegmenter", "UglyToad.PdfPig.DocumentLayoutAnalysis.DefaultPageSegmenter",
"UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter", "UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter",
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions", "UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
@@ -87,10 +87,11 @@
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType", "UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException", "UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException", "UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
"UglyToad.PdfPig.Export.ITextExporter",
"UglyToad.PdfPig.Export.AltoXmlTextExporter", "UglyToad.PdfPig.Export.AltoXmlTextExporter",
"UglyToad.PdfPig.Export.HOcrTextExporter", "UglyToad.PdfPig.Export.HOcrTextExporter",
"UglyToad.PdfPig.Export.ITextExporter",
"UglyToad.PdfPig.Export.PageXmlTextExporter", "UglyToad.PdfPig.Export.PageXmlTextExporter",
"UglyToad.PdfPig.Export.Alto.AltoDocument",
"UglyToad.PdfPig.Fonts.DescriptorFontFile", "UglyToad.PdfPig.Fonts.DescriptorFontFile",
"UglyToad.PdfPig.Fonts.Exceptions.InvalidFontFormatException", "UglyToad.PdfPig.Fonts.Exceptions.InvalidFontFormatException",
"UglyToad.PdfPig.Fonts.FontDescriptor", "UglyToad.PdfPig.Fonts.FontDescriptor",

View File

@@ -11,6 +11,67 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// </summary> /// </summary>
internal class ClusteringAlgorithms internal class ClusteringAlgorithms
{ {
/// <summary>
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
/// https://en.wikipedia.org/wiki/Transitive_closure
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">List of elements to group.</param>
/// <param name="distMeasure">The distance measure between two points.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <returns>The groups of element indexes, as built by <c>GroupIndexes</c> from the nearest-neighbour links.</returns>
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> elements,
    Func<PdfPoint, PdfPoint, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every point (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Transitive closure
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // -1 marks "no paired neighbour"; entries are only overwritten when a valid pairing is found.
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
    var candidatesPoints = elements.Select(candidatesPoint).ToList();

    // 1. Find nearest neighbours indexes
    // Each iteration writes only to its own slot, indexes[e].
    Parallel.For(0, elements.Count, e =>
    {
        var pivot = elements[e];

        if (!filterPivot(pivot))
        {
            return;
        }

        int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);

        // FindIndexNearest may not find any neighbour (e.g. a single element) and then
        // reports an index that is not a valid position in the list; leave the pivot unpaired
        // instead of throwing from the indexer below.
        if (index < 0 || index >= elements.Count)
        {
            return;
        }

        var paired = elements[index];

        if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
        {
            indexes[e] = index;
        }
    });

    // 2. Group indexes
    return GroupIndexes(indexes);
}
/// <summary> /// <summary>
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance. /// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
/// https://en.wikipedia.org/wiki/Transitive_closure /// https://en.wikipedia.org/wiki/Transitive_closure
@@ -47,7 +108,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
*************************************************************************************/ *************************************************************************************/
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray(); int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList(); var candidatesPoints = elements.Select(candidatesPoint).ToList();
// 1. Find nearest neighbours indexes // 1. Find nearest neighbours indexes
Parallel.For(0, elements.Length, e => Parallel.For(0, elements.Length, e =>

View File

@@ -9,19 +9,21 @@ using UglyToad.PdfPig.Util;
namespace UglyToad.PdfPig.DocumentLayoutAnalysis namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{ {
/// <inheritdoc />
/// <summary> /// <summary>
/// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood /// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
/// clustering of connected components extracted from the document. /// clustering of connected components extracted from the document.
/// This implementation leverages bounding boxes and does not exactly replicates the original algorithm. /// This implementation leverages bounding boxes and does not exactly replicates the original algorithm.
/// <para>See 'The document spectrum for page layout analysis.' by L. OGorman.</para> /// <para>See 'The document spectrum for page layout analysis.' by L. O'Gorman.</para>
/// </summary> /// </summary>
public class DocstrumBB : IPageSegmenter public class DocstrumBoundingBoxes : IPageSegmenter
{ {
/// <summary> /// <summary>
/// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBB"/>. /// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBoundingBoxes"/>.
/// </summary> /// </summary>
public static DocstrumBB Instance { get; } = new DocstrumBB(); public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes();
/// <inheritdoc />
/// <summary> /// <summary>
/// Get the blocks. /// Get the blocks.
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para> /// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
@@ -30,73 +32,108 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <returns></returns> /// <returns></returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords) public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
{ {
return GetBlocks(pageWords, -30, 30, -135, -45, 1.3); return GetBlocks(pageWords, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3);
} }
/// <summary> /// <summary>
/// Get the blocks. See original paper for more information. /// Get the blocks. See original paper for more information.
/// </summary> /// </summary>
/// <param name="pageWords"></param> /// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
/// <param name="wlAngleLB">Within-line lower bound angle.</param> /// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
/// <param name="wlAngleUB">Within-line upper bound angle.</param> /// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
/// <param name="blAngleLB">Between-line lower bound angle.</param> /// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
/// <param name="blAngleUB">Between-line upper bound angle.</param>
/// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
/// text lines for blocking. Maximum distance will be this number times the between-line /// text lines for blocking. Maximum distance will be this number times the between-line
/// distance found by the analysis.</param> /// distance found by the analysis.</param>
/// <returns></returns> /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double wlAngleLB, double wlAngleUB, public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
double blAngleLB, double blAngleUB, double blMultiplier) AngleBounds betweenLine,
double betweenLineMultiplier)
{ {
if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance; if (words == null)
{
return EmptyArray<TextBlock>.Instance;
}
var pageWordsArr = pageWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToArray(); // remove white spaces var wordsList = new List<Word>();
var withinLineDistList = new ConcurrentBag<double[]>(); foreach (var word in words)
var betweenLineDistList = new ConcurrentBag<double[]>(); {
if (string.IsNullOrWhiteSpace(word.Text))
{
continue;
}
wordsList.Add(word);
}
if (wordsList.Count == 0)
{
return EmptyArray<TextBlock>.Instance;
}
var withinLineDistList = new ConcurrentBag<double>();
var betweenLineDistList = new ConcurrentBag<double>();
// 1. Estimate in line and between line spacing // 1. Estimate in line and between line spacing
Parallel.For(0, pageWordsArr.Length, i => Parallel.For(0, wordsList.Count, i =>
{ {
var word = pageWordsArr[i]; var word = wordsList[i];
// Within-line distance // Within-line distance
var pointWL = GetNearestPointData(pageWordsArr, word, var pointsWithinLine = GetNearestPointDistance(wordsList, word,
bb => bb.BottomRight, bb => bb.BottomRight, bb => bb.BottomRight, bb => bb.BottomRight,
bb => bb.BottomLeft, bb => bb.BottomLeft, bb => bb.BottomLeft, bb => bb.BottomLeft,
wlAngleLB, wlAngleUB, Distances.Horizontal); withinLine, Distances.Horizontal);
if (pointWL != null) withinLineDistList.Add(pointWL);
if (pointsWithinLine != null)
{
withinLineDistList.Add(pointsWithinLine.Value);
}
// Between-line distance // Between-line distance
var pointBL = GetNearestPointData(pageWordsArr, word, var pointsBetweenLine = GetNearestPointDistance(wordsList, word,
bb => bb.BottomLeft, bb => bb.Centroid, bb => bb.BottomLeft, bb => bb.Centroid,
bb => bb.TopLeft, bb => bb.Centroid, bb => bb.TopLeft, bb => bb.Centroid,
blAngleLB, blAngleUB, Distances.Vertical); betweenLine, Distances.Vertical);
if (pointBL != null) betweenLineDistList.Add(pointBL);
if (pointsBetweenLine != null)
{
betweenLineDistList.Add(pointsBetweenLine.Value);
}
}); });
double withinLineDistance = GetPeakAverageDistance(withinLineDistList); double? withinLineDistance = GetPeakAverageDistance(withinLineDistList);
double betweenLineDistance = GetPeakAverageDistance(betweenLineDistList); double? betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
if (withinLineDistance == null || betweenLineDistance == null)
{
return new[] {new TextBlock(new[] {new TextLine(wordsList)})};
}
// 2. Find lines of text // 2. Find lines of text
double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance); double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray(); var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine).ToArray();
// 3. Find blocks of text // 3. Find blocks of text
double maxDistBL = blMultiplier * betweenLineDistance; double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
var blocks = GetLinesGroups(lines, maxDistBL).ToList(); var blocks = GetLinesGroups(lines, maxDistanceBetweenLine).ToList();
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text. // 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
for (int b = 0; b < blocks.Count; b++) for (var b = 0; b < blocks.Count; b++)
{ {
if (blocks[b] == null) continue; if (blocks[b] == null)
for (int c = 0; c < blocks.Count; c++)
{ {
if (b == c) continue; continue;
if (blocks[c] == null) continue; }
if (AreRectangleOverlapping(blocks[b].BoundingBox, blocks[c].BoundingBox)) for (var c = 0; c < blocks.Count; c++)
{
if (b == c || blocks[c] == null)
{
continue;
}
if (blocks[b].BoundingBox.IntersectsWith(blocks[c].BoundingBox))
{ {
// Merge // Merge
// 1. Merge all words // 1. Merge all words
@@ -105,7 +142,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the // 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
// same block. Filtering will still be done based on angle. // same block. Filtering will still be done based on angle.
var mergedLines = GetLines(mergedWords.ToArray(), double.MaxValue, wlAngleLB, wlAngleUB); var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine);
blocks[b] = new TextBlock(mergedLines.ToList()); blocks[b] = new TextBlock(mergedLines.ToList());
// Remove // Remove
@@ -117,70 +154,57 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
return blocks.Where(b => b != null).ToList(); return blocks.Where(b => b != null).ToList();
} }
private bool AreRectangleOverlapping(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
if (rectangle1.Left > rectangle2.Right || rectangle2.Left > rectangle1.Right) return false;
if (rectangle1.Top < rectangle2.Bottom || rectangle2.Top < rectangle1.Bottom) return false;
return true;
}
/// <summary> /// <summary>
/// Get information on the nearest point, filtered for angle. /// Get information on the nearest point, filtered for angle.
/// </summary> /// </summary>
/// <param name="words"></param> private double? GetNearestPointDistance(List<Word> words, Word pivot, Func<PdfRectangle,
/// <param name="pivot"></param>
/// <param name="funcPivotDist"></param>
/// <param name="funcPivotAngle"></param>
/// <param name="funcPointsDist"></param>
/// <param name="funcPointsAngle"></param>
/// <param name="angleStart"></param>
/// <param name="angleEnd"></param>
/// <param name="finalDistMEasure"></param>
/// <returns></returns>
private double[] GetNearestPointData(Word[] words, Word pivot, Func<PdfRectangle,
PdfPoint> funcPivotDist, Func<PdfRectangle, PdfPoint> funcPivotAngle, PdfPoint> funcPivotDist, Func<PdfRectangle, PdfPoint> funcPivotAngle,
Func<PdfRectangle, PdfPoint> funcPointsDist, Func<PdfRectangle, PdfPoint> funcPointsAngle, Func<PdfRectangle, PdfPoint> funcPointsDist, Func<PdfRectangle, PdfPoint> funcPointsAngle,
double angleStart, double angleEnd, AngleBounds angleBounds,
Func<PdfPoint, PdfPoint, double> finalDistMEasure) Func<PdfPoint, PdfPoint, double> finalDistanceMeasure)
{ {
var pointR = funcPivotDist(pivot.BoundingBox); var pointR = funcPivotDist(pivot.BoundingBox);
// Filter by angle var pivotPoint = funcPivotAngle(pivot.BoundingBox);
var filtered = words.Where(w =>
{
var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox));
return (angleWL >= angleStart && angleWL <= angleEnd);
}).ToList();
filtered.Remove(pivot); // remove itself
if (filtered.Count > 0) var wordsWithinAngleBoundDistancePoints = new List<PdfPoint>();
{
int index = pointR.FindIndexNearest(
filtered.Select(w => funcPointsDist(w.BoundingBox)).ToList(),
Distances.Euclidean, out double distWL);
if (index >= 0) // Filter to words within the angle range.
foreach (var word in words)
{
// Ignore the pivot word.
if (ReferenceEquals(word, pivot))
{ {
var matchWL = filtered[index]; continue;
return new double[] }
{
(double)pivot.Letters.Select(l => l.FontSize).Mode(), var angle = Distances.Angle(pivotPoint, funcPointsAngle(word.BoundingBox));
finalDistMEasure(pointR, funcPointsDist(matchWL.BoundingBox))
}; if (angleBounds.Contains(angle))
{
wordsWithinAngleBoundDistancePoints.Add(funcPointsDist(word.BoundingBox));
} }
} }
return null;
if (wordsWithinAngleBoundDistancePoints.Count == 0)
{
return null;
}
var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, Distances.Euclidean, out _);
if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)
{
return null;
}
return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]);
} }
/// <summary> /// <summary>
/// Build lines via transitive closure. /// Build lines via transitive closure.
/// </summary> /// </summary>
/// <param name="words"></param> private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
/// <param name="maxDist"></param>
/// <param name="wlAngleLB"></param>
/// <param name="wlAngleUB"></param>
/// <returns></returns>
private IEnumerable<TextLine> GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB)
{ {
/*************************************************************************************************** /***************************************************************************************************
* /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not * /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
@@ -196,8 +220,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
pivot => true, pivot => true,
(pivot, candidate) => (pivot, candidate) =>
{ {
var angleWL = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle // Compare bottom right with bottom left for angle
return (angleWL >= wlAngleLB && angleWL <= wlAngleUB); var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper);
}).ToList(); }).ToList();
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList(); Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
@@ -214,7 +240,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList(); orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
} }
for (int a = 0; a < groupedIndexes.Count(); a++) for (var a = 0; a < groupedIndexes.Count; a++)
{ {
yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i]))); yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i])));
} }
@@ -223,10 +249,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
/// <summary> /// <summary>
/// Build blocks via transitive closure. /// Build blocks via transitive closure.
/// </summary> /// </summary>
/// <param name="lines"></param> private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
/// <param name="maxDist"></param>
/// <returns></returns>
private IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
{ {
/************************************************************************************************** /**************************************************************************************************
* We want to measure the distance between two lines using the following method: * We want to measure the distance between two lines using the following method:
@@ -249,11 +272,11 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
if (d < 0) return double.MaxValue; // not overlapping -> max distance if (d < 0) return double.MaxValue; // not overlapping -> max distance
return Distances.Euclidean( return Distances.Euclidean(
new PdfPoint(left + d / 2, l1.Point1.Y), new PdfPoint(left + d / 2, l1.Point1.Y),
new PdfPoint(left + d / 2, l2.Point1.Y)); new PdfPoint(left + d / 2, l2.Point1.Y));
}; };
var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines, var groupedIndexes = ClusteringAlgorithms.SimpleTransitiveClosure(lines,
euclidianOverlappingMiddleDistance, euclidianOverlappingMiddleDistance,
(pivot, candidate) => maxDist, (pivot, candidate) => maxDist,
pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight), pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
@@ -265,26 +288,72 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList()); yield return new TextBlock(groupedIndexes[a].Select(i => lines[i]).ToList());
} }
} }
/// <summary> /// <summary>
/// Get the average distance value of the peak bucket of the histogram. /// Get the average distance value of the peak bucket of the histogram.
/// </summary> /// </summary>
/// <param name="values">array[0]=font size, array[1]=distance</param> /// <param name="distances">The set of distances to average.</param>
/// <returns></returns> private static double? GetPeakAverageDistance(IEnumerable<double> distances)
private double GetPeakAverageDistance(IEnumerable<double[]> values)
{ {
int max = (int)values.Max(x => x[1]) + 1; var buckets = new Dictionary<int, List<double>>();
int[] distrib = new int[max]; foreach (var distance in distances)
// Create histogram with buckets of size 1.
for (int i = 0; i < max; i++)
{ {
distrib[i] = values.Where(x => x[1] > i && x[1] <= i + 1).Count(); var floor = (int)distance;
if (buckets.ContainsKey(floor))
{
buckets[floor].Add(distance);
}
else
{
buckets[floor] = new List<double> {distance};
}
} }
var peakIndex = Array.IndexOf(distrib, distrib.Max()); var best = default(List<double>);
return values.Where(v => v[1] > peakIndex && v[1] <= peakIndex + 1).Average(x => x[1]); foreach (var bucket in buckets)
{
if (best == null || bucket.Value.Count > best.Count)
{
best = bucket.Value;
}
}
return best?.Average();
}
/// <summary>
/// An inclusive range of angles, in degrees, that the angle between two words must fall in
/// for the words to have a certain type of relationship.
/// </summary>
public struct AngleBounds
{
    /// <summary>
    /// Create a new <see cref="AngleBounds"/>.
    /// </summary>
    /// <param name="lowerBound">The lower bound in degrees.</param>
    /// <param name="upperBound">The upper bound in degrees.</param>
    public AngleBounds(double lowerBound, double upperBound)
    {
        Lower = lowerBound;
        Upper = upperBound;
    }

    /// <summary>
    /// The lower bound in degrees.
    /// </summary>
    public double Lower { get; }

    /// <summary>
    /// The upper bound in degrees.
    /// </summary>
    public double Upper { get; }

    /// <summary>
    /// Whether the bounds contain the angle. Both bounds are inclusive.
    /// </summary>
    /// <param name="angle">The angle to check.</param>
    /// <returns><c>true</c> if the angle lies between <see cref="Lower"/> and <see cref="Upper"/>, bounds included.</returns>
    public bool Contains(double angle) => Lower <= angle && angle <= Upper;
}
} }
} }

View File

@@ -0,0 +1,32 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Alternative: a text value together with the purpose of that alternative.
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoAlternative
        {
            /// <summary>
            /// The purpose of the alternative. Serialized as the <c>PURPOSE</c> attribute.
            /// </summary>
            [XmlAttribute(AttributeName = "PURPOSE")]
            public string Purpose { get; set; }

            /// <summary>
            /// The value of the alternative. Serialized as the text content of the element.
            /// </summary>
            [XmlText]
            public string Value { get; set; }
        }
    }
}

View File

@@ -0,0 +1,144 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Schema;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Base type for any kind of block on the page.
        /// <para>
        /// Value-typed attributes are paired with a <c>...Specified</c> flag which is raised by the
        /// property setter, so that the XML serializer only writes attributes which were assigned.
        /// </para>
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlInclude(typeof(AltoTextBlock))]
        [XmlInclude(typeof(AltoGraphicalElement))]
        [XmlInclude(typeof(AltoIllustration))]
        [XmlInclude(typeof(AltoComposedBlock))]
        [Serializable]
        [DebuggerStepThrough]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoBlock : AltoPositionedElement
        {
            private float rotation;

            private bool correctionStatus;

            private AltoBlockTypeShow show;

            private AltoBlockTypeActuate actuate;

            /// <summary>
            /// The shape of the block. Carries no XML attribute mapping, so it is serialized as a child element.
            /// </summary>
            public AltoShape Shape { get; set; }

            /// <summary>
            /// The identifier of the block. Serialized as the <c>ID</c> attribute (schema type <c>ID</c>).
            /// </summary>
            [XmlAttribute("ID", DataType = "ID")]
            public string Id { get; set; }

            /// <summary>
            /// Style references. Serialized as the <c>STYLEREFS</c> attribute (schema type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("STYLEREFS", DataType = "IDREFS")]
            public string StyleRefs { get; set; }

            /// <summary>
            /// Tag references. Serialized as the <c>TAGREFS</c> attribute (schema type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("TAGREFS", DataType = "IDREFS")]
            public string TagRefs { get; set; }

            /// <summary>
            /// Processing references. Serialized as the <c>PROCESSINGREFS</c> attribute (schema type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
            public string ProcessingRefs { get; set; }

            /// <summary>
            /// The rotation of e.g. text or illustration within the block. The value is in degree counterclockwise.
            /// <para>Assigning <see cref="float.NaN"/> marks the rotation as not specified.</para>
            /// </summary>
            [XmlAttribute("ROTATION")]
            public float Rotation
            {
                get => rotation;
                set
                {
                    rotation = value;

                    // Keep the flag in sync with the current value: a NaN assigned after a real
                    // value must not leave the attribute flagged for serialization.
                    RotationSpecified = !float.IsNaN(value);
                }
            }

            /// <summary>
            /// Whether <see cref="Rotation"/> currently holds a value other than <see cref="float.NaN"/>.
            /// </summary>
            [XmlIgnore]
            public bool RotationSpecified { get; private set; }

            /// <summary>
            /// The next block in reading sequence on the page.
            /// </summary>
            [XmlAttribute("IDNEXT", DataType = "IDREF")]
            public string IdNext { get; set; }

            /// <summary>
            /// Correction Status. Indicates whether manual correction has been done or not.
            /// The correction status should be recorded at the highest level possible (Block, TextLine, String).
            /// </summary>
            [XmlAttribute("CS")]
            public bool CorrectionStatus
            {
                get => correctionStatus;
                set
                {
                    correctionStatus = value;
                    CorrectionStatusSpecified = true;
                }
            }

            /// <summary>
            /// Whether <see cref="CorrectionStatus"/> has been assigned.
            /// </summary>
            [XmlIgnore]
            public bool CorrectionStatusSpecified { get; private set; }

            /// <summary>
            /// Serialized as the <c>type</c> attribute in the xlink namespace.
            /// </summary>
            [XmlAttribute("type", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Type { get; set; }

            /// <summary>
            /// Serialized as the <c>href</c> attribute in the xlink namespace (schema type <c>anyURI</c>).
            /// </summary>
            [XmlAttribute("href", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink", DataType = "anyURI")]
            public string Href { get; set; }

            /// <summary>
            /// Serialized as the <c>role</c> attribute in the xlink namespace.
            /// </summary>
            [XmlAttribute("role", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Role { get; set; }

            /// <summary>
            /// Serialized as the <c>arcrole</c> attribute in the xlink namespace.
            /// </summary>
            [XmlAttribute("arcrole", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Arcrole { get; set; }

            /// <summary>
            /// Serialized as the <c>title</c> attribute in the xlink namespace.
            /// </summary>
            [XmlAttribute("title", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Title { get; set; }

            /// <summary>
            /// Serialized as the <c>show</c> attribute in the xlink namespace.
            /// </summary>
            [XmlAttribute("show", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public AltoBlockTypeShow Show
            {
                get => show;
                set
                {
                    show = value;
                    ShowSpecified = true;
                }
            }

            /// <summary>
            /// Whether <see cref="Show"/> has been assigned.
            /// </summary>
            [XmlIgnore]
            public bool ShowSpecified { get; private set; }

            /// <summary>
            /// Serialized as the <c>actuate</c> attribute in the xlink namespace.
            /// </summary>
            [XmlAttribute("actuate", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public AltoBlockTypeActuate Actuate
            {
                get => actuate;
                set
                {
                    actuate = value;
                    ActuateSpecified = true;
                }
            }

            /// <summary>
            /// Whether <see cref="Actuate"/> has been assigned.
            /// </summary>
            [XmlIgnore]
            public bool ActuateSpecified { get; private set; }
        }
    }
}

View File

@@ -0,0 +1,31 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto/xlink] Block Type Actuate: the values allowed for the xlink <c>actuate</c> attribute of a block.
        /// </summary>
        [Serializable]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(AnonymousType = true, Namespace = "http://www.w3.org/1999/xlink")]
        public enum AltoBlockTypeActuate
        {
            /// <summary>
            /// Serialized as <c>onLoad</c>.
            /// </summary>
            [XmlEnum(Name = "onLoad")]
            OnLoad = 0,

            /// <summary>
            /// Serialized as <c>onRequest</c>.
            /// </summary>
            [XmlEnum(Name = "onRequest")]
            OnRequest = 1,

            /// <summary>
            /// Serialized as <c>other</c>.
            /// </summary>
            [XmlEnum(Name = "other")]
            Other = 2,

            /// <summary>
            /// Serialized as <c>none</c>.
            /// </summary>
            [XmlEnum(Name = "none")]
            None = 3,
        }
    }
}

View File

@@ -0,0 +1,34 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto/xlink] Block Type Show: the values allowed for the xlink <c>show</c> attribute of a block.
        /// </summary>
        [Serializable]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(AnonymousType = true, Namespace = "http://www.w3.org/1999/xlink")]
        public enum AltoBlockTypeShow
        {
            /// <summary>
            /// Serialized as <c>new</c>.
            /// </summary>
            [XmlEnum(Name = "new")]
            New = 0,

            /// <summary>
            /// Serialized as <c>replace</c>.
            /// </summary>
            [XmlEnum(Name = "replace")]
            Replace = 1,

            /// <summary>
            /// Serialized as <c>embed</c>.
            /// </summary>
            [XmlEnum(Name = "embed")]
            Embed = 2,

            /// <summary>
            /// Serialized as <c>other</c>.
            /// </summary>
            [XmlEnum(Name = "other")]
            Other = 3,

            /// <summary>
            /// Serialized as <c>none</c>.
            /// </summary>
            [XmlEnum(Name = "none")]
            None = 4
        }
    }
}

View File

@@ -0,0 +1,32 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A circle shape. <see cref="HorizontalPosition"/> and <see cref="VerticalPosition"/> describe the center of the circle.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoCircle
{
/// <summary>
/// Horizontal coordinate of the center of the circle. Serialized as the <c>HPOS</c> attribute.
/// </summary>
[XmlAttribute("HPOS")]
public float HorizontalPosition { get; set; }
/// <summary>
/// Vertical coordinate of the center of the circle. Serialized as the <c>VPOS</c> attribute.
/// </summary>
[XmlAttribute("VPOS")]
public float VerticalPosition { get; set; }
/// <summary>
/// Radius of the circle. Serialized as the <c>RADIUS</c> attribute.
/// </summary>
[XmlAttribute("RADIUS")]
public float Radius { get; set; }
}
}
}

View File

@@ -0,0 +1,45 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A block that consists of other blocks.
/// <para>Known limitation: the circular group reference that lets a composed block contain
/// further blocks was removed from the xsd, so this class declares no child-block members.
/// It still needs to be added back.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoComposedBlock : AltoBlock
{
// TODO: restore support for nested blocks.
/*****************************************************************
* NOTE(review): the circular group reference below was removed
* from the xsd, which is why no member of this class holds the
* blocks that make up the composed block. It needs to be added back:
* <xsd:sequence minOccurs="0" maxOccurs="unbounded">
* <xsd:group ref="BlockGroup"/>
* </xsd:sequence>
*****************************************************************/
/// <summary>
/// A user defined string to identify the type of composed block (e.g. table, advertisement, ...).
/// Serialized as the <c>TYPE</c> attribute.
/// </summary>
[XmlAttribute("TYPE")]
public string TypeComposed { get; set; }
/// <summary>
/// An ID to link to an image which contains only the composed block.
/// The ID and the file link are defined in the related METS file.
/// Serialized as the <c>FILEID</c> attribute.
/// </summary>
[XmlAttribute("FILEID")]
public string FileId { get; set; }
}
}
}

View File

@@ -0,0 +1,37 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Description: the measurement unit, source image information and processing
/// information of the document.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoDescription
{
/// <summary>
/// The measurement unit. Carries no explicit XML mapping attribute, so the serializer's
/// default mapping for the property applies.
/// </summary>
public AltoMeasurementUnit MeasurementUnit { get; set; }
/// <summary>
/// Information about the source image. Serialized as the <c>sourceImageInformation</c> element.
/// </summary>
[XmlElement("sourceImageInformation")]
public AltoSourceImageInformation SourceImageInformation { get; set; }
/// <summary>
/// Element deprecated. 'Processing' should be used instead.
/// Serialized as repeated <c>OCRProcessing</c> elements.
/// </summary>
[XmlElement("OCRProcessing")]
public AltoDescriptionOcrProcessing[] OcrProcessing { get; set; }
/// <summary>
/// The processing steps. Serialized as repeated <c>Processing</c> elements.
/// </summary>
[XmlElement("Processing")]
public AltoDescriptionProcessing[] Processings { get; set; }
}
}
}

View File

@@ -0,0 +1,26 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Description Ocr Processing
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoDescriptionOcrProcessing : AltoOcrProcessing
{
    /// <summary>
    /// Identifier of this element. Serialized as the <c>ID</c> attribute.
    /// </summary>
    // The attribute name is given explicitly: without it the serializer falls back to the
    // property name ("Id"), which does not match the upper case "ID" attribute used by
    // every other identifier in the Alto model.
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }
}
}
}

View File

@@ -0,0 +1,27 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Description Processing: a processing step that additionally carries an identifier.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoDescriptionProcessing : AltoProcessingStep
{
/// <summary>
/// Identifier of this processing step. Serialized as the <c>ID</c> attribute.
/// </summary>
[XmlAttribute("ID", DataType = "ID")]
public string Id { get; set; }
}
}
}

View File

@@ -0,0 +1,34 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A unique identifier for the document.
/// <para>This identifier must be unique within the local system.
/// To facilitate file sharing or interoperability with other systems,
/// documentIdentifierLocation may be added to designate the system or
/// application where the identifier is unique.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoDocumentIdentifier
{
/// <summary>
/// A location qualifier, i.e., a namespace.
/// Serialized as the <c>documentIdentifierLocation</c> attribute.
/// </summary>
[XmlAttribute("documentIdentifierLocation")]
public string DocumentIdentifierLocation { get; set; }
/// <summary>
/// The identifier itself. Serialized as the text content of the element.
/// </summary>
[XmlText]
public string Value { get; set; }
}
}
}

View File

@@ -0,0 +1,57 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] An ellipse shape. HPOS and VPOS describe the center of the ellipse.
/// HLENGTH and VLENGTH are the width and height of the described ellipse.
/// <para>The attribute ROTATION tells the rotation of the e.g. text or
/// illustration within the block. The value is in degrees counterclockwise.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoEllipse
{
    private float rotation;

    /// <summary>
    /// Horizontal coordinate of the center of the ellipse. Serialized as the <c>HPOS</c> attribute.
    /// </summary>
    [XmlAttribute("HPOS")]
    public float HorizontalPosition { get; set; }

    /// <summary>
    /// Vertical coordinate of the center of the ellipse. Serialized as the <c>VPOS</c> attribute.
    /// </summary>
    [XmlAttribute("VPOS")]
    public float VerticalPosition { get; set; }

    /// <summary>
    /// Width of the ellipse. Serialized as the <c>HLENGTH</c> attribute.
    /// </summary>
    [XmlAttribute("HLENGTH")]
    public float HorizontalLength { get; set; }

    /// <summary>
    /// Height of the ellipse. Serialized as the <c>VLENGTH</c> attribute.
    /// </summary>
    [XmlAttribute("VLENGTH")]
    public float VerticalLength { get; set; }

    /// <summary>
    /// Rotation in degrees counterclockwise. Serialized as the <c>ROTATION</c> attribute.
    /// Assigning any value other than NaN switches <see cref="RotationSpecified"/> on.
    /// </summary>
    [XmlAttribute("ROTATION")]
    public float Rotation
    {
        get => rotation;
        set
        {
            rotation = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                RotationSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Rotation"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool RotationSpecified { get; private set; }
}
}
}

View File

@@ -0,0 +1,33 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A unique identifier for the image file. This is drawn from MIX.
///
/// <para>This identifier must be unique within the local system.
/// To facilitate file sharing or interoperability with other systems,
/// fileIdentifierLocation may be added to designate the system or
/// application where the identifier is unique.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoFileIdentifier
{
/// <summary>
/// Designates the system or application where the identifier is unique.
/// Serialized as the <c>fileIdentifierLocation</c> attribute.
/// </summary>
[XmlAttribute("fileIdentifierLocation")]
public string FileIdentifierLocation { get; set; }
/// <summary>
/// The identifier itself. Serialized as the text content of the element.
/// </summary>
[XmlText]
public string Value { get; set; }
}
}
}

View File

@@ -0,0 +1,50 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Font styles. This is a flags enumeration, so several styles can be combined in one value.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Flags]
[Serializable]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoFontStyles
{
    /// <summary>
    /// Bold. Serialized as <c>bold</c>.
    /// </summary>
    [XmlEnum("bold")]
    Bold = 1 << 0,

    /// <summary>
    /// Italics. Serialized as <c>italics</c>.
    /// </summary>
    [XmlEnum("italics")]
    Italics = 1 << 1,

    /// <summary>
    /// Subscript. Serialized as <c>subscript</c>.
    /// </summary>
    [XmlEnum("subscript")]
    Subscript = 1 << 2,

    /// <summary>
    /// Superscript. Serialized as <c>superscript</c>.
    /// </summary>
    [XmlEnum("superscript")]
    Superscript = 1 << 3,

    /// <summary>
    /// Small caps. Serialized as <c>smallcaps</c>.
    /// </summary>
    [XmlEnum("smallcaps")]
    SmallCaps = 1 << 4,

    /// <summary>
    /// Underline. Serialized as <c>underline</c>.
    /// </summary>
    [XmlEnum("underline")]
    Underline = 1 << 5
}
}
}

View File

@@ -0,0 +1,29 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Font type (Serif or Sans-Serif).
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoFontType
{
    /// <summary>
    /// Serif. Serialized as <c>serif</c>.
    /// </summary>
    [XmlEnum("serif")]
    Serif = 0,

    /// <summary>
    /// Sans-serif. Serialized as <c>sans-serif</c>.
    /// </summary>
    [XmlEnum("sans-serif")]
    SansSerif = 1
}
}
}

View File

@@ -0,0 +1,29 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Font width (Fixed or proportional).
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoFontWidth
{
    /// <summary>
    /// Proportional. Serialized as <c>proportional</c>.
    /// </summary>
    [XmlEnum("proportional")]
    Proportional = 0,

    /// <summary>
    /// Fixed. Serialized as <c>fixed</c>.
    /// </summary>
    [XmlEnum("fixed")]
    Fixed = 1
}
}
}

View File

@@ -0,0 +1,90 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Modern OCR software stores information on glyph level. A glyph is essentially a character or ligature.
/// Accordingly the value for the glyph element will be defined as follows:
/// Pre-composed representation = base + combining character(s) (decomposed representation)
/// See http://www.fileformat.info/info/unicode/char/0101/index.htm
/// "U+0101" = (U+0061) + (U+0304)
/// "combining characters" ("base characters" in combination with non-spacing marks or characters which are combined to one) are represented as one "glyph", e.g. áàâ.
///
/// <para>Each glyph has its own coordinate information and must be separately addressable as a distinct object.
/// Correction and verification processes can be carried out for individual characters.</para>
///
/// <para>Post-OCR analysis of the text as well as adaptive OCR algorithms must be able to record information on glyph level.
/// In order to reproduce the decision of the OCR software, optional characters must be recorded. These are called variants.
/// The OCR software evaluates each variant and picks the one with the highest confidence score as the glyph.
/// The confidence score expresses how confident the OCR software is that a single glyph has been recognized correctly.</para>
///
/// <para>The glyph elements are in the order of the word. Each glyph needs to be recorded to build up the whole word sequence.</para>
///
/// <para>The glyph's CONTENT attribute is no replacement for the string's CONTENT attribute.
/// Due to post-processing steps such as correction the values of both attributes may be inconsistent.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoGlyph : AltoPositionedElement
{
    private float gc;

    /// <summary>
    /// Shape of the glyph.
    /// </summary>
    public AltoShape Shape { get; set; }

    /// <summary>
    /// Alternative (combined) character for the glyph, outlined by OCR engine or similar recognition processes.
    /// In case the variant consists of two (combining) characters, both characters are outlined in one Variant element.
    /// E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn".
    /// <para>For details on different use-cases see the samples on GitHub.</para>
    /// </summary>
    [XmlElement("Variant")]
    public AltoVariant[] Variant { get; set; }

    /// <summary>
    /// Identifier of the glyph. Serialized as the <c>ID</c> attribute.
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// CONTENT contains the precomposed representation (combining character) of the character from the parent String element.
    /// The sequence position of the Glyph element matches the position of the character in the String.
    /// </summary>
    [XmlAttribute("CONTENT")]
    public string Content { get; set; }

    /// <summary>
    /// This GC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence for the variant, where 1 is certain.
    /// This attribute is optional. If it is not available, the default value for the variant is "0".
    ///
    /// <para>The GC attribute semantic is the same as the WC attribute on the String element and VC on Variant element.</para>
    /// <para>Assigning any value other than NaN switches <see cref="GcSpecified"/> on.</para>
    /// </summary>
    [XmlAttribute("GC")]
    public float Gc
    {
        get => gc;
        set
        {
            gc = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                GcSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Gc"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool GcSpecified { get; private set; }

    /// <summary>
    /// Returns <see cref="Content"/>.
    /// </summary>
    public override string ToString() => Content;
}
}
}

View File

@@ -0,0 +1,21 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A graphic used to separate blocks. Usually a line or rectangle.
/// <para>Declares no members of its own; everything is inherited from <see cref="AltoBlock"/>.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoGraphicalElement : AltoBlock
{
}
}
}

View File

@@ -0,0 +1,32 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A picture or image.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoIllustration : AltoBlock
{
/// <summary>
/// A user defined string to identify the type of illustration like photo, map, drawing, chart, ...
/// Serialized as the <c>TYPE</c> attribute.
/// </summary>
[XmlAttribute("TYPE")]
public string IllustrationType { get; set; }
/// <summary>
/// A link to an image which contains only the illustration.
/// Serialized as the <c>FILEID</c> attribute.
/// </summary>
[XmlAttribute("FILEID")]
public string FileId { get; set; }
}
}
}

View File

@@ -0,0 +1,44 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] The following kinds of tag are available:
/// <para>LayoutTag: criteria about arrangement or graphical appearance.</para>
/// <para>StructureTag: criteria about grouping or formation.</para>
/// <para>RoleTag: criteria about function or mission.</para>
/// <para>NamedEntityTag: criteria about assignment of terms to their relationship / meaning (NER).</para>
/// <para>OtherTag: criteria about any other characteristic not listed above; the TYPE attribute is
/// intended to be used for classification within those.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#", IncludeInSchema = false)]
public enum AltoItemsChoice
{
    /// <summary>
    /// Criteria about arrangement or graphical appearance.
    /// </summary>
    LayoutTag = 0,

    /// <summary>
    /// Criteria about assignment of terms to their relationship / meaning (NER).
    /// </summary>
    NamedEntityTag = 1,

    /// <summary>
    /// Criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those.
    /// </summary>
    OtherTag = 2,

    /// <summary>
    /// Criteria about function or mission.
    /// </summary>
    RoleTag = 3,

    /// <summary>
    /// Criteria about grouping or formation.
    /// </summary>
    StructureTag = 4
}
}
}

View File

@@ -0,0 +1,28 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Layout: holds the pages of the document.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoLayout
{
/// <summary>
/// The pages. Serialized as repeated <c>Page</c> elements.
/// </summary>
[XmlElement("Page")]
public AltoPage[] Pages { get; set; }
/// <summary>
/// Style references. Serialized as the <c>STYLEREFS</c> attribute with the XML data type IDREFS.
/// </summary>
[XmlAttribute("STYLEREFS", DataType = "IDREFS")]
public string StyleRefs { get; set; }
}
}
}

View File

@@ -0,0 +1,47 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] All measurement values inside the alto file are related to this unit, except the font size.
///
/// <para>Coordinates as being used in HPOS and VPOS are absolute coordinates referring to the upper-left corner of a page.
/// The upper left corner of the page is defined as coordinate (0/0).</para>
///
/// <para>Meaning of the values:
/// mm10: 1/10th of millimeter;
/// inch1200: 1/1200th of inch;
/// pixel: 1 pixel.</para>
///
/// <para>The values for pixel will be related to the resolution of the image based
/// on which the layout is described. In case the original image is not known
/// the scaling factor can be calculated based on total width and height of
/// the image and the according information of the PAGE element.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoMeasurementUnit
{
    /// <summary>
    /// 1 pixel. Serialized as <c>pixel</c>.
    /// </summary>
    [XmlEnum("pixel")]
    Pixel = 0,

    /// <summary>
    /// 1/10th of millimeter. Serialized as <c>mm10</c>.
    /// </summary>
    [XmlEnum("mm10")]
    Mm10 = 1,

    /// <summary>
    /// 1/1200th of inch. Serialized as <c>inch1200</c>.
    /// </summary>
    [XmlEnum("inch1200")]
    Inch1200 = 2
}
}
}

View File

@@ -0,0 +1,34 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Ocr Processing
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoOcrProcessing
{
    /// <summary>
    /// Pre-processing steps. Serialized as repeated <c>preProcessingStep</c> elements.
    /// </summary>
    [XmlElement("preProcessingStep")]
    public AltoProcessingStep[] PreProcessingStep { get; set; }

    /// <summary>
    /// The OCR processing step. Serialized as the <c>ocrProcessingStep</c> element.
    /// </summary>
    // The element name is given explicitly: without it the serializer falls back to the
    // property name ("OcrProcessingStep"), which does not match the lower camel case
    // naming used by the sibling pre/post processing step elements.
    [XmlElement("ocrProcessingStep")]
    public AltoProcessingStep OcrProcessingStep { get; set; }

    /// <summary>
    /// Post-processing steps. Serialized as repeated <c>postProcessingStep</c> elements.
    /// </summary>
    [XmlElement("postProcessingStep")]
    public AltoProcessingStep[] PostProcessingStep { get; set; }
}
}
}

View File

@@ -0,0 +1,194 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] One page of a document.
/// <para>Optional attributes are paired with an <c>...Specified</c> flag which is switched on
/// by the corresponding setter and tells the serializer whether to write the attribute.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoPage
{
    private float height;
    private float width;
    private AltoQuality quality;
    private AltoPosition position;
    private float accuracy;
    private float pc;

    /// <summary>
    /// The area between the top line of print and the upper edge of the leaf. It may contain page number or running title.
    /// </summary>
    public AltoPageSpace TopMargin { get; set; }

    /// <summary>
    /// The area between the printspace and the left border of a page. May contain margin notes.
    /// </summary>
    public AltoPageSpace LeftMargin { get; set; }

    /// <summary>
    /// The area between the printspace and the right border of a page. May contain margin notes.
    /// </summary>
    public AltoPageSpace RightMargin { get; set; }

    /// <summary>
    /// The area between the bottom line of letterpress or writing and the bottom edge of the leaf.
    /// It may contain a page number, a signature number or a catch word.
    /// </summary>
    public AltoPageSpace BottomMargin { get; set; }

    /// <summary>
    /// Rectangle covering the printed area of a page. Page number and running title are not part of the print space.
    /// </summary>
    public AltoPageSpace PrintSpace { get; set; }

    /// <summary>
    /// Identifier of the page. Serialized as the <c>ID</c> attribute.
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// Any user-defined class like title page.
    /// </summary>
    [XmlAttribute("PAGECLASS")]
    public string PageClass { get; set; }

    /// <summary>
    /// Style references. Serialized as the <c>STYLEREFS</c> attribute with the XML data type IDREFS.
    /// </summary>
    [XmlAttribute("STYLEREFS", DataType = "IDREFS")]
    public string StyleRefs { get; set; }

    /// <summary>
    /// Processing references. Serialized as the <c>PROCESSINGREFS</c> attribute with the XML data type IDREFS.
    /// </summary>
    [XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
    public string ProcessingRefs { get; set; }

    /// <summary>
    /// Height of the page. Assigning any value other than NaN switches <see cref="HeightSpecified"/> on.
    /// </summary>
    [XmlAttribute("HEIGHT")]
    public float Height
    {
        get => height;
        set
        {
            height = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                HeightSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Height"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool HeightSpecified { get; private set; }

    /// <summary>
    /// Width of the page. Assigning any value other than NaN switches <see cref="WidthSpecified"/> on.
    /// </summary>
    [XmlAttribute("WIDTH")]
    public float Width
    {
        get => width;
        set
        {
            width = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                WidthSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Width"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool WidthSpecified { get; private set; }

    /// <summary>
    /// The number of the page within the document.
    /// </summary>
    [XmlAttribute("PHYSICAL_IMG_NR")]
    public float PhysicalImgNr { get; set; }

    /// <summary>
    /// The page number that is printed on the page.
    /// </summary>
    [XmlAttribute("PRINTED_IMG_NR")]
    public string PrintedImgNr { get; set; }

    /// <summary>
    /// Quality of the page. Every assignment switches <see cref="QualitySpecified"/> on.
    /// </summary>
    [XmlAttribute("QUALITY")]
    public AltoQuality Quality
    {
        get => quality;
        set
        {
            quality = value;
            QualitySpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="Quality"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool QualitySpecified { get; private set; }

    /// <summary>
    /// Free text detail about the quality. Serialized as the <c>QUALITY_DETAIL</c> attribute.
    /// </summary>
    [XmlAttribute("QUALITY_DETAIL")]
    public string QualityDetail { get; set; }

    /// <summary>
    /// Position of the page. Every assignment switches <see cref="PositionSpecified"/> on.
    /// </summary>
    [XmlAttribute("POSITION")]
    public AltoPosition Position
    {
        get => position;
        set
        {
            position = value;
            PositionSpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="Position"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool PositionSpecified { get; private set; }

    /// <summary>
    /// A link to the processing description that has been used for this page.
    /// </summary>
    [XmlAttribute("PROCESSING", DataType = "IDREF")]
    public string Processing { get; set; }

    /// <summary>
    /// Estimated percentage of OCR Accuracy in range from 0 to 100.
    /// Assigning any value other than NaN switches <see cref="AccuracySpecified"/> on.
    /// </summary>
    [XmlAttribute("ACCURACY")]
    public float Accuracy
    {
        get => accuracy;
        set
        {
            accuracy = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                AccuracySpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Accuracy"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool AccuracySpecified { get; private set; }

    /// <summary>
    /// Value of the <c>PC</c> attribute (presumably the page confidence; to be confirmed against the Alto schema).
    /// Assigning any value other than NaN switches <see cref="PcSpecified"/> on.
    /// </summary>
    [XmlAttribute("PC")]
    public float Pc
    {
        get => pc;
        set
        {
            pc = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                PcSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Pc"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool PcSpecified { get; private set; }
}
}
}

View File

@@ -0,0 +1,54 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A region on a page.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoPageSpace : AltoPositionedElement
{
/// <summary>
/// Shape of the region.
/// </summary>
public AltoShape Shape { get; set; }
/// <summary>
/// Text blocks in the region. Serialized as repeated <c>TextBlock</c> elements.
/// </summary>
[XmlElement("TextBlock")]
public AltoTextBlock[] TextBlock { get; set; }
/// <summary>
/// Illustrations in the region. Serialized as repeated <c>Illustration</c> elements.
/// </summary>
[XmlElement("Illustration")]
public AltoIllustration[] Illustrations { get; set; }
/// <summary>
/// Graphical elements in the region. Serialized as repeated <c>GraphicalElement</c> elements.
/// </summary>
[XmlElement("GraphicalElement")]
public AltoGraphicalElement[] GraphicalElements { get; set; }
/// <summary>
/// Composed blocks in the region. Serialized as repeated <c>ComposedBlock</c> elements.
/// </summary>
[XmlElement("ComposedBlock")]
public AltoComposedBlock[] ComposedBlocks { get; set; }
/// <summary>
/// Identifier of the region. Serialized as the <c>ID</c> attribute.
/// </summary>
[XmlAttribute("ID", DataType = "ID")]
public string Id { get; set; }
/// <summary>
/// Style references. Serialized as the <c>STYLEREFS</c> attribute with the XML data type IDREFS.
/// </summary>
[XmlAttribute("STYLEREFS", DataType = "IDREFS")]
public string StyleRefs { get; set; }
/// <summary>
/// Processing references. Serialized as the <c>PROCESSINGREFS</c> attribute with the XML data type IDREFS.
/// </summary>
[XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
public string ProcessingRefs { get; set; }
}
}
}

View File

@@ -0,0 +1,121 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A paragraph style defines formatting properties of text blocks.
/// <para>Optional attributes are paired with an <c>...Specified</c> flag which is switched on
/// by the corresponding setter and tells the serializer whether to write the attribute.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoParagraphStyle
{
    private AltoParagraphStyleAlign align;
    private float left;
    private float right;
    private float linespace;
    private float firstLine;

    /// <summary>
    /// Identifier of the style. Serialized as the <c>ID</c> attribute.
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// Indicates the alignment of the paragraph. Could be left, right, center or justify.
    /// Every assignment switches <see cref="AlignSpecified"/> on.
    /// </summary>
    [XmlAttribute("ALIGN")]
    public AltoParagraphStyleAlign Align
    {
        get => align;
        set
        {
            align = value;
            AlignSpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="Align"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool AlignSpecified { get; private set; }

    /// <summary>
    /// Left indent of the paragraph in relation to the column.
    /// Assigning any value other than NaN switches <see cref="LeftSpecified"/> on.
    /// </summary>
    [XmlAttribute("LEFT")]
    public float Left
    {
        get => left;
        set
        {
            left = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                LeftSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Left"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool LeftSpecified { get; private set; }

    /// <summary>
    /// Right indent of the paragraph in relation to the column.
    /// Assigning any value other than NaN switches <see cref="RightSpecified"/> on.
    /// </summary>
    [XmlAttribute("RIGHT")]
    public float Right
    {
        get => right;
        set
        {
            right = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                RightSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Right"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool RightSpecified { get; private set; }

    /// <summary>
    /// Line spacing between two lines of the paragraph. Measurement calculated from baseline to baseline.
    /// Assigning any value other than NaN switches <see cref="LineSpaceSpecified"/> on.
    /// </summary>
    [XmlAttribute("LINESPACE")]
    public float LineSpace
    {
        get => linespace;
        set
        {
            linespace = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                LineSpaceSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="LineSpace"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool LineSpaceSpecified { get; private set; }

    /// <summary>
    /// Indent of the first line of the paragraph if this is different from the other lines. A negative
    /// value indicates an indent to the left, a positive value indicates an indent to the right.
    /// Assigning any value other than NaN switches <see cref="FirstLineSpecified"/> on.
    /// </summary>
    [XmlAttribute("FIRSTLINE")]
    public float FirstLine
    {
        get => firstLine;
        set
        {
            firstLine = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                FirstLineSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="FirstLine"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool FirstLineSpecified { get; private set; }
}
}
}

View File

@@ -0,0 +1,27 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Indicates the alignment of the paragraph. Could be left, right, center or justify.
/// The members carry no <see cref="XmlEnumAttribute"/>, so they are serialized under their own names.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoParagraphStyleAlign
{
    /// <summary>
    /// Left aligned.
    /// </summary>
    Left = 0,

    /// <summary>
    /// Right aligned.
    /// </summary>
    Right = 1,

    /// <summary>
    /// Centered.
    /// </summary>
    Center = 2,

    /// <summary>
    /// Block (presumably the "justify" option mentioned in the type summary).
    /// </summary>
    Block = 3
}
}
}

View File

@@ -0,0 +1,24 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] A polygon shape.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoPolygon
{
/// <summary>
/// The points of the polygon as a single string. Serialized as the <c>POINTS</c> attribute.
/// NOTE(review): the expected string format is not defined here - confirm against the Alto schema.
/// </summary>
[XmlAttribute("POINTS")]
public string Points { get; set; }
}
}
}

View File

@@ -0,0 +1,39 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Position of the page. Could be lefthanded, righthanded, cover, foldout or single if it has no special position.
/// The members carry no <see cref="XmlEnumAttribute"/>, so they are serialized under their own names.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoPosition
{
    /// <summary>
    /// Left page.
    /// </summary>
    Left = 0,

    /// <summary>
    /// Right page.
    /// </summary>
    Right = 1,

    /// <summary>
    /// Foldout page.
    /// </summary>
    Foldout = 2,

    /// <summary>
    /// Single page.
    /// </summary>
    Single = 3,

    /// <summary>
    /// Cover page.
    /// </summary>
    Cover = 4
}
}
}

View File

@@ -0,0 +1,106 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// Encapsulates width/height and position data.
/// <para>Each value is paired with an <c>...Specified</c> flag which is switched on the first
/// time a value other than NaN is assigned and tells the serializer whether to write the attribute.</para>
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
public abstract class AltoPositionedElement
{
    private float height;
    private float width;
    private float horizontalPosition;
    private float verticalPosition;

    /// <summary>
    /// Height. Serialized as the <c>HEIGHT</c> attribute.
    /// </summary>
    [XmlAttribute("HEIGHT")]
    public float Height
    {
        get => height;
        set
        {
            height = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                HeightSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Height"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool HeightSpecified { get; private set; }

    /// <summary>
    /// Width. Serialized as the <c>WIDTH</c> attribute.
    /// </summary>
    [XmlAttribute("WIDTH")]
    public float Width
    {
        get => width;
        set
        {
            width = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                WidthSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="Width"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool WidthSpecified { get; private set; }

    /// <summary>
    /// Horizontal position. Serialized as the <c>HPOS</c> attribute.
    /// </summary>
    [XmlAttribute("HPOS")]
    public float HorizontalPosition
    {
        get => horizontalPosition;
        set
        {
            horizontalPosition = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                HorizontalPositionSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="HorizontalPosition"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool HorizontalPositionSpecified { get; private set; }

    /// <summary>
    /// Vertical position. Serialized as the <c>VPOS</c> attribute.
    /// </summary>
    [XmlAttribute("VPOS")]
    public float VerticalPosition
    {
        get => verticalPosition;
        set
        {
            verticalPosition = value;

            // The flag is only ever switched on; assigning NaN leaves it as it was.
            if (!float.IsNaN(value))
            {
                VerticalPositionSpecified = true;
            }
        }
    }

    /// <summary>
    /// Whether to include <see cref="VerticalPosition"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool VerticalPositionSpecified { get; private set; }
}
}
}

View File

@@ -0,0 +1,46 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Classification of the category of operation, how the file was created, including generation, modification,
/// preprocessing, postprocessing or any other steps.
/// This is a flags enumeration, so several categories can be combined in one value.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Flags]
[Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoProcessingCategory
{
    /// <summary>
    /// Content generation. Serialized as <c>contentGeneration</c>.
    /// </summary>
    [XmlEnum("contentGeneration")]
    ContentGeneration = 1 << 0,

    /// <summary>
    /// Content modification. Serialized as <c>contentModification</c>.
    /// </summary>
    [XmlEnum("contentModification")]
    ContentModification = 1 << 1,

    /// <summary>
    /// Pre-operation. Serialized as <c>preOperation</c>.
    /// </summary>
    [XmlEnum("preOperation")]
    PreOperation = 1 << 2,

    /// <summary>
    /// Post-operation. Serialized as <c>postOperation</c>.
    /// </summary>
    [XmlEnum("postOperation")]
    PostOperation = 1 << 3,

    /// <summary>
    /// Other. Serialized as <c>other</c>.
    /// </summary>
    [XmlEnum("other")]
    Other = 1 << 4
}
}
}

View File

@@ -0,0 +1,49 @@
namespace UglyToad.PdfPig.Export.Alto
{
using System;
using System.ComponentModel;
using System.Diagnostics;
using System.Xml.Serialization;
public partial class AltoDocument
{
/// <summary>
/// [Alto] Information about a software application. Where applicable, the preferred method
/// for determining this information is by selecting Help -- About.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoProcessingSoftware
{
/// <summary>
/// The name of the organization or company that created the application.
/// Serialized as the <c>softwareCreator</c> attribute.
/// </summary>
[XmlAttribute("softwareCreator")]
public string SoftwareCreator { get; set; }
/// <summary>
/// The name of the application.
/// Serialized as the <c>softwareName</c> attribute.
/// </summary>
[XmlAttribute("softwareName")]
public string SoftwareName { get; set; }
/// <summary>
/// The version of the application.
/// Serialized as the <c>softwareVersion</c> attribute.
/// </summary>
[XmlAttribute("softwareVersion")]
public string SoftwareVersion { get; set; }
/// <summary>
/// A description of any important characteristics of the application, especially for
/// non-commercial applications. For example, if a non-commercial application is built
/// using commercial components, e.g., an OCR engine SDK, those components should be mentioned here.
/// Serialized as the <c>applicationDescription</c> attribute.
/// </summary>
[XmlAttribute("applicationDescription")]
public string ApplicationDescription { get; set; }
}
}
}

View File

@@ -0,0 +1,71 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Description of the processing step.
        /// </summary>
        /// <remarks>
        /// The ALTO schema declares every member of a processing step as a child element of a
        /// sequence, in the same order as the properties below. They are therefore all mapped with
        /// <see cref="XmlElementAttribute"/>; mapping them as XML attributes produces documents
        /// that do not validate against the schema.
        /// </remarks>
        [EditorBrowsable(EditorBrowsableState.Never)]
        [Serializable]
        [DebuggerStepThrough]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoProcessingStep
        {
            private AltoProcessingCategory processingCategory;

            /// <summary>
            /// Classification of the category of operation, how the file was created, including
            /// generation, modification, preprocessing, postprocessing or any other steps.
            /// Assigning a value marks the element for output via <see cref="ProcessingCategorySpecified"/>.
            /// </summary>
            [XmlElement("processingCategory")]
            public AltoProcessingCategory ProcessingCategory
            {
                get => processingCategory;
                set
                {
                    processingCategory = value;
                    ProcessingCategorySpecified = true;
                }
            }

            /// <summary>
            /// Whether to include <see cref="ProcessingCategory"/> in the output.
            /// Becomes <c>true</c> once <see cref="ProcessingCategory"/> has been assigned.
            /// </summary>
            [XmlIgnore]
            public bool ProcessingCategorySpecified { get; private set; }

            /// <summary>
            /// Date or DateTime the image was processed.
            /// </summary>
            [XmlElement("processingDateTime")]
            public string ProcessingDateTime { get; set; }

            /// <summary>
            /// Identifies the organization level producer(s) of the processed image.
            /// </summary>
            [XmlElement("processingAgency")]
            public string ProcessingAgency { get; set; }

            /// <summary>
            /// An ordinal listing of the image processing steps performed. For example, "image despeckling."
            /// </summary>
            [XmlElement("processingStepDescription")]
            public string[] ProcessingStepDescription { get; set; }

            /// <summary>
            /// A description of any setting of the processing application. For example, for a multi-engine
            /// OCR application this might include the engines which were used. Ideally, this description
            /// should be adequate so that someone else using the same application can produce identical results.
            /// </summary>
            [XmlElement("processingStepSettings")]
            public string ProcessingStepSettings { get; set; }

            /// <summary>
            /// The software application that performed this processing step.
            /// </summary>
            [XmlElement("processingSoftware")]
            public AltoProcessingSoftware ProcessingSoftware { get; set; }
        }
    }
}

View File

@@ -0,0 +1,36 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Brief indication of the quality of the original page.
        /// Members without an <see cref="XmlEnumAttribute"/> are serialized under their own name.
        /// </summary>
        [Serializable]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        [EditorBrowsable(EditorBrowsableState.Never)]
        public enum AltoQuality
        {
            /// <summary>
            /// Serialized as "OK".
            /// </summary>
            // ReSharper disable once InconsistentNaming
            OK,

            /// <summary>
            /// Serialized as "Missing".
            /// </summary>
            Missing,

            /// <summary>
            /// Serialized as "Missing in original".
            /// </summary>
            [XmlEnum("Missing in original")]
            MissingInOriginal,

            /// <summary>
            /// Serialized as "Damaged".
            /// </summary>
            Damaged,

            /// <summary>
            /// Serialized as "Retained".
            /// </summary>
            Retained,

            /// <summary>
            /// Serialized as "Target".
            /// </summary>
            Target,

            /// <summary>
            /// Serialized as "As in original".
            /// </summary>
            [XmlEnum("As in original")]
            AsInOriginal,
        }
    }
}

View File

@@ -0,0 +1,25 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] A white space. Position members come from <see cref="AltoPositionedElement"/>.
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        // ReSharper disable once InconsistentNaming
        public class AltoSP : AltoPositionedElement
        {
            /// <summary>
            /// Identifier of this white space, written as the <c>ID</c> attribute (XML Schema type <c>ID</c>).
            /// </summary>
            [XmlAttribute("ID", DataType = "ID")]
            public string Id { get; set; }
        }
    }
}

View File

@@ -0,0 +1,26 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] The bounding shape of a block, used when that shape is not rectangular.
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoShape
        {
            /// <summary>
            /// The concrete shape. The runtime type selects the element name that is written:
            /// <see cref="AltoCircle"/> as <c>Circle</c>, <see cref="AltoEllipse"/> as <c>Ellipse</c>
            /// and <see cref="AltoPolygon"/> as <c>Polygon</c>.
            /// </summary>
            [XmlElement("Polygon", typeof(AltoPolygon))]
            [XmlElement("Ellipse", typeof(AltoEllipse))]
            [XmlElement("Circle", typeof(AltoCircle))]
            public object Item { get; set; }
        }
    }
}

View File

@@ -0,0 +1,32 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Identifies the image file from which the OCR text was created.
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoSourceImageInformation
        {
            /// <summary>
            /// Name of the source file, written as the <c>fileName</c> element.
            /// </summary>
            [XmlElement("fileName")]
            public string FileName { get; set; }

            /// <summary>
            /// Identifiers of the source file, written as repeated <c>fileIdentifier</c> elements.
            /// </summary>
            [XmlElement("fileIdentifier")]
            public AltoFileIdentifier[] FileIdentifiers { get; set; }

            /// <summary>
            /// Identifiers of the source document, written as repeated <c>documentIdentifier</c> elements.
            /// </summary>
            [XmlElement("documentIdentifier")]
            public AltoDocumentIdentifier[] DocumentIdentifiers { get; set; }
        }
    }
}

View File

@@ -0,0 +1,148 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] A sequence of chars. Strings are separated by white spaces or hyphenation chars.
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never)]
        [Serializable]
        [DebuggerStepThrough]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoString : AltoPositionedElement
        {
            private AltoFontStyles style;
            private AltoSubsType subsType;
            private float wc;
            private bool correctionStatus;

            /// <summary>
            /// Optional shape of the string, written as the <c>Shape</c> element.
            /// </summary>
            public AltoShape Shape { get; set; }

            /// <summary>
            /// Alternatives for this string, written as repeated <c>ALTERNATIVE</c> elements.
            /// </summary>
            [XmlElement("ALTERNATIVE")]
            public AltoAlternative[] Alternative { get; set; }

            /// <summary>
            /// Glyphs of this string, written as repeated <c>Glyph</c> elements.
            /// </summary>
            [XmlElement("Glyph")]
            public AltoGlyph[] Glyph { get; set; }

            /// <summary>
            /// Identifier of the string, written as the <c>ID</c> attribute.
            /// </summary>
            [XmlAttribute("ID", DataType = "ID")]
            public string Id { get; set; }

            /// <summary>
            /// Referenced style ids, written as the <c>STYLEREFS</c> attribute (type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("STYLEREFS", DataType = "IDREFS")]
            public string StyleRefs { get; set; }

            /// <summary>
            /// Referenced tag ids, written as the <c>TAGREFS</c> attribute (type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("TAGREFS", DataType = "IDREFS")]
            public string TagRefs { get; set; }

            /// <summary>
            /// Referenced processing ids, written as the <c>PROCESSINGREFS</c> attribute (type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
            public string ProcessingRefs { get; set; }

            /// <summary>
            /// The text of the string, written as the <c>CONTENT</c> attribute.
            /// </summary>
            [XmlAttribute("CONTENT")]
            public string Content { get; set; }

            /// <summary>
            /// Font style of the string, written as the <c>STYLE</c> attribute.
            /// Assigning a value sets <see cref="StyleSpecified"/>.
            /// </summary>
            [XmlAttribute("STYLE")]
            public AltoFontStyles Style
            {
                get => style;
                set
                {
                    style = value;
                    StyleSpecified = true;
                }
            }

            /// <summary>
            /// Whether to include <see cref="Style"/> in the output.
            /// </summary>
            [XmlIgnore]
            public bool StyleSpecified { get; private set; }

            /// <summary>
            /// Type of the substitution, written as the <c>SUBS_TYPE</c> attribute.
            /// Assigning a value sets <see cref="SubsTypeSpecified"/>.
            /// </summary>
            [XmlAttribute("SUBS_TYPE")]
            public AltoSubsType SubsType
            {
                get => subsType;
                set
                {
                    subsType = value;
                    SubsTypeSpecified = true;
                }
            }

            /// <summary>
            /// Whether to include <see cref="SubsType"/> in the output.
            /// </summary>
            [XmlIgnore]
            public bool SubsTypeSpecified { get; private set; }

            /// <summary>
            /// Content of the substitution.
            /// </summary>
            [XmlAttribute("SUBS_CONTENT")]
            public string SubsContent { get; set; }

            /// <summary>
            /// Value written as the <c>WC</c> attribute.
            /// Assigning <see cref="float.NaN"/> means "not available": the attribute is then left out of the output.
            /// </summary>
            [XmlAttribute("WC")]
            public float Wc
            {
                get => wc;
                set
                {
                    wc = value;

                    // Track the latest assignment so that a later NaN un-specifies the attribute
                    // instead of leaving a stale flag that would serialize "NaN".
                    WcSpecified = !float.IsNaN(value);
                }
            }

            /// <summary>
            /// Whether to include <see cref="Wc"/> in the output.
            /// <c>true</c> only while the last value assigned to <see cref="Wc"/> was not NaN.
            /// </summary>
            [XmlIgnore]
            public bool WcSpecified { get; private set; }

            /// <summary>
            /// Confidence level of each character in that string. A list of numbers,
            /// one number between 0 (sure) and 9 (unsure) for each character.
            /// </summary>
            [XmlAttribute("CC")]
            public string Cc { get; set; }

            /// <summary>
            /// Correction Status. Indicates whether manual correction has been done or not.
            /// The correction status should be recorded at the highest level possible (Block, TextLine, String).
            /// </summary>
            [XmlAttribute("CS")]
            public bool CorrectionStatus
            {
                get => correctionStatus;
                set
                {
                    correctionStatus = value;
                    CorrectionStatusSpecified = true;
                }
            }

            /// <summary>
            /// Whether to include <see cref="CorrectionStatus"/> in the output.
            /// </summary>
            [XmlIgnore]
            public bool CorrectionStatusSpecified { get; private set; }

            /// <summary>
            /// Attribute to record language of the string. The language should be recorded at the highest level possible.
            /// </summary>
            [XmlAttribute("LANG", DataType = "language")]
            public string Language { get; set; }

            /// <summary>
            /// Returns <see cref="Content"/>, or an empty string when no content has been set.
            /// </summary>
            public override string ToString()
            {
                // ToString should never hand back null to callers.
                return Content ?? string.Empty;
            }
        }
    }
}

View File

@@ -0,0 +1,32 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Container for the text and paragraph styles of a document.
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoStyles
        {
            /// <summary>
            /// The text styles, written as repeated <c>TextStyle</c> elements.
            /// </summary>
            [XmlElement("TextStyle")]
            public AltoTextStyle[] TextStyle { get; set; }

            /// <summary>
            /// The paragraph styles, written as repeated <c>ParagraphStyle</c> elements.
            /// </summary>
            [XmlElement("ParagraphStyle")]
            public AltoParagraphStyle[] ParagraphStyle { get; set; }
        }
    }
}

View File

@@ -0,0 +1,25 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Type of the substitution (if any). Members are serialized under their own name.
        /// </summary>
        [Serializable]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        [EditorBrowsable(EditorBrowsableState.Never)]
        public enum AltoSubsType
        {
            /// <summary>
            /// Serialized as "HypPart1" (presumably the first part of a hyphenated word; confirm against the ALTO schema).
            /// </summary>
            HypPart1,

            /// <summary>
            /// Serialized as "HypPart2" (presumably the second part of a hyphenated word; confirm against the ALTO schema).
            /// </summary>
            HypPart2,

            /// <summary>
            /// Serialized as "Abbreviation".
            /// </summary>
            Abbreviation,
        }
    }
}

View File

@@ -0,0 +1,60 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] A single tag: an identifier, descriptive attributes and an optional
        /// wrapper holding XML encoded metadata.
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoTag
        {
            /// <summary>
            /// The xml data wrapper element XmlData is used to contain XML encoded metadata.
            /// The content of an XmlData element can be in any namespace or in no namespace.
            /// As permitted by the XML Schema Standard, the processContents attribute value for the
            /// metadata in an XmlData is set to "lax". Therefore, if the source schema and its location are
            /// identified by means of an XML schemaLocation attribute, then an XML processor will validate
            /// the elements for which it can find declarations. If a source schema is not identified, or cannot be
            /// found at the specified schemaLocation, then an XML validator will check for well-formedness,
            /// but otherwise skip over the elements appearing in the XmlData element.
            /// </summary>
            public AltoTagXmlData XmlData { get; set; }

            /// <summary>
            /// Identifier of the tag, written as the <c>ID</c> attribute (XML Schema type <c>ID</c>).
            /// </summary>
            [XmlAttribute("ID", DataType = "ID")]
            public string Id { get; set; }

            /// <summary>
            /// Classifies and groups the information within each tag element type.
            /// </summary>
            [XmlAttribute("TYPE")]
            public string Type { get; set; }

            /// <summary>
            /// The content / information value of the tag.
            /// </summary>
            [XmlAttribute("LABEL")]
            public string Label { get; set; }

            /// <summary>
            /// Free text describing the tag information, for clarification.
            /// </summary>
            [XmlAttribute("DESCRIPTION")]
            public string Description { get; set; }

            /// <summary>
            /// Any URI pointing to authority or description relevant information.
            /// </summary>
            [XmlAttribute("URI", DataType = "anyURI")]
            public string Uri { get; set; }
        }
    }
}

View File

@@ -0,0 +1,32 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] The xml data wrapper element XmlData is used to contain XML encoded metadata.
        /// <para>
        /// The content of an XmlData element can be in any namespace or in no namespace.
        /// As permitted by the XML Schema Standard, the processContents attribute value for the
        /// metadata in an XmlData is set to "lax". Therefore, if the source schema and its location are
        /// identified by means of an XML schemaLocation attribute, then an XML processor will validate
        /// the elements for which it can find declarations. If a source schema is not identified, or cannot be
        /// found at the specified schemaLocation, then an XML validator will check for well-formedness,
        /// but otherwise skip over the elements appearing in the XmlData element.
        /// </para>
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoTagXmlData
        {
            /// <summary>
            /// The wrapped XML elements; mapped with <see cref="XmlAnyElementAttribute"/> so any element is accepted.
            /// </summary>
            [XmlAnyElement]
            public XmlElement[] Any { get; set; }
        }
    }
}

View File

@@ -0,0 +1,39 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Container for tags. The following tag types are available:
        /// <list type="bullet">
        /// <item><description>LayoutTag: criteria about arrangement or graphical appearance;</description></item>
        /// <item><description>StructureTag: criteria about grouping or formation;</description></item>
        /// <item><description>RoleTag: criteria about function or mission;</description></item>
        /// <item><description>NamedEntityTag: criteria about assignment of terms to their relationship / meaning (NER);</description></item>
        /// <item><description>OtherTag: criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those.</description></item>
        /// </list>
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoTags
        {
            /// <summary>
            /// The tags. The element name used for each entry is chosen through the
            /// choice identifier <see cref="ItemsElementName"/>.
            /// </summary>
            [XmlChoiceIdentifier("ItemsElementName")]
            [XmlElement("LayoutTag", typeof(AltoTag))]
            [XmlElement("StructureTag", typeof(AltoTag))]
            [XmlElement("RoleTag", typeof(AltoTag))]
            [XmlElement("NamedEntityTag", typeof(AltoTag))]
            [XmlElement("OtherTag", typeof(AltoTag))]
            public AltoTag[] Items { get; set; }

            /// <summary>
            /// Choice identifier for <see cref="Items"/>; not written to the output itself.
            /// </summary>
            [XmlElement("ItemsElementName")]
            [XmlIgnore]
            public AltoItemsChoice[] ItemsElementName { get; set; }
        }
    }
}

View File

@@ -0,0 +1,43 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] A block of text.
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never)]
        [Serializable]
        [DebuggerStepThrough]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoTextBlock : AltoBlock
        {
            /// <summary>
            /// The lines of the block, written as repeated <c>TextLine</c> elements.
            /// </summary>
            [XmlElement("TextLine")]
            public AltoTextBlockTextLine[] TextLines { get; set; }

            /// <summary>
            /// Attribute deprecated. LANG should be used instead.
            /// </summary>
            [XmlAttribute("language", DataType = "language")]
            public string Language { get; set; }

            /// <summary>
            /// Attribute to record language of the textblock.
            /// </summary>
            [XmlAttribute("LANG", DataType = "language")]
            public string Lang { get; set; }

            /// <summary>
            /// Returns the text lines joined by a single space,
            /// or an empty string when <see cref="TextLines"/> has not been set.
            /// </summary>
            public override string ToString()
            {
                // string.Join throws on a null sequence; ToString must not throw.
                if (TextLines == null)
                {
                    return string.Empty;
                }

                return string.Join<AltoTextBlockTextLine>(" ", TextLines);
            }
        }
    }
}

View File

@@ -0,0 +1,101 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] A single line of text.
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never)]
        [Serializable]
        [DebuggerStepThrough]
        [XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoTextBlockTextLine : AltoPositionedElement
        {
            private float baseline;
            private bool correctionStatus;

            /// <summary>
            /// Optional shape of the line, written as the <c>Shape</c> element.
            /// </summary>
            public AltoShape Shape { get; set; }

            /// <summary>
            /// The strings of the line, written as repeated <c>String</c> elements.
            /// </summary>
            [XmlElement("String")]
            public AltoString[] Strings { get; set; }

            /// <summary>
            /// The white spaces of the line, written as repeated <c>SP</c> elements.
            /// </summary>
            [XmlElement("SP")]
            public AltoSP[] Sp { get; set; }

            /// <summary>
            /// A hyphenation char. Can appear only at the end of a line.
            /// </summary>
            [XmlElement("HYP")]
            public AltoTextBlockTextLineHyp Hyp { get; set; }

            /// <summary>
            /// Identifier of the line, written as the <c>ID</c> attribute.
            /// </summary>
            [XmlAttribute("ID", DataType = "ID")]
            public string Id { get; set; }

            /// <summary>
            /// Referenced style ids, written as the <c>STYLEREFS</c> attribute (type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("STYLEREFS", DataType = "IDREFS")]
            public string StyleRefs { get; set; }

            /// <summary>
            /// Referenced tag ids, written as the <c>TAGREFS</c> attribute (type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("TAGREFS", DataType = "IDREFS")]
            public string TagRefs { get; set; }

            /// <summary>
            /// Referenced processing ids, written as the <c>PROCESSINGREFS</c> attribute (type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
            public string ProcessingRefs { get; set; }

            /// <summary>
            /// Value written as the <c>BASELINE</c> attribute.
            /// Assigning <see cref="float.NaN"/> means "not available": the attribute is then left out of the output.
            /// </summary>
            [XmlAttribute("BASELINE")]
            public float BaseLine
            {
                get => baseline;
                set
                {
                    baseline = value;

                    // Track the latest assignment so that a later NaN un-specifies the attribute
                    // instead of leaving a stale flag that would serialize "NaN".
                    BaseLineSpecified = !float.IsNaN(value);
                }
            }

            /// <summary>
            /// Whether to include <see cref="BaseLine"/> in the output.
            /// <c>true</c> only while the last value assigned to <see cref="BaseLine"/> was not NaN.
            /// </summary>
            [XmlIgnore]
            public bool BaseLineSpecified { get; private set; }

            /// <summary>
            /// Language of the line, written as the <c>LANG</c> attribute.
            /// </summary>
            [XmlAttribute("LANG", DataType = "language")]
            public string Language { get; set; }

            /// <summary>
            /// Correction Status. Indicates whether manual correction has been done or not.
            /// The correction status should be recorded at the highest level possible (Block, TextLine, String).
            /// </summary>
            [XmlAttribute("CS")]
            public bool CorrectionStatus
            {
                get => correctionStatus;
                set
                {
                    correctionStatus = value;
                    CorrectionStatusSpecified = true;
                }
            }

            /// <summary>
            /// Whether to include <see cref="CorrectionStatus"/> in the output.
            /// </summary>
            [XmlIgnore]
            public bool CorrectionStatusSpecified { get; private set; }

            /// <summary>
            /// Returns the strings of the line joined by a single space, in array order,
            /// or an empty string when <see cref="Strings"/> has not been set.
            /// </summary>
            public override string ToString()
            {
                // string.Join throws on a null sequence; ToString must not throw.
                if (Strings == null)
                {
                    return string.Empty;
                }

                return string.Join<AltoString>(" ", Strings); // take in account order?
            }
        }
    }
}

View File

@@ -0,0 +1,26 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] A hyphenation char, which can appear only at the end of a line.
        /// Position members come from <see cref="AltoPositionedElement"/>.
        /// </summary>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoTextBlockTextLineHyp : AltoPositionedElement
        {
            /// <summary>
            /// The content of the hyphenation char, written as the <c>CONTENT</c> attribute.
            /// </summary>
            [XmlAttribute("CONTENT")]
            public string Content { get; set; }
        }
    }
}

View File

@@ -0,0 +1,96 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] A text style: the font properties applied to text.
        /// </summary>
        /// <remarks>
        /// Each optional enum attribute has a companion <c>...Specified</c> flag which is raised
        /// when the attribute is assigned and which decides whether it is written to the output.
        /// </remarks>
        [Serializable, DebuggerStepThrough]
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoTextStyle
        {
            private AltoFontType fontType;
            private AltoFontWidth fontWidth;
            private AltoFontStyles fontStyle;

            /// <summary>
            /// Identifier of the style, written as the <c>ID</c> attribute (XML Schema type <c>ID</c>).
            /// </summary>
            [XmlAttribute("ID", DataType = "ID")]
            public string Id { get; set; }

            /// <summary>
            /// Name of the font.
            /// </summary>
            [XmlAttribute("FONTFAMILY")]
            public string FontFamily { get; set; }

            /// <summary>
            /// Type of the font, written as the <c>FONTTYPE</c> attribute.
            /// </summary>
            [XmlAttribute("FONTTYPE")]
            public AltoFontType FontType
            {
                get { return fontType; }
                set
                {
                    FontTypeSpecified = true;
                    fontType = value;
                }
            }

            /// <summary>
            /// Whether to include <see cref="FontType"/> in the output.
            /// </summary>
            [XmlIgnore]
            public bool FontTypeSpecified { get; private set; }

            /// <summary>
            /// Width of the font, written as the <c>FONTWIDTH</c> attribute.
            /// </summary>
            [XmlAttribute("FONTWIDTH")]
            public AltoFontWidth FontWidth
            {
                get { return fontWidth; }
                set
                {
                    FontWidthSpecified = true;
                    fontWidth = value;
                }
            }

            /// <summary>
            /// Whether to include <see cref="FontWidth"/> in the output.
            /// </summary>
            [XmlIgnore]
            public bool FontWidthSpecified { get; private set; }

            /// <summary>
            /// Size of the font, in points (1/72 of an inch).
            /// </summary>
            [XmlAttribute("FONTSIZE")]
            public float FontSize { get; set; }

            /// <summary>
            /// Color of the font as an RGB value, written as <c>hexBinary</c>.
            /// </summary>
            [XmlAttribute("FONTCOLOR", DataType = "hexBinary")]
            public byte[] FontColor { get; set; }

            /// <summary>
            /// Style of the font, written as the <c>FONTSTYLE</c> attribute.
            /// </summary>
            [XmlAttribute("FONTSTYLE")]
            public AltoFontStyles FontStyle
            {
                get { return fontStyle; }
                set
                {
                    FontStyleSpecified = true;
                    fontStyle = value;
                }
            }

            /// <summary>
            /// Whether to include <see cref="FontStyle"/> in the output.
            /// </summary>
            [XmlIgnore]
            public bool FontStyleSpecified { get; private set; }
        }
    }
}

View File

@@ -0,0 +1,56 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Alternative (combined) character for the glyph, outlined by OCR engine or similar recognition processes.
        /// In case the variant are two (combining) characters, two characters are outlined in one Variant element.
        /// E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn".
        /// Details for different use-cases see on the samples on GitHub.
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never)]
        [Serializable]
        [DebuggerStepThrough]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoVariant
        {
            private float vcField;

            /// <summary>
            /// Each Variant represents an option for the glyph that the OCR software detected as possible alternatives.
            /// In case the variant are two (combining) characters, two characters are outlined in one Variant element.
            /// E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn".
            ///
            /// <para>Details for different use-cases see on the samples on GitHub.</para>
            /// </summary>
            [XmlAttribute("CONTENT")]
            public string Content { get; set; }

            /// <summary>
            /// This VC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence
            /// for the variant where 1 is certain.
            /// This attribute is optional. If it is not available, the default value for the variant is "0".
            /// The VC attribute semantic is the same as the GC attribute on the Glyph element.
            /// <para>
            /// Assigning <see cref="float.NaN"/> means "not available": the attribute is then left out of the output.
            /// </para>
            /// </summary>
            [XmlAttribute("VC")]
            public float Vc
            {
                get => vcField;
                set
                {
                    vcField = value;

                    // Track the latest assignment so that a later NaN un-specifies the attribute
                    // instead of leaving a stale flag that would serialize "NaN".
                    VcSpecified = !float.IsNaN(value);
                }
            }

            /// <summary>
            /// Whether to include <see cref="Vc"/> in the output.
            /// <c>true</c> only while the last value assigned to <see cref="Vc"/> was not NaN.
            /// </summary>
            [XmlIgnore]
            public bool VcSpecified { get; private set; }
        }
    }
}

View File

@@ -0,0 +1,51 @@
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    /// <summary>
    /// [Alto] Root of the Alto schema, serialized as the <c>alto</c> element.
    /// <para>Version 4.1</para>
    /// See https://github.com/altoxml/schema
    /// </summary>
    [Serializable, DebuggerStepThrough]
    [EditorBrowsable(EditorBrowsableState.Never)]
    [XmlRoot("alto", Namespace = "http://www.loc.gov/standards/alto/ns-v4#", IsNullable = false)]
    [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
    public partial class AltoDocument
    {
        /// <summary>
        /// General settings of the alto file, such as measurement units and metadata.
        /// </summary>
        public AltoDescription Description { get; set; }

        /// <summary>
        /// Styles define properties of layout elements. A style defined on a parent element
        /// acts as the default style for all of its children.
        /// </summary>
        public AltoStyles Styles { get; set; }

        /// <summary>
        /// Tags define properties of additional characteristics. They are referenced from the
        /// related Block or String content element through the TAGREF attribute and the tag ID.
        ///
        /// This container holds the individual LayoutTag, StructureTag, RoleTag,
        /// NamedEntityTag and OtherTag elements.
        /// </summary>
        public AltoTags Tags { get; set; }

        /// <summary>
        /// The root element of the layout.
        /// </summary>
        public AltoLayout Layout { get; set; }

        /// <summary>
        /// Schema version of the ALTO file, written as the <c>SCHEMAVERSION</c> attribute.
        /// </summary>
        [XmlAttribute("SCHEMAVERSION")]
        public string SchemaVersion { get; set; }
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -126,6 +126,25 @@
BottomRight = bottomRight; BottomRight = bottomRight;
} }
/// <summary>
/// Determines whether this rectangle and <paramref name="rectangle"/> overlap.
/// Only strict inequalities rule an overlap out, so rectangles whose edges
/// coincide are reported as intersecting.
/// </summary>
/// <param name="rectangle">The rectangle to test against this one.</param>
/// <returns>
/// <c>false</c> when one rectangle lies entirely to the side of, above or below the other;
/// otherwise <c>true</c>.
/// </returns>
public bool IntersectsWith(PdfRectangle rectangle)
{
    // Separated horizontally (first two checks) or vertically (last two checks).
    var isSeparated = Left > rectangle.Right
                      || rectangle.Left > Right
                      || Top < rectangle.Bottom
                      || rectangle.Top < Bottom;

    return !isSeparated;
}
/// <summary> /// <summary>
/// To string override. /// To string override.
/// </summary> /// </summary>