mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-20 20:07:57 +08:00
merge pull request #72 from uglytoad/fix-export-formatting
fix export formatting
This commit is contained in:
@@ -4,6 +4,11 @@
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Xml;
|
||||
using System.Xml.Linq;
|
||||
using DocumentLayoutAnalysis;
|
||||
using Export;
|
||||
using Xunit;
|
||||
|
||||
public class PigProductionHandbookTests
|
||||
@@ -32,7 +37,7 @@
|
||||
var page = document.GetPage(1);
|
||||
|
||||
// Pinkish.
|
||||
var (r, g , b) = page.Letters[0].Color.ToRGBValues();
|
||||
var (r, g, b) = page.Letters[0].Color.ToRGBValues();
|
||||
|
||||
Assert.Equal(1, r);
|
||||
Assert.Equal(0.914m, g);
|
||||
@@ -98,7 +103,7 @@
|
||||
[Fact]
|
||||
public void Page4HasCorrectWords()
|
||||
{
|
||||
var expected = WordsPage4.Split(new[] {"\r", "\r\n", "\n", " "}, StringSplitOptions.RemoveEmptyEntries);
|
||||
var expected = WordsPage4.Split(new[] { "\r", "\r\n", "\n", " " }, StringSplitOptions.RemoveEmptyEntries);
|
||||
using (var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
|
||||
{
|
||||
var page = document.GetPage(4);
|
||||
@@ -129,6 +134,41 @@
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
public void CanExportAltoXmlFormat()
{
    // Export page 4 through the ALTO exporter and check that the produced text
    // can be loaded back as an XML document.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var altoExporter = new AltoXmlTextExporter(new NearestNeighbourWordExtractor(), new DocstrumBoundingBoxes());
        var altoXml = altoExporter.Get(pdf.GetPage(4), true);

        Assert.NotNull(altoXml);

        // Round-trip through UTF-8 bytes so the reader consumes a stream.
        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);

        using (var buffer = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(buffer))
        {
            var loaded = XDocument.Load(reader);

            Assert.NotNull(loaded);
        }
    }
}
|
||||
|
||||
[Fact]
public void CanExportAltoXmlFormatPage16()
{
    // Page 16 contains an unprintable string and a single line of text which causes problems for Docstrum.
    using (var pdf = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff))
    {
        var altoExporter = new AltoXmlTextExporter(new NearestNeighbourWordExtractor(), new DocstrumBoundingBoxes());
        var altoXml = altoExporter.Get(pdf.GetPage(16), true);

        Assert.NotNull(altoXml);

        // Round-trip through UTF-8 bytes so the reader consumes a stream.
        var utf8Bytes = Encoding.UTF8.GetBytes(altoXml);

        using (var buffer = new MemoryStream(utf8Bytes))
        using (var reader = new XmlTextReader(buffer))
        {
            var loaded = XDocument.Load(reader);

            Assert.NotNull(loaded);
        }
    }
}
|
||||
|
||||
[Fact]
|
||||
public void LettersHaveCorrectPosition()
|
||||
{
|
||||
|
@@ -77,7 +77,7 @@
|
||||
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
|
||||
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.Distances",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBB",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.DocstrumBoundingBoxes",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.DefaultPageSegmenter",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.IPageSegmenter",
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.MathExtensions",
|
||||
@@ -87,10 +87,11 @@
|
||||
"UglyToad.PdfPig.DocumentLayoutAnalysis.EdgeType",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
|
||||
"UglyToad.PdfPig.Exceptions.PdfDocumentFormatException",
|
||||
"UglyToad.PdfPig.Export.ITextExporter",
|
||||
"UglyToad.PdfPig.Export.AltoXmlTextExporter",
|
||||
"UglyToad.PdfPig.Export.HOcrTextExporter",
|
||||
"UglyToad.PdfPig.Export.ITextExporter",
|
||||
"UglyToad.PdfPig.Export.PageXmlTextExporter",
|
||||
"UglyToad.PdfPig.Export.Alto.AltoDocument",
|
||||
"UglyToad.PdfPig.Fonts.DescriptorFontFile",
|
||||
"UglyToad.PdfPig.Fonts.Exceptions.InvalidFontFormatException",
|
||||
"UglyToad.PdfPig.Fonts.FontDescriptor",
|
||||
|
@@ -11,6 +11,67 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// </summary>
|
||||
internal class ClusteringAlgorithms
|
||||
{
|
||||
/// <summary>
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
/// https://en.wikipedia.org/wiki/Transitive_closure
/// </summary>
/// <typeparam name="T">Letter, Word, TextLine, etc.</typeparam>
/// <param name="elements">List of elements to group.</param>
/// <param name="distMeasure">The distance measure between two points.</param>
/// <param name="maxDistanceFunction">The function that determines the maximum distance between two points in the same cluster.</param>
/// <param name="pivotPoint">The pivot's point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="candidatesPoint">The candidates' point to use for pairing, e.g. BottomLeft, TopLeft.</param>
/// <param name="filterPivot">Filter to apply to the pivot point. If false, point will not be paired at all, e.g. is white space.</param>
/// <param name="filterFinal">Filter to apply to both the pivot and the paired point. If false, point will not be paired at all, e.g. pivot and paired point have same font.</param>
/// <returns>The groups of element indexes built by <c>GroupIndexes</c> from the nearest-neighbour pairing.</returns>
internal static IEnumerable<HashSet<int>> SimpleTransitiveClosure<T>(List<T> elements,
    Func<PdfPoint, PdfPoint, double> distMeasure,
    Func<T, T, double> maxDistanceFunction,
    Func<T, PdfPoint> pivotPoint, Func<T, PdfPoint> candidatesPoint,
    Func<T, bool> filterPivot, Func<T, T, bool> filterFinal)
{
    /*************************************************************************************
     * Algorithm steps
     * 1. Find nearest neighbours indexes (done in parallel)
     *  Iterate every point (pivot) and put its nearest neighbour's index in an array
     *  e.g. if nearest neighbour of point i is point j, then indexes[i] = j.
     *  Only considers a neighbour if it is within the maximum distance.
     *  If not within the maximum distance, index will be set to -1.
     *  Each element has only one connected neighbour.
     *  NB: Given the possible asymmetry in the relationship, it is possible
     *  that if indexes[i] = j then indexes[j] != i.
     *
     * 2. Group indexes
     *  Group indexes if share neighbours in common - Transitive closure
     *  e.g. if we have indexes[i] = j, indexes[j] = k, indexes[m] = n and indexes[n] = -1
     *  (i,j,k) will form a group and (m,n) will form another group.
     *************************************************************************************/

    // Every element starts unpaired (-1).
    int[] indexes = Enumerable.Repeat(-1, elements.Count).ToArray();
    var candidatesPoints = elements.Select(candidatesPoint).ToList();

    // 1. Find nearest neighbours indexes
    // Each iteration writes only its own slot, indexes[e].
    Parallel.For(0, elements.Count, e =>
    {
        var pivot = elements[e];

        if (!filterPivot(pivot))
        {
            return;
        }

        int index = pivotPoint(pivot).FindIndexNearest(candidatesPoints, distMeasure, out double dist);

        // FindIndexNearest is defined elsewhere; if it reports no usable neighbour
        // (an index outside the list) leave this pivot unpaired instead of indexing blindly.
        if (index < 0 || index >= elements.Count)
        {
            return;
        }

        var paired = elements[index];

        if (filterFinal(pivot, paired) && dist < maxDistanceFunction(pivot, paired))
        {
            indexes[e] = index;
        }
    });

    // 2. Group indexes
    var groupedIndexes = GroupIndexes(indexes);

    return groupedIndexes;
}
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm to group elements via transitive closure, using nearest neighbours and maximum distance.
|
||||
/// https://en.wikipedia.org/wiki/Transitive_closure
|
||||
@@ -47,7 +108,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
*************************************************************************************/
|
||||
|
||||
int[] indexes = Enumerable.Repeat((int)-1, elements.Length).ToArray();
|
||||
var candidatesPoints = elements.Select(x => candidatesPoint(x)).ToList();
|
||||
var candidatesPoints = elements.Select(candidatesPoint).ToList();
|
||||
|
||||
// 1. Find nearest neighbours indexes
|
||||
Parallel.For(0, elements.Length, e =>
|
||||
|
@@ -9,19 +9,21 @@ using UglyToad.PdfPig.Util;
|
||||
|
||||
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
{
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// The Docstrum algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
|
||||
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
|
||||
/// clustering of connected components extracted from the document.
|
||||
/// This implementation leverages bounding boxes and does not exactly replicate the original algorithm.
|
||||
/// <para>See 'The document spectrum for page layout analysis.' by L. O’Gorman.</para>
|
||||
/// <para>See 'The document spectrum for page layout analysis.' by L. O'Gorman.</para>
|
||||
/// </summary>
|
||||
public class DocstrumBB : IPageSegmenter
|
||||
public class DocstrumBoundingBoxes : IPageSegmenter
|
||||
{
|
||||
/// <summary>
|
||||
/// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBB"/>.
|
||||
/// Create an instance of Docstrum for bounding boxes page segmenter, <see cref="DocstrumBoundingBoxes"/>.
|
||||
/// </summary>
|
||||
public static DocstrumBB Instance { get; } = new DocstrumBB();
|
||||
public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes();
|
||||
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// Get the blocks.
|
||||
/// <para>Uses wlAngleLB = -30, wlAngleUB = 30, blAngleLB = -135, blAngleUB = -45, blMulti = 1.3.</para>
|
||||
@@ -30,73 +32,108 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <returns></returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords)
|
||||
{
|
||||
return GetBlocks(pageWords, -30, 30, -135, -45, 1.3);
|
||||
return GetBlocks(pageWords, new AngleBounds(-30, 30), new AngleBounds(-135, -45), 1.3);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the blocks. See original paper for more information.
|
||||
/// </summary>
|
||||
/// <param name="pageWords"></param>
|
||||
/// <param name="wlAngleLB">Within-line lower bound angle.</param>
|
||||
/// <param name="wlAngleUB">Within-line upper bound angle.</param>
|
||||
/// <param name="blAngleLB">Between-line lower bound angle.</param>
|
||||
/// <param name="blAngleUB">Between-line upper bound angle.</param>
|
||||
/// <param name="blMultiplier">Multiplier that gives the maximum perpendicular distance between
|
||||
/// <param name="words">The words to segment into <see cref="TextBlock"/>s.</param>
|
||||
/// <param name="withinLine">Angle bounds for words to be considered on the same line.</param>
|
||||
/// <param name="betweenLine">Angle bounds for words to be considered on separate lines.</param>
|
||||
/// <param name="betweenLineMultiplier">Multiplier that gives the maximum perpendicular distance between
|
||||
/// text lines for blocking. Maximum distance will be this number times the between-line
|
||||
/// distance found by the analysis.</param>
|
||||
/// <returns></returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> pageWords, double wlAngleLB, double wlAngleUB,
|
||||
double blAngleLB, double blAngleUB, double blMultiplier)
|
||||
/// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method.</returns>
|
||||
public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, AngleBounds withinLine,
|
||||
AngleBounds betweenLine,
|
||||
double betweenLineMultiplier)
|
||||
{
|
||||
if (pageWords.Count() == 0) return EmptyArray<TextBlock>.Instance;
|
||||
if (words == null)
|
||||
{
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
var pageWordsArr = pageWords.Where(w => !string.IsNullOrWhiteSpace(w.Text)).ToArray(); // remove white spaces
|
||||
var wordsList = new List<Word>();
|
||||
|
||||
var withinLineDistList = new ConcurrentBag<double[]>();
|
||||
var betweenLineDistList = new ConcurrentBag<double[]>();
|
||||
foreach (var word in words)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(word.Text))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
wordsList.Add(word);
|
||||
}
|
||||
|
||||
if (wordsList.Count == 0)
|
||||
{
|
||||
return EmptyArray<TextBlock>.Instance;
|
||||
}
|
||||
|
||||
var withinLineDistList = new ConcurrentBag<double>();
|
||||
var betweenLineDistList = new ConcurrentBag<double>();
|
||||
|
||||
// 1. Estimate in line and between line spacing
|
||||
Parallel.For(0, pageWordsArr.Length, i =>
|
||||
Parallel.For(0, wordsList.Count, i =>
|
||||
{
|
||||
var word = pageWordsArr[i];
|
||||
var word = wordsList[i];
|
||||
|
||||
// Within-line distance
|
||||
var pointWL = GetNearestPointData(pageWordsArr, word,
|
||||
var pointsWithinLine = GetNearestPointDistance(wordsList, word,
|
||||
bb => bb.BottomRight, bb => bb.BottomRight,
|
||||
bb => bb.BottomLeft, bb => bb.BottomLeft,
|
||||
wlAngleLB, wlAngleUB, Distances.Horizontal);
|
||||
if (pointWL != null) withinLineDistList.Add(pointWL);
|
||||
withinLine, Distances.Horizontal);
|
||||
|
||||
if (pointsWithinLine != null)
|
||||
{
|
||||
withinLineDistList.Add(pointsWithinLine.Value);
|
||||
}
|
||||
|
||||
// Between-line distance
|
||||
var pointBL = GetNearestPointData(pageWordsArr, word,
|
||||
var pointsBetweenLine = GetNearestPointDistance(wordsList, word,
|
||||
bb => bb.BottomLeft, bb => bb.Centroid,
|
||||
bb => bb.TopLeft, bb => bb.Centroid,
|
||||
blAngleLB, blAngleUB, Distances.Vertical);
|
||||
if (pointBL != null) betweenLineDistList.Add(pointBL);
|
||||
betweenLine, Distances.Vertical);
|
||||
|
||||
if (pointsBetweenLine != null)
|
||||
{
|
||||
betweenLineDistList.Add(pointsBetweenLine.Value);
|
||||
}
|
||||
});
|
||||
|
||||
double withinLineDistance = GetPeakAverageDistance(withinLineDistList);
|
||||
double betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
|
||||
double? withinLineDistance = GetPeakAverageDistance(withinLineDistList);
|
||||
double? betweenLineDistance = GetPeakAverageDistance(betweenLineDistList);
|
||||
|
||||
if (withinLineDistance == null || betweenLineDistance == null)
|
||||
{
|
||||
return new[] {new TextBlock(new[] {new TextLine(wordsList)})};
|
||||
}
|
||||
|
||||
// 2. Find lines of text
|
||||
double maxDistWL = Math.Min(3 * withinLineDistance, Math.Sqrt(2) * betweenLineDistance);
|
||||
var lines = GetLines(pageWordsArr, maxDistWL, wlAngleLB, wlAngleUB).ToArray();
|
||||
double maxDistanceWithinLine = Math.Min(3 * withinLineDistance.Value, Math.Sqrt(2) * betweenLineDistance.Value);
|
||||
var lines = GetLines(wordsList, maxDistanceWithinLine, withinLine).ToArray();
|
||||
|
||||
// 3. Find blocks of text
|
||||
double maxDistBL = blMultiplier * betweenLineDistance;
|
||||
var blocks = GetLinesGroups(lines, maxDistBL).ToList();
|
||||
double maxDistanceBetweenLine = betweenLineMultiplier * betweenLineDistance.Value;
|
||||
var blocks = GetLinesGroups(lines, maxDistanceBetweenLine).ToList();
|
||||
|
||||
// 4. Merge overlapping blocks - might happen in certain conditions, e.g. justified text.
|
||||
for (int b = 0; b < blocks.Count; b++)
|
||||
for (var b = 0; b < blocks.Count; b++)
|
||||
{
|
||||
if (blocks[b] == null) continue;
|
||||
|
||||
for (int c = 0; c < blocks.Count; c++)
|
||||
if (blocks[b] == null)
|
||||
{
|
||||
if (b == c) continue;
|
||||
if (blocks[c] == null) continue;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (AreRectangleOverlapping(blocks[b].BoundingBox, blocks[c].BoundingBox))
|
||||
for (var c = 0; c < blocks.Count; c++)
|
||||
{
|
||||
if (b == c || blocks[c] == null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (blocks[b].BoundingBox.IntersectsWith(blocks[c].BoundingBox))
|
||||
{
|
||||
// Merge
|
||||
// 1. Merge all words
|
||||
@@ -105,7 +142,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
|
||||
// 2. Rebuild lines, using max distance = +Inf as we know all words will be in the
|
||||
// same block. Filtering will still be done based on angle.
|
||||
var mergedLines = GetLines(mergedWords.ToArray(), double.MaxValue, wlAngleLB, wlAngleUB);
|
||||
var mergedLines = GetLines(mergedWords, double.MaxValue, withinLine);
|
||||
blocks[b] = new TextBlock(mergedLines.ToList());
|
||||
|
||||
// Remove
|
||||
@@ -117,70 +154,57 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
return blocks.Where(b => b != null).ToList();
|
||||
}
|
||||
|
||||
private bool AreRectangleOverlapping(PdfRectangle rectangle1, PdfRectangle rectangle2)
|
||||
{
|
||||
if (rectangle1.Left > rectangle2.Right || rectangle2.Left > rectangle1.Right) return false;
|
||||
if (rectangle1.Top < rectangle2.Bottom || rectangle2.Top < rectangle1.Bottom) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get information on the nearest point, filtered for angle.
|
||||
/// </summary>
|
||||
/// <param name="words"></param>
|
||||
/// <param name="pivot"></param>
|
||||
/// <param name="funcPivotDist"></param>
|
||||
/// <param name="funcPivotAngle"></param>
|
||||
/// <param name="funcPointsDist"></param>
|
||||
/// <param name="funcPointsAngle"></param>
|
||||
/// <param name="angleStart"></param>
|
||||
/// <param name="angleEnd"></param>
|
||||
/// <param name="finalDistMEasure"></param>
|
||||
/// <returns></returns>
|
||||
private double[] GetNearestPointData(Word[] words, Word pivot, Func<PdfRectangle,
|
||||
private double? GetNearestPointDistance(List<Word> words, Word pivot, Func<PdfRectangle,
|
||||
PdfPoint> funcPivotDist, Func<PdfRectangle, PdfPoint> funcPivotAngle,
|
||||
Func<PdfRectangle, PdfPoint> funcPointsDist, Func<PdfRectangle, PdfPoint> funcPointsAngle,
|
||||
double angleStart, double angleEnd,
|
||||
Func<PdfPoint, PdfPoint, double> finalDistMEasure)
|
||||
AngleBounds angleBounds,
|
||||
Func<PdfPoint, PdfPoint, double> finalDistanceMeasure)
|
||||
{
|
||||
var pointR = funcPivotDist(pivot.BoundingBox);
|
||||
|
||||
// Filter by angle
|
||||
var filtered = words.Where(w =>
|
||||
{
|
||||
var angleWL = Distances.Angle(funcPivotAngle(pivot.BoundingBox), funcPointsAngle(w.BoundingBox));
|
||||
return (angleWL >= angleStart && angleWL <= angleEnd);
|
||||
}).ToList();
|
||||
filtered.Remove(pivot); // remove itself
|
||||
var pivotPoint = funcPivotAngle(pivot.BoundingBox);
|
||||
|
||||
if (filtered.Count > 0)
|
||||
{
|
||||
int index = pointR.FindIndexNearest(
|
||||
filtered.Select(w => funcPointsDist(w.BoundingBox)).ToList(),
|
||||
Distances.Euclidean, out double distWL);
|
||||
var wordsWithinAngleBoundDistancePoints = new List<PdfPoint>();
|
||||
|
||||
if (index >= 0)
|
||||
// Filter to words within the angle range.
|
||||
foreach (var word in words)
|
||||
{
|
||||
var matchWL = filtered[index];
|
||||
return new double[]
|
||||
// Ignore the pivot word.
|
||||
if (ReferenceEquals(word, pivot))
|
||||
{
|
||||
(double)pivot.Letters.Select(l => l.FontSize).Mode(),
|
||||
finalDistMEasure(pointR, funcPointsDist(matchWL.BoundingBox))
|
||||
};
|
||||
continue;
|
||||
}
|
||||
|
||||
var angle = Distances.Angle(pivotPoint, funcPointsAngle(word.BoundingBox));
|
||||
|
||||
if (angleBounds.Contains(angle))
|
||||
{
|
||||
wordsWithinAngleBoundDistancePoints.Add(funcPointsDist(word.BoundingBox));
|
||||
}
|
||||
}
|
||||
|
||||
if (wordsWithinAngleBoundDistancePoints.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var closestWordIndex = pointR.FindIndexNearest(wordsWithinAngleBoundDistancePoints, Distances.Euclidean, out _);
|
||||
|
||||
if (closestWordIndex < 0 || closestWordIndex >= wordsWithinAngleBoundDistancePoints.Count)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
return finalDistanceMeasure(pointR, wordsWithinAngleBoundDistancePoints[closestWordIndex]);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Build lines via transitive closure.
|
||||
/// </summary>
|
||||
/// <param name="words"></param>
|
||||
/// <param name="maxDist"></param>
|
||||
/// <param name="wlAngleLB"></param>
|
||||
/// <param name="wlAngleUB"></param>
|
||||
/// <returns></returns>
|
||||
private IEnumerable<TextLine> GetLines(Word[] words, double maxDist, double wlAngleLB, double wlAngleUB)
|
||||
private static IEnumerable<TextLine> GetLines(List<Word> words, double maxDist, AngleBounds withinLine)
|
||||
{
|
||||
/***************************************************************************************************
|
||||
* /!\ WARNING: Given how FindIndexNearest() works, if 'maxDist' > 'word Width', the algo might not
|
||||
@@ -196,8 +220,10 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
pivot => true,
|
||||
(pivot, candidate) =>
|
||||
{
|
||||
var angleWL = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft); // compare bottom right with bottom left for angle
|
||||
return (angleWL >= wlAngleLB && angleWL <= wlAngleUB);
|
||||
// Compare bottom right with bottom left for angle
|
||||
var withinLineAngle = Distances.Angle(pivot.BoundingBox.BottomRight, candidate.BoundingBox.BottomLeft);
|
||||
|
||||
return (withinLineAngle >= withinLine.Lower && withinLineAngle <= withinLine.Upper);
|
||||
}).ToList();
|
||||
|
||||
Func<IEnumerable<Word>, IReadOnlyList<Word>> orderFunc = l => l.OrderBy(x => x.BoundingBox.Left).ToList();
|
||||
@@ -214,7 +240,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
orderFunc = l => l.OrderBy(x => x.BoundingBox.Bottom).ToList();
|
||||
}
|
||||
|
||||
for (int a = 0; a < groupedIndexes.Count(); a++)
|
||||
for (var a = 0; a < groupedIndexes.Count; a++)
|
||||
{
|
||||
yield return new TextLine(orderFunc(groupedIndexes[a].Select(i => words[i])));
|
||||
}
|
||||
@@ -223,10 +249,7 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <summary>
|
||||
/// Build blocks via transitive closure.
|
||||
/// </summary>
|
||||
/// <param name="lines"></param>
|
||||
/// <param name="maxDist"></param>
|
||||
/// <returns></returns>
|
||||
private IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
|
||||
private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist)
|
||||
{
|
||||
/**************************************************************************************************
|
||||
* We want to measure the distance between two lines using the following method:
|
||||
@@ -269,22 +292,68 @@ namespace UglyToad.PdfPig.DocumentLayoutAnalysis
|
||||
/// <summary>
|
||||
/// Get the average distance value of the peak bucket of the histogram.
|
||||
/// </summary>
|
||||
/// <param name="values">array[0]=font size, array[1]=distance</param>
|
||||
/// <returns></returns>
|
||||
private double GetPeakAverageDistance(IEnumerable<double[]> values)
|
||||
/// <param name="distances">The set of distances to average.</param>
|
||||
private static double? GetPeakAverageDistance(IEnumerable<double> distances)
|
||||
{
|
||||
int max = (int)values.Max(x => x[1]) + 1;
|
||||
int[] distrib = new int[max];
|
||||
var buckets = new Dictionary<int, List<double>>();
|
||||
foreach (var distance in distances)
|
||||
{
|
||||
var floor = (int)distance;
|
||||
|
||||
// Create histogram with buckets of size 1.
|
||||
for (int i = 0; i < max; i++)
|
||||
if (buckets.ContainsKey(floor))
|
||||
{
|
||||
distrib[i] = values.Where(x => x[1] > i && x[1] <= i + 1).Count();
|
||||
buckets[floor].Add(distance);
|
||||
}
|
||||
else
|
||||
{
|
||||
buckets[floor] = new List<double> {distance};
|
||||
}
|
||||
}
|
||||
|
||||
var peakIndex = Array.IndexOf(distrib, distrib.Max());
|
||||
var best = default(List<double>);
|
||||
|
||||
return values.Where(v => v[1] > peakIndex && v[1] <= peakIndex + 1).Average(x => x[1]);
|
||||
foreach (var bucket in buckets)
|
||||
{
|
||||
if (best == null || bucket.Value.Count > best.Count)
|
||||
{
|
||||
best = bucket.Value;
|
||||
}
|
||||
}
|
||||
|
||||
return best?.Average();
|
||||
}
|
||||
|
||||
/// <summary>
/// An inclusive angle range used to decide whether two words have a certain type of relationship.
/// </summary>
public struct AngleBounds
{
    /// <summary>
    /// Create a new <see cref="AngleBounds"/>.
    /// </summary>
    /// <param name="lowerBound">Stored as <see cref="Lower"/>.</param>
    /// <param name="upperBound">Stored as <see cref="Upper"/>.</param>
    public AngleBounds(double lowerBound, double upperBound)
    {
        Upper = upperBound;
        Lower = lowerBound;
    }

    /// <summary>
    /// The lower bound in degrees.
    /// </summary>
    public double Lower { get; }

    /// <summary>
    /// The upper bound in degrees.
    /// </summary>
    public double Upper { get; }

    /// <summary>
    /// Whether the bounds contain the angle. Both ends are inclusive.
    /// </summary>
    /// <param name="angle">The angle to test.</param>
    /// <returns><c>true</c> when <paramref name="angle"/> lies between <see cref="Lower"/> and <see cref="Upper"/>.</returns>
    public bool Contains(double angle) => Lower <= angle && angle <= Upper;
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,32 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Alternative: text content together with a <c>PURPOSE</c> attribute.
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoAlternative
        {
            /// <summary>
            /// Purpose. Mapped to the <c>PURPOSE</c> XML attribute.
            /// </summary>
            [XmlAttribute(AttributeName = "PURPOSE")]
            public string Purpose { get; set; }

            /// <summary>
            /// Value. Mapped to the element's text content.
            /// </summary>
            [XmlText]
            public string Value { get; set; }
        }
    }
}
|
144
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoBlock.cs
Normal file
144
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoBlock.cs
Normal file
@@ -0,0 +1,144 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Diagnostics;
    using System.Xml.Schema;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto] Base type for any kind of block on the page.
        /// <para>
        /// Value-typed optional attributes (<see cref="Rotation"/>, <see cref="CorrectionStatus"/>,
        /// <see cref="Show"/>, <see cref="Actuate"/>) are each paired with an ignored
        /// <c>...Specified</c> flag, following the XmlSerializer <c>propertyNameSpecified</c> convention.
        /// </para>
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never)]
        [XmlInclude(typeof(AltoTextBlock))]
        [XmlInclude(typeof(AltoGraphicalElement))]
        [XmlInclude(typeof(AltoIllustration))]
        [XmlInclude(typeof(AltoComposedBlock))]
        [Serializable]
        [DebuggerStepThrough]
        [XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
        public class AltoBlock : AltoPositionedElement
        {
            private float rotation;
            private bool correctionStatus;
            private AltoBlockTypeShow show;
            private AltoBlockTypeActuate actuate;

            /// <summary>
            /// The <see cref="AltoShape"/> of this block. No explicit XML mapping attribute is applied.
            /// </summary>
            public AltoShape Shape { get; set; }

            /// <summary>
            /// Mapped to the <c>ID</c> attribute (schema type <c>ID</c>).
            /// </summary>
            [XmlAttribute("ID", DataType = "ID")]
            public string Id { get; set; }

            /// <summary>
            /// Mapped to the <c>STYLEREFS</c> attribute (schema type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("STYLEREFS", DataType = "IDREFS")]
            public string StyleRefs { get; set; }

            /// <summary>
            /// Mapped to the <c>TAGREFS</c> attribute (schema type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("TAGREFS", DataType = "IDREFS")]
            public string TagRefs { get; set; }

            /// <summary>
            /// Mapped to the <c>PROCESSINGREFS</c> attribute (schema type <c>IDREFS</c>).
            /// </summary>
            [XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
            public string ProcessingRefs { get; set; }

            /// <summary>
            /// The rotation of e.g. text or illustration within the block. The value is in degree counterclockwise.
            /// <para>Assigning a value updates <see cref="RotationSpecified"/>: NaN means "not specified".</para>
            /// </summary>
            [XmlAttribute("ROTATION")]
            public float Rotation
            {
                get => rotation;
                set
                {
                    rotation = value;

                    // Track the latest assignment so that assigning NaN after a real value
                    // clears the flag instead of leaving it stale.
                    RotationSpecified = !float.IsNaN(value);
                }
            }

            /// <summary>
            /// Whether the value most recently assigned to <see cref="Rotation"/> was a number (not NaN).
            /// <c>false</c> until <see cref="Rotation"/> is assigned.
            /// </summary>
            [XmlIgnore]
            public bool RotationSpecified { get; private set; }

            /// <summary>
            /// The next block in reading sequence on the page.
            /// </summary>
            [XmlAttribute("IDNEXT", DataType = "IDREF")]
            public string IdNext { get; set; }

            /// <summary>
            /// Correction Status. Indicates whether manual correction has been done or not.
            /// The correction status should be recorded at the highest level possible (Block, TextLine, String).
            /// <para>Assigning any value sets <see cref="CorrectionStatusSpecified"/>.</para>
            /// </summary>
            [XmlAttribute("CS")]
            public bool CorrectionStatus
            {
                get => correctionStatus;
                set
                {
                    correctionStatus = value;
                    CorrectionStatusSpecified = true;
                }
            }

            /// <summary>
            /// <c>true</c> once <see cref="CorrectionStatus"/> has been assigned.
            /// </summary>
            [XmlIgnore]
            public bool CorrectionStatusSpecified { get; private set; }

            /// <summary>
            /// The <c>type</c> attribute in the <c>http://www.w3.org/1999/xlink</c> namespace.
            /// </summary>
            [XmlAttribute("type", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Type { get; set; }

            /// <summary>
            /// The <c>href</c> attribute in the <c>http://www.w3.org/1999/xlink</c> namespace (schema type <c>anyURI</c>).
            /// </summary>
            [XmlAttribute("href", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink", DataType = "anyURI")]
            public string Href { get; set; }

            /// <summary>
            /// The <c>role</c> attribute in the <c>http://www.w3.org/1999/xlink</c> namespace.
            /// </summary>
            [XmlAttribute("role", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Role { get; set; }

            /// <summary>
            /// The <c>arcrole</c> attribute in the <c>http://www.w3.org/1999/xlink</c> namespace.
            /// </summary>
            [XmlAttribute("arcrole", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Arcrole { get; set; }

            /// <summary>
            /// The <c>title</c> attribute in the <c>http://www.w3.org/1999/xlink</c> namespace.
            /// </summary>
            [XmlAttribute("title", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public string Title { get; set; }

            /// <summary>
            /// The <c>show</c> attribute in the <c>http://www.w3.org/1999/xlink</c> namespace.
            /// <para>Assigning any value sets <see cref="ShowSpecified"/>.</para>
            /// </summary>
            [XmlAttribute("show", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public AltoBlockTypeShow Show
            {
                get => show;
                set
                {
                    show = value;
                    ShowSpecified = true;
                }
            }

            /// <summary>
            /// <c>true</c> once <see cref="Show"/> has been assigned.
            /// </summary>
            [XmlIgnore]
            public bool ShowSpecified { get; private set; }

            /// <summary>
            /// The <c>actuate</c> attribute in the <c>http://www.w3.org/1999/xlink</c> namespace.
            /// <para>Assigning any value sets <see cref="ActuateSpecified"/>.</para>
            /// </summary>
            [XmlAttribute("actuate", Form = XmlSchemaForm.Qualified, Namespace = "http://www.w3.org/1999/xlink")]
            public AltoBlockTypeActuate Actuate
            {
                get => actuate;
                set
                {
                    actuate = value;
                    ActuateSpecified = true;
                }
            }

            /// <summary>
            /// <c>true</c> once <see cref="Actuate"/> has been assigned.
            /// </summary>
            [XmlIgnore]
            public bool ActuateSpecified { get; private set; }
        }
    }
}
|
@@ -0,0 +1,31 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto/xlink] Block Type Actuate.
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never), Serializable]
        [XmlType(AnonymousType = true, Namespace = "http://www.w3.org/1999/xlink")]
        public enum AltoBlockTypeActuate
        {
            /// <summary>
            /// Written to XML as <c>onLoad</c>.
            /// </summary>
            [XmlEnum(Name = "onLoad")]
            OnLoad = 0,

            /// <summary>
            /// Written to XML as <c>onRequest</c>.
            /// </summary>
            [XmlEnum(Name = "onRequest")]
            OnRequest = 1,

            /// <summary>
            /// Written to XML as <c>other</c>.
            /// </summary>
            [XmlEnum(Name = "other")]
            Other = 2,

            /// <summary>
            /// Written to XML as <c>none</c>.
            /// </summary>
            [XmlEnum(Name = "none")]
            None = 3,
        }
    }
}
|
@@ -0,0 +1,34 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
{
    using System;
    using System.ComponentModel;
    using System.Xml.Serialization;

    public partial class AltoDocument
    {
        /// <summary>
        /// [Alto/xlink] Block Type Show.
        /// </summary>
        [EditorBrowsable(EditorBrowsableState.Never), Serializable]
        [XmlType(AnonymousType = true, Namespace = "http://www.w3.org/1999/xlink")]
        public enum AltoBlockTypeShow
        {
            /// <summary>
            /// Written to XML as <c>new</c>.
            /// </summary>
            [XmlEnum(Name = "new")]
            New = 0,

            /// <summary>
            /// Written to XML as <c>replace</c>.
            /// </summary>
            [XmlEnum(Name = "replace")]
            Replace = 1,

            /// <summary>
            /// Written to XML as <c>embed</c>.
            /// </summary>
            [XmlEnum(Name = "embed")]
            Embed = 2,

            /// <summary>
            /// Written to XML as <c>other</c>.
            /// </summary>
            [XmlEnum(Name = "other")]
            Other = 3,

            /// <summary>
            /// Written to XML as <c>none</c>.
            /// </summary>
            [XmlEnum(Name = "none")]
            None = 4
        }
    }
}
|
32
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoCircle.cs
Normal file
32
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoCircle.cs
Normal file
@@ -0,0 +1,32 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] A circle shape. <see cref="HorizontalPosition"/> and <see cref="VerticalPosition"/> describe the center of the circle.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoCircle
|
||||
{
|
||||
/// <remarks/>
|
||||
[XmlAttribute("HPOS")]
|
||||
public float HorizontalPosition { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("VPOS")]
|
||||
public float VerticalPosition { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("RADIUS")]
|
||||
public float Radius { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,45 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] A block that consists of other blocks.
|
||||
/// <para>WARNING: The CIRCULAR GROUP REFERENCE was removed from the xsd.
|
||||
/// NEED TO ADD IT BACK!!!</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoComposedBlock : AltoBlock
|
||||
{
|
||||
// TODO: what is this?
|
||||
/*****************************************************************
|
||||
* /!\ WARNING /!\
|
||||
* The CIRCULAR GROUP REFERENCE below was removed from the xsd
|
||||
* NEED TO ADD IT BACK!!!
|
||||
* <xsd:sequence minOccurs="0" maxOccurs="unbounded">
|
||||
* <xsd:group ref="BlockGroup"/>
|
||||
* </xsd:sequence>
|
||||
*****************************************************************/
|
||||
|
||||
/// <summary>
|
||||
/// A user defined string to identify the type of composed block (e.g. table, advertisement, ...)
|
||||
/// </summary>
|
||||
[XmlAttribute("TYPE")]
|
||||
public string TypeComposed { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// An ID to link to an image which contains only the composed block.
|
||||
/// The ID and the file link is defined in the related METS file.
|
||||
/// </summary>
|
||||
[XmlAttribute("FILEID")]
|
||||
public string FileId { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,37 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] Description
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoDescription
|
||||
{
|
||||
/// <remarks/>
|
||||
public AltoMeasurementUnit MeasurementUnit { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlElement("sourceImageInformation")]
|
||||
public AltoSourceImageInformation SourceImageInformation { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Element deprecated. 'Processing' should be used instead.
|
||||
/// </summary>
|
||||
[XmlElement("OCRProcessing")]
|
||||
public AltoDescriptionOcrProcessing[] OcrProcessing { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlElement("Processing")]
|
||||
public AltoDescriptionProcessing[] Processings { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,26 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// [Alto] Description Ocr Processing
|
||||
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoDescriptionOcrProcessing : AltoOcrProcessing
|
||||
{
|
||||
/// <remarks/>
|
||||
[XmlAttribute(DataType = "ID")]
|
||||
public string Id { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,27 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// [Alto] Description Processing
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoDescriptionProcessing : AltoProcessingStep
|
||||
{
|
||||
/// <summary>
|
||||
/// Id.
|
||||
/// </summary>
|
||||
[XmlAttribute("ID", DataType = "ID")]
|
||||
public string Id { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,34 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] A unique identifier for the document.
|
||||
/// <para>This identifier must be unique within the local system.
|
||||
/// To facilitate file sharing or interoperability with other systems,
|
||||
/// documentIdentifierLocation may be added to designate the system or
|
||||
/// application where the identifier is unique.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoDocumentIdentifier
|
||||
{
|
||||
/// <summary>
|
||||
/// A location qualifier, i.e., a namespace.
|
||||
/// </summary>
|
||||
[XmlAttribute("documentIdentifierLocation")]
|
||||
public string DocumentIdentifierLocation { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlText]
|
||||
public string Value { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
57
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoEllipse.cs
Normal file
57
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoEllipse.cs
Normal file
@@ -0,0 +1,57 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] An ellipse shape. HPOS and VPOS describe the center of the ellipse.
|
||||
/// HLENGTH and VLENGTH are the width and height of the described ellipse.
|
||||
/// <para>The attribute ROTATION tells the rotation of the e.g. text or
|
||||
/// illustration within the block. The value is in degrees counterclockwise.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoEllipse
|
||||
{
|
||||
private float rotation;
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("HPOS")]
|
||||
public float HorizontalPosition { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("VPOS")]
|
||||
public float VerticalPosition { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("HLENGTH")]
|
||||
public float HorizontalLength { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("VLENGTH")]
|
||||
public float VerticalLength { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("ROTATION")]
|
||||
public float Rotation
|
||||
{
|
||||
get => rotation;
|
||||
set
|
||||
{
|
||||
rotation = value;
|
||||
if (!float.IsNaN(value)) RotationSpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool RotationSpecified { get; private set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,33 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] A unique identifier for the image file. This is drawn from MIX.
|
||||
///
|
||||
/// <para>This identifier must be unique within the local system.
|
||||
/// To facilitate file sharing or interoperability with other systems,
|
||||
/// fileIdentifierLocation may be added to designate the system or
|
||||
/// application where the identifier is unique.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoFileIdentifier
|
||||
{
|
||||
/// <remarks/>
|
||||
[XmlAttribute("fileIdentifierLocation")]
|
||||
public string FileIdentifierLocation { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlText]
|
||||
public string Value { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,50 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] Font styles.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Flags]
|
||||
[Serializable]
|
||||
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public enum AltoFontStyles
|
||||
{
|
||||
/// <summary>
|
||||
/// Bold.
|
||||
/// </summary>
|
||||
[XmlEnum("bold")]
|
||||
Bold = 1,
|
||||
/// <summary>
|
||||
/// Italics.
|
||||
/// </summary>
|
||||
[XmlEnum("italics")]
|
||||
Italics = 2,
|
||||
/// <summary>
|
||||
/// Subscript.
|
||||
/// </summary>
|
||||
[XmlEnum("subscript")]
|
||||
Subscript = 4,
|
||||
/// <summary>
|
||||
/// Superscript.
|
||||
/// </summary>
|
||||
[XmlEnum("superscript")]
|
||||
Superscript = 8,
|
||||
/// <summary>
|
||||
/// Small caps.
|
||||
/// </summary>
|
||||
[XmlEnum("smallcaps")]
|
||||
SmallCaps = 16,
|
||||
/// <summary>
|
||||
/// Underline.
|
||||
/// </summary>
|
||||
[XmlEnum("underline")]
|
||||
Underline = 32,
|
||||
}
|
||||
}
|
||||
}
|
29
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoFontType.cs
Normal file
29
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoFontType.cs
Normal file
@@ -0,0 +1,29 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] Font type (Serif or Sans-Serif).
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public enum AltoFontType
|
||||
{
|
||||
/// <summary>
|
||||
/// Serif.
|
||||
/// </summary>
|
||||
[XmlEnum("serif")]
|
||||
Serif,
|
||||
/// <summary>
|
||||
/// Sans-serif.
|
||||
/// </summary>
|
||||
[XmlEnum("sans-serif")]
|
||||
SansSerif,
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,29 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] Font width (Fixed or proportional).
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public enum AltoFontWidth
|
||||
{
|
||||
/// <summary>
|
||||
/// Proportional.
|
||||
/// </summary>
|
||||
[XmlEnum("proportional")]
|
||||
Proportional,
|
||||
/// <summary>
|
||||
/// Fixed.
|
||||
/// </summary>
|
||||
[XmlEnum("fixed")]
|
||||
Fixed
|
||||
}
|
||||
}
|
||||
}
|
90
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoGlyph.cs
Normal file
90
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoGlyph.cs
Normal file
@@ -0,0 +1,90 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] Modern OCR software stores information on glyph level. A glyph is essentially a character or ligature.
|
||||
/// Accordingly the value for the glyph element will be defined as follows:
|
||||
/// Pre-composed representation = base + combining character(s) (decomposed representation)
|
||||
/// See http://www.fileformat.info/info/unicode/char/0101/index.htm
|
||||
/// "U+0101" = (U+0061) + (U+0304)
|
||||
/// "combining characters" ("base characters" in combination with non-spacing marks or characters which are combined to one) are represented as one "glyph", e.g. áàâ.
|
||||
///
|
||||
/// <para>Each glyph has its own coordinate information and must be separately addressable as a distinct object.
|
||||
/// Correction and verification processes can be carried out for individual characters.</para>
|
||||
///
|
||||
/// <para>Post-OCR analysis of the text as well as adaptive OCR algorithm must be able to record information on glyph level.
|
||||
/// In order to reproduce the decision of the OCR software, optional characters must be recorded. These are called variants.
|
||||
/// The OCR software evaluates each variant and picks the one with the highest confidence score as the glyph.
|
||||
/// The confidence score expresses how confident the OCR software is that a single glyph had been recognized correctly.</para>
|
||||
///
|
||||
/// <para>The glyph elements are in order of the word. Each glyph needs to be recorded to build up the whole word sequence.</para>
|
||||
///
|
||||
/// <para>The glyph’s CONTENT attribute is no replacement for the string’s CONTENT attribute.
|
||||
/// Due to post-processing steps such as correction the values of both attributes may be inconsistent.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoGlyph : AltoPositionedElement
|
||||
{
|
||||
private float gc;
|
||||
|
||||
/// <remarks/>
|
||||
public AltoShape Shape { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Alternative (combined) character for the glyph, outlined by OCR engine or similar recognition processes.
|
||||
/// In case the variant are two (combining) characters, two characters are outlined in one Variant element.
|
||||
/// E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn".
|
||||
/// <para>Details for different use-cases see on the samples on GitHub.</para>
|
||||
/// </summary>
|
||||
[XmlElement("Variant")]
|
||||
public AltoVariant[] Variant { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("ID", DataType = "ID")]
|
||||
public string Id { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// CONTENT contains the precomposed representation (combining character) of the character from the parent String element.
|
||||
/// The sequence position of the Glyph element matches the position of the character in the String.
|
||||
/// </summary>
|
||||
[XmlAttribute("CONTENT")]
|
||||
public string Content { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// This GC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence for the variant where 1 is certain.
|
||||
/// This attribute is optional. If it is not available, the default value for the variant is "0".
|
||||
///
|
||||
/// <para>The GC attribute semantic is the same as the WC attribute on the String element and VC on Variant element.</para>
|
||||
/// </summary>
|
||||
[XmlAttribute("GC")]
|
||||
public float Gc
|
||||
{
|
||||
get => gc;
|
||||
set
|
||||
{
|
||||
gc = value;
|
||||
if (!float.IsNaN(value)) GcSpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool GcSpecified { get; private set; }
|
||||
|
||||
/// <remarks/>
|
||||
public override string ToString()
|
||||
{
|
||||
return Content;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,21 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] A graphic used to separate blocks. Usually a line or rectangle.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoGraphicalElement : AltoBlock
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,32 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] A picture or image.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoIllustration : AltoBlock
|
||||
{
|
||||
/// <summary>
|
||||
/// A user defined string to identify the type of illustration like photo, map, drawing, chart, ...
|
||||
/// </summary>
|
||||
[XmlAttribute("TYPE")]
|
||||
public string IllustrationType { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// A link to an image which contains only the illustration.
|
||||
/// </summary>
|
||||
[XmlAttribute("FILEID")]
|
||||
public string FileId { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,44 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] There are following variation of tag types available:
|
||||
/// LayoutTag – criteria about arrangement or graphical appearance;
|
||||
/// StructureTag – criteria about grouping or formation;
|
||||
/// RoleTag – criteria about function or mission;
|
||||
/// NamedEntityTag – criteria about assignment of terms to their relationship / meaning (NER);
|
||||
/// OtherTag – criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#", IncludeInSchema = false)]
|
||||
public enum AltoItemsChoice
|
||||
{
|
||||
/// <summary>
|
||||
/// Criteria about arrangement or graphical appearance.
|
||||
/// </summary>
|
||||
LayoutTag,
|
||||
/// <summary>
|
||||
/// Criteria about assignment of terms to their relationship / meaning (NER).
|
||||
/// </summary>
|
||||
NamedEntityTag,
|
||||
/// <summary>
|
||||
/// Criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those.
|
||||
/// </summary>
|
||||
OtherTag,
|
||||
/// <summary>
|
||||
/// Criteria about function or mission.
|
||||
/// </summary>
|
||||
RoleTag,
|
||||
/// <summary>
|
||||
/// Criteria about grouping or formation.
|
||||
/// </summary>
|
||||
StructureTag
|
||||
}
|
||||
}
|
||||
}
|
28
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoLayout.cs
Normal file
28
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoLayout.cs
Normal file
@@ -0,0 +1,28 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] Layout.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoLayout
|
||||
{
|
||||
/// <remarks/>
|
||||
[XmlElement("Page")]
|
||||
public AltoPage[] Pages { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("STYLEREFS", DataType = "IDREFS")]
|
||||
public string StyleRefs { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,47 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] All measurement values inside the alto file are related to this unit, except the font size.
|
||||
///
|
||||
/// Coordinates as being used in HPOS and VPOS are absolute coordinates referring to the upper-left corner of a page.
|
||||
/// The upper left corner of the page is defined as coordinate (0/0).
|
||||
///
|
||||
/// <para>values meaning:
|
||||
/// mm10: 1/10th of millimeter;
|
||||
/// inch1200: 1/1200th of inch;
|
||||
/// pixel: 1 pixel</para>
|
||||
///
|
||||
/// The values for pixel will be related to the resolution of the image based
|
||||
/// on which the layout is described. In case the original image is not known
|
||||
/// the scaling factor can be calculated based on total width and height of
|
||||
/// the image and the according information of the PAGE element.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public enum AltoMeasurementUnit
|
||||
{
|
||||
/// <summary>
|
||||
/// 1 pixel.
|
||||
/// </summary>
|
||||
[XmlEnum("pixel")]
|
||||
Pixel,
|
||||
/// <summary>
|
||||
/// 1/10th of millimeter.
|
||||
/// </summary>
|
||||
[XmlEnum("mm10")]
|
||||
Mm10,
|
||||
/// <summary>
|
||||
/// 1/1200th of inch.
|
||||
/// </summary>
|
||||
[XmlEnum("inch1200")]
|
||||
Inch1200,
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,34 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] Ocr Processing
|
||||
/// <para>Element deprecated. 'AltoProcessing' should be used instead.</para>
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoOcrProcessing
|
||||
{
|
||||
/// <remarks/>
|
||||
[XmlElement("preProcessingStep")]
|
||||
public AltoProcessingStep[] PreProcessingStep { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
public AltoProcessingStep OcrProcessingStep { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlElement("postProcessingStep")]
|
||||
public AltoProcessingStep[] PostProcessingStep { get; set; }
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
194
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoPage.cs
Normal file
194
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoPage.cs
Normal file
@@ -0,0 +1,194 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
|
||||
/// [Alto] One page of a document.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoPage
|
||||
{
|
||||
private float height;
|
||||
private float width;
|
||||
private AltoQuality quality;
|
||||
private AltoPosition position;
|
||||
private float accuracy;
|
||||
private float pc;
|
||||
|
||||
/// <summary>
|
||||
/// The area between the top line of print and the upper edge of the leaf. It may contain page number or running title.
|
||||
/// </summary>
|
||||
public AltoPageSpace TopMargin { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The area between the printspace and the left border of a page. May contain margin notes.
|
||||
/// </summary>
|
||||
public AltoPageSpace LeftMargin { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The area between the printspace and the right border of a page. May contain margin notes.
|
||||
/// </summary>
|
||||
public AltoPageSpace RightMargin { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The area between the bottom line of letterpress or writing and the bottom edge of the leaf.
|
||||
/// It may contain a page number, a signature number or a catch word.
|
||||
/// </summary>
|
||||
public AltoPageSpace BottomMargin { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Rectangle covering the printed area of a page. Page number and running title are not part of the print space.
|
||||
/// </summary>
|
||||
public AltoPageSpace PrintSpace { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("ID", DataType = "ID")]
|
||||
public string Id { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Any user-defined class like title page.
|
||||
/// </summary>
|
||||
[XmlAttribute("PAGECLASS")]
|
||||
public string PageClass { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("STYLEREFS", DataType = "IDREFS")]
|
||||
public string StyleRefs { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
|
||||
public string ProcessingRefs { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("HEIGHT")]
|
||||
public float Height
|
||||
{
|
||||
get => height;
|
||||
set
|
||||
{
|
||||
height = value;
|
||||
if (!float.IsNaN(value)) HeightSpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool HeightSpecified { get; private set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("WIDTH")]
|
||||
public float Width
|
||||
{
|
||||
get => width;
|
||||
set
|
||||
{
|
||||
width = value;
|
||||
if (!float.IsNaN(value)) WidthSpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool WidthSpecified { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// The number of the page within the document.
|
||||
/// </summary>
|
||||
[XmlAttribute("PHYSICAL_IMG_NR")]
|
||||
public float PhysicalImgNr { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// The page number that is printed on the page.
|
||||
/// </summary>
|
||||
[XmlAttribute("PRINTED_IMG_NR")]
|
||||
public string PrintedImgNr { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("QUALITY")]
|
||||
public AltoQuality Quality
|
||||
{
|
||||
get => quality;
|
||||
set
|
||||
{
|
||||
quality = value;
|
||||
QualitySpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool QualitySpecified { get; private set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("QUALITY_DETAIL")]
|
||||
public string QualityDetail { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("POSITION")]
|
||||
public AltoPosition Position
|
||||
{
|
||||
get => position;
|
||||
set
|
||||
{
|
||||
position = value;
|
||||
PositionSpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool PositionSpecified { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// A link to the processing description that has been used for this page.
|
||||
/// </summary>
|
||||
[XmlAttribute("PROCESSING", DataType = "IDREF")]
|
||||
public string Processing { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Estimated percentage of OCR Accuracy in range from 0 to 100
|
||||
/// </summary>
|
||||
[XmlAttribute("ACCURACY")]
|
||||
public float Accuracy
|
||||
{
|
||||
get => accuracy;
|
||||
set
|
||||
{
|
||||
accuracy = value;
|
||||
if (!float.IsNaN(value)) AccuracySpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool AccuracySpecified { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
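/// Page confidence (PC attribute). NOTE(review): per the ALTO schema this is presumably the OCR confidence level for the page, between 0 (unsure) and 1 (sure) — confirm.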
|
||||
/// </summary>
|
||||
[XmlAttribute("PC")]
|
||||
public float Pc
|
||||
{
|
||||
get => pc;
|
||||
set
|
||||
{
|
||||
pc = value;
|
||||
if (!float.IsNaN(value)) PcSpecified = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <remarks/>
|
||||
[XmlIgnore]
|
||||
public bool PcSpecified { get; private set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,54 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <inheritdoc />
|
||||
/// <summary>
|
||||
/// [Alto] A region on a page.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
[Serializable]
|
||||
[DebuggerStepThrough]
|
||||
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
|
||||
public class AltoPageSpace : AltoPositionedElement
|
||||
{
|
||||
/// <summary>
|
||||
/// Shape.
|
||||
/// </summary>
|
||||
public AltoShape Shape { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlElement("TextBlock")]
|
||||
public AltoTextBlock[] TextBlock { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlElement("Illustration")]
|
||||
public AltoIllustration[] Illustrations { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlElement("GraphicalElement")]
|
||||
public AltoGraphicalElement[] GraphicalElements { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlElement("ComposedBlock")]
|
||||
public AltoComposedBlock[] ComposedBlocks { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("ID", DataType = "ID")]
|
||||
public string Id { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("STYLEREFS", DataType = "IDREFS")]
|
||||
public string StyleRefs { get; set; }
|
||||
|
||||
/// <remarks/>
|
||||
[XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
|
||||
public string ProcessingRefs { get; set; }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,121 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] A paragraph style defines formatting properties of text blocks.
/// </summary>
/// <remarks>
/// Each optional attribute has a companion <c>*Specified</c> flag that controls whether it is
/// written to the output. For the numeric attributes the flag follows the most recently
/// assigned value: <see cref="float.NaN"/> means "not set".
/// </remarks>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoParagraphStyle
{
    private AltoParagraphStyleAlign align;
    private float left;
    private float right;
    private float linespace;
    private float firstLine;

    /// <summary>
    /// Identifier, written as the <c>ID</c> attribute (XML schema type <c>ID</c>).
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// Indicates the alignment of the paragraph. Could be left, right, center or justify.
    /// </summary>
    [XmlAttribute("ALIGN")]
    public AltoParagraphStyleAlign Align
    {
        get => align;
        set
        {
            align = value;
            AlignSpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="Align"/> in the output. Set once <see cref="Align"/> has been assigned.
    /// </summary>
    [XmlIgnore]
    public bool AlignSpecified { get; private set; }

    /// <summary>
    /// Left indent of the paragraph in relation to the column.
    /// </summary>
    [XmlAttribute("LEFT")]
    public float Left
    {
        get => left;
        set
        {
            left = value;
            // Track the current value: assigning NaN after a real value must clear the flag again.
            LeftSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="Left"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool LeftSpecified { get; private set; }

    /// <summary>
    /// Right indent of the paragraph in relation to the column.
    /// </summary>
    [XmlAttribute("RIGHT")]
    public float Right
    {
        get => right;
        set
        {
            right = value;
            RightSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="Right"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool RightSpecified { get; private set; }

    /// <summary>
    /// Line spacing between two lines of the paragraph. Measurement calculated from baseline to baseline.
    /// </summary>
    [XmlAttribute("LINESPACE")]
    public float LineSpace
    {
        get => linespace;
        set
        {
            linespace = value;
            LineSpaceSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="LineSpace"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool LineSpaceSpecified { get; private set; }

    /// <summary>
    /// Indent of the first line of the paragraph if this is different from the other lines. A negative
    /// value indicates an indent to the left, a positive value indicates an indent to the right.
    /// </summary>
    [XmlAttribute("FIRSTLINE")]
    public float FirstLine
    {
        get => firstLine;
        set
        {
            firstLine = value;
            FirstLineSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="FirstLine"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool FirstLineSpecified { get; private set; }
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,27 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Indicates the alignment of the paragraph. Could be left, right, center or justify.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoParagraphStyleAlign
{
    /// <summary>Left alignment.</summary>
    Left = 0,

    /// <summary>Right alignment.</summary>
    Right = 1,

    /// <summary>Center alignment.</summary>
    Center = 2,

    /// <summary>
    /// Block alignment (presumably the "justify" option named in the type summary; confirm against the ALTO schema).
    /// </summary>
    Block = 3
}
|
||||
}
|
||||
}
|
24
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoPolygon.cs
Normal file
24
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoPolygon.cs
Normal file
@@ -0,0 +1,24 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] A polygon shape.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoPolygon
{
    /// <summary>
    /// The polygon's points, written verbatim as the <c>POINTS</c> attribute.
    /// </summary>
    [XmlAttribute("POINTS")]
    public string Points { get; set; }
}
|
||||
}
|
||||
}
|
39
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoPosition.cs
Normal file
39
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoPosition.cs
Normal file
@@ -0,0 +1,39 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Position of the page. Could be lefthanded, righthanded, cover, foldout or single if it has no special position.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoPosition
{
    /// <summary>Left page.</summary>
    Left = 0,

    /// <summary>Right page.</summary>
    Right = 1,

    /// <summary>Foldout page.</summary>
    Foldout = 2,

    /// <summary>Single page.</summary>
    Single = 3,

    /// <summary>Cover page.</summary>
    Cover = 4
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,106 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// Encapsulates width/height and position data.
/// </summary>
/// <remarks>
/// Every value has a companion <c>*Specified</c> flag that decides whether the attribute is
/// included in the output. The flag follows the most recently assigned value:
/// <see cref="float.NaN"/> means "not set", any other value means "set".
/// </remarks>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
public abstract class AltoPositionedElement
{
    private float height;
    private float width;
    private float horizontalPosition;
    private float verticalPosition;

    /// <summary>
    /// Height.
    /// </summary>
    [XmlAttribute("HEIGHT")]
    public float Height
    {
        get => height;
        set
        {
            height = value;
            // Track the current value: assigning NaN after a real value must clear the flag again,
            // otherwise a NaN attribute would be emitted.
            HeightSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="Height"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool HeightSpecified { get; private set; }

    /// <summary>
    /// Width.
    /// </summary>
    [XmlAttribute("WIDTH")]
    public float Width
    {
        get => width;
        set
        {
            width = value;
            WidthSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="Width"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool WidthSpecified { get; private set; }

    /// <summary>
    /// Horizontal position.
    /// </summary>
    [XmlAttribute("HPOS")]
    public float HorizontalPosition
    {
        get => horizontalPosition;
        set
        {
            horizontalPosition = value;
            HorizontalPositionSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="HorizontalPosition"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool HorizontalPositionSpecified { get; private set; }

    /// <summary>
    /// Vertical position.
    /// </summary>
    [XmlAttribute("VPOS")]
    public float VerticalPosition
    {
        get => verticalPosition;
        set
        {
            verticalPosition = value;
            VerticalPositionSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="VerticalPosition"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool VerticalPositionSpecified { get; private set; }
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,46 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Classification of the category of operation, how the file was created, including generation, modification,
/// preprocessing, postprocessing or any other steps.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Flags, Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoProcessingCategory
{
    /// <summary>Content generation. Written as <c>contentGeneration</c>.</summary>
    [XmlEnum("contentGeneration")]
    ContentGeneration = 1 << 0,

    /// <summary>Content modification. Written as <c>contentModification</c>.</summary>
    [XmlEnum("contentModification")]
    ContentModification = 1 << 1,

    /// <summary>Pre-operation. Written as <c>preOperation</c>.</summary>
    [XmlEnum("preOperation")]
    PreOperation = 1 << 2,

    /// <summary>Post-operation. Written as <c>postOperation</c>.</summary>
    [XmlEnum("postOperation")]
    PostOperation = 1 << 3,

    /// <summary>Other. Written as <c>other</c>.</summary>
    [XmlEnum("other")]
    Other = 1 << 4
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,49 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Information about a software application. Where applicable, the preferred method
/// for determining this information is by selecting Help -- About.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoProcessingSoftware
{
    /// <summary>
    /// The name of the organization or company that created the application.
    /// Written as the <c>softwareCreator</c> attribute.
    /// </summary>
    [XmlAttribute("softwareCreator")]
    public string SoftwareCreator { get; set; }

    /// <summary>
    /// The name of the application. Written as the <c>softwareName</c> attribute.
    /// </summary>
    [XmlAttribute("softwareName")]
    public string SoftwareName { get; set; }

    /// <summary>
    /// The version of the application. Written as the <c>softwareVersion</c> attribute.
    /// </summary>
    [XmlAttribute("softwareVersion")]
    public string SoftwareVersion { get; set; }

    /// <summary>
    /// A description of any important characteristics of the application, especially for
    /// non-commercial applications. For example, if a non-commercial application is built
    /// using commercial components, e.g., an OCR engine SDK. Those components should be mentioned here.
    /// Written as the <c>applicationDescription</c> attribute.
    /// </summary>
    [XmlAttribute("applicationDescription")]
    public string ApplicationDescription { get; set; }
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -0,0 +1,71 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Description of the processing step.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoProcessingStep
{
    private AltoProcessingCategory processingCategory;

    /// <summary>
    /// Classification of the category of operation, how the file was created, including
    /// generation, modification, preprocessing, postprocessing or any other steps.
    /// Assigning a value also marks the attribute as specified.
    /// </summary>
    [XmlAttribute("processingCategory")]
    public AltoProcessingCategory ProcessingCategory
    {
        get { return processingCategory; }
        set
        {
            ProcessingCategorySpecified = true;
            processingCategory = value;
        }
    }

    /// <summary>
    /// Set once <see cref="ProcessingCategory"/> has been assigned; not itself serialized.
    /// </summary>
    [XmlIgnore]
    public bool ProcessingCategorySpecified { get; private set; }

    /// <summary>
    /// Date or DateTime the image was processed.
    /// </summary>
    [XmlAttribute("processingDateTime")]
    public string ProcessingDateTime { get; set; }

    /// <summary>
    /// Identifies the organization level producer(s) of the processed image.
    /// </summary>
    [XmlAttribute("processingAgency")]
    public string ProcessingAgency { get; set; }

    /// <summary>
    /// An ordinal listing of the image processing steps performed. For example, "image despeckling."
    /// </summary>
    [XmlElement("processingStepDescription")]
    public string[] ProcessingStepDescription { get; set; }

    /// <summary>
    /// A description of any setting of the processing application. For example, for a multi-engine
    /// OCR application this might include the engines which were used. Ideally, this description
    /// should be adequate so that someone else using the same application can produce identical results.
    /// </summary>
    [XmlAttribute("processingStepSettings")]
    public string ProcessingStepSettings { get; set; }

    /// <summary>
    /// The software used for this step, written as the <c>processingSoftware</c> child element.
    /// </summary>
    [XmlElement("processingSoftware")]
    public AltoProcessingSoftware ProcessingSoftware { get; set; }
}
|
||||
}
|
||||
}
|
36
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoQuality.cs
Normal file
36
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoQuality.cs
Normal file
@@ -0,0 +1,36 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Gives brief information about original page quality.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoQuality
{
    /// <summary>Quality value <c>OK</c>.</summary>
    // ReSharper disable once InconsistentNaming
    OK = 0,

    /// <summary>Quality value <c>Missing</c>.</summary>
    Missing = 1,

    /// <summary>Written as <c>Missing in original</c>.</summary>
    [XmlEnum("Missing in original")]
    MissingInOriginal = 2,

    /// <summary>Quality value <c>Damaged</c>.</summary>
    Damaged = 3,

    /// <summary>Quality value <c>Retained</c>.</summary>
    Retained = 4,

    /// <summary>Quality value <c>Target</c>.</summary>
    Target = 5,

    /// <summary>Written as <c>As in original</c>.</summary>
    [XmlEnum("As in original")]
    AsInOriginal = 6,
}
|
||||
}
|
||||
}
|
25
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoSP.cs
Normal file
25
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoSP.cs
Normal file
@@ -0,0 +1,25 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] A white space.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
// ReSharper disable once InconsistentNaming
public class AltoSP : AltoPositionedElement
{
    /// <summary>
    /// Identifier, written as the <c>ID</c> attribute (XML schema type <c>ID</c>).
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }
}
|
||||
}
|
||||
}
|
26
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoShape.cs
Normal file
26
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoShape.cs
Normal file
@@ -0,0 +1,26 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Describes the bounding shape of a block, if it is not rectangular.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoShape
{
    /// <summary>
    /// The concrete shape. An <see cref="AltoCircle"/> is written as a <c>Circle</c> element,
    /// an <see cref="AltoEllipse"/> as <c>Ellipse</c> and an <see cref="AltoPolygon"/> as <c>Polygon</c>.
    /// </summary>
    [XmlElement("Circle", typeof(AltoCircle))]
    [XmlElement("Ellipse", typeof(AltoEllipse))]
    [XmlElement("Polygon", typeof(AltoPolygon))]
    public object Item { get; set; }
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,32 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Information to identify the image file from which the OCR text was created.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoSourceImageInformation
{
    /// <summary>
    /// File name, written as the <c>fileName</c> child element.
    /// </summary>
    [XmlElement("fileName")]
    public string FileName { get; set; }

    /// <summary>
    /// File identifiers; each entry is written as a <c>fileIdentifier</c> child element.
    /// </summary>
    [XmlElement("fileIdentifier")]
    public AltoFileIdentifier[] FileIdentifiers { get; set; }

    /// <summary>
    /// Document identifiers; each entry is written as a <c>documentIdentifier</c> child element.
    /// </summary>
    [XmlElement("documentIdentifier")]
    public AltoDocumentIdentifier[] DocumentIdentifiers { get; set; }
}
|
||||
}
|
||||
}
|
148
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoString.cs
Normal file
148
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoString.cs
Normal file
@@ -0,0 +1,148 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] A sequence of chars. Strings are separated by white spaces or hyphenation chars.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoString : AltoPositionedElement
{
    private AltoFontStyles style;
    private AltoSubsType subsType;
    private float wc;
    private bool correctionStatus;

    /// <summary>
    /// Shape.
    /// </summary>
    public AltoShape Shape { get; set; }

    /// <summary>
    /// Alternatives; each entry is written as an <c>ALTERNATIVE</c> child element.
    /// </summary>
    [XmlElement("ALTERNATIVE")]
    public AltoAlternative[] Alternative { get; set; }

    /// <summary>
    /// Glyphs; each entry is written as a <c>Glyph</c> child element.
    /// </summary>
    [XmlElement("Glyph")]
    public AltoGlyph[] Glyph { get; set; }

    /// <summary>
    /// Identifier, written as the <c>ID</c> attribute (XML schema type <c>ID</c>).
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// Style references, written as the <c>STYLEREFS</c> attribute (XML schema type <c>IDREFS</c>).
    /// </summary>
    [XmlAttribute("STYLEREFS", DataType = "IDREFS")]
    public string StyleRefs { get; set; }

    /// <summary>
    /// Tag references, written as the <c>TAGREFS</c> attribute (XML schema type <c>IDREFS</c>).
    /// </summary>
    [XmlAttribute("TAGREFS", DataType = "IDREFS")]
    public string TagRefs { get; set; }

    /// <summary>
    /// Processing references, written as the <c>PROCESSINGREFS</c> attribute (XML schema type <c>IDREFS</c>).
    /// </summary>
    [XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
    public string ProcessingRefs { get; set; }

    /// <summary>
    /// The text of the string, written as the <c>CONTENT</c> attribute. Also returned by <see cref="ToString"/>.
    /// </summary>
    [XmlAttribute("CONTENT")]
    public string Content { get; set; }

    /// <summary>
    /// Font style, written as the <c>STYLE</c> attribute. Assigning a value marks it as specified.
    /// </summary>
    [XmlAttribute("STYLE")]
    public AltoFontStyles Style
    {
        get => style;
        set
        {
            style = value;
            StyleSpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="Style"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool StyleSpecified { get; private set; }

    /// <summary>
    /// Type of the substitution, written as the <c>SUBS_TYPE</c> attribute. Assigning a value marks it as specified.
    /// </summary>
    [XmlAttribute("SUBS_TYPE")]
    public AltoSubsType SubsType
    {
        get => subsType;
        set
        {
            subsType = value;
            SubsTypeSpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="SubsType"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool SubsTypeSpecified { get; private set; }

    /// <summary>
    /// Content of the substitution.
    /// </summary>
    [XmlAttribute("SUBS_CONTENT")]
    public string SubsContent { get; set; }

    /// <summary>
    /// Value written as the <c>WC</c> attribute. <see cref="float.NaN"/> means "not set".
    /// </summary>
    [XmlAttribute("WC")]
    public float Wc
    {
        get => wc;
        set
        {
            wc = value;
            // Track the current value: assigning NaN after a real value must clear the flag again.
            WcSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="Wc"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool WcSpecified { get; private set; }

    /// <summary>
    /// Confidence level of each character in that string. A list of numbers,
    /// one number between 0 (sure) and 9 (unsure) for each character.
    /// </summary>
    [XmlAttribute("CC")]
    public string Cc { get; set; }

    /// <summary>
    /// Correction Status. Indicates whether manual correction has been done or not.
    /// The correction status should be recorded at the highest level possible (Block, TextLine, String).
    /// </summary>
    [XmlAttribute("CS")]
    public bool CorrectionStatus
    {
        get => correctionStatus;
        set
        {
            correctionStatus = value;
            CorrectionStatusSpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="CorrectionStatus"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool CorrectionStatusSpecified { get; private set; }

    /// <summary>
    /// Attribute to record language of the string. The language should be recorded at the highest level possible.
    /// </summary>
    [XmlAttribute("LANG", DataType = "language")]
    public string Language { get; set; }

    /// <summary>
    /// Returns <see cref="Content"/>, or an empty string when no content has been set.
    /// </summary>
    public override string ToString()
    {
        // ToString should never hand back null; an unset Content reads as empty text.
        return Content ?? string.Empty;
    }
}
|
||||
}
|
||||
}
|
32
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoStyles.cs
Normal file
32
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoStyles.cs
Normal file
@@ -0,0 +1,32 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Styles.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoStyles
{
    /// <summary>
    /// Text styles; each entry is written as a <c>TextStyle</c> child element.
    /// </summary>
    [XmlElement("TextStyle")]
    public AltoTextStyle[] TextStyle { get; set; }

    /// <summary>
    /// Paragraph styles; each entry is written as a <c>ParagraphStyle</c> child element.
    /// </summary>
    [XmlElement("ParagraphStyle")]
    public AltoParagraphStyle[] ParagraphStyle { get; set; }
}
|
||||
}
|
||||
}
|
25
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoSubsType.cs
Normal file
25
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoSubsType.cs
Normal file
@@ -0,0 +1,25 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Type of the substitution (if any).
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public enum AltoSubsType
{
    /// <summary>
    /// Substitution type <c>HypPart1</c> (presumably the first part of a hyphenated word; confirm against the ALTO schema).
    /// </summary>
    HypPart1 = 0,

    /// <summary>
    /// Substitution type <c>HypPart2</c> (presumably the second part of a hyphenated word; confirm against the ALTO schema).
    /// </summary>
    HypPart2 = 1,

    /// <summary>
    /// Substitution type <c>Abbreviation</c>.
    /// </summary>
    Abbreviation = 2,
}
|
||||
}
|
||||
}
|
60
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoTag.cs
Normal file
60
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoTag.cs
Normal file
@@ -0,0 +1,60 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Tag.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoTag
{
    /// <summary>
    /// The xml data wrapper element XmlData is used to contain XML encoded metadata.
    /// The content of an XmlData element can be in any namespace or in no namespace.
    /// As permitted by the XML Schema Standard, the processContents attribute value for the
    /// metadata in an XmlData is set to "lax". Therefore, if the source schema and its location are
    /// identified by means of an XML schemaLocation attribute, then an XML processor will validate
    /// the elements for which it can find declarations. If a source schema is not identified, or cannot be
    /// found at the specified schemaLocation, then an XML validator will check for well-formedness,
    /// but otherwise skip over the elements appearing in the XmlData element.
    /// </summary>
    public AltoTagXmlData XmlData { get; set; }

    /// <summary>
    /// Identifier, written as the <c>ID</c> attribute (XML schema type <c>ID</c>).
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// Type can be used to classify and group the information within each tag element type.
    /// </summary>
    [XmlAttribute("TYPE")]
    public string Type { get; set; }

    /// <summary>
    /// Content / information value of the tag.
    /// </summary>
    [XmlAttribute("LABEL")]
    public string Label { get; set; }

    /// <summary>
    /// Description text for tag information for clarification.
    /// </summary>
    [XmlAttribute("DESCRIPTION")]
    public string Description { get; set; }

    /// <summary>
    /// Any URI for authority or description relevant information.
    /// </summary>
    [XmlAttribute("URI", DataType = "anyURI")]
    public string Uri { get; set; }
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,32 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] The xml data wrapper element XmlData is used to contain XML encoded metadata.
/// The content of an XmlData element can be in any namespace or in no namespace.
/// As permitted by the XML Schema Standard, the processContents attribute value for the
/// metadata in an XmlData is set to "lax". Therefore, if the source schema and its location are
/// identified by means of an XML schemaLocation attribute, then an XML processor will validate
/// the elements for which it can find declarations. If a source schema is not identified, or cannot be
/// found at the specified schemaLocation, then an XML validator will check for well-formedness,
/// but otherwise skip over the elements appearing in the XmlData element.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoTagXmlData
{
    /// <summary>
    /// The wrapped XML elements, mapped with <see cref="XmlAnyElementAttribute"/>.
    /// </summary>
    [XmlAnyElement]
    public XmlElement[] Any { get; set; }
}
|
||||
}
|
||||
}
|
39
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoTags.cs
Normal file
39
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoTags.cs
Normal file
@@ -0,0 +1,39 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] There are following variation of tag types available:
/// LayoutTag – criteria about arrangement or graphical appearance;
/// StructureTag – criteria about grouping or formation;
/// RoleTag – criteria about function or mission;
/// NamedEntityTag – criteria about assignment of terms to their relationship / meaning (NER);
/// OtherTag – criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those.;
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoTags
{
    /// <summary>
    /// The tags. Each entry is written under one of the element names listed below; the name used
    /// for an entry is selected through <see cref="ItemsElementName"/>.
    /// </summary>
    [XmlElement("LayoutTag", typeof(AltoTag))]
    [XmlElement("NamedEntityTag", typeof(AltoTag))]
    [XmlElement("OtherTag", typeof(AltoTag))]
    [XmlElement("RoleTag", typeof(AltoTag))]
    [XmlElement("StructureTag", typeof(AltoTag))]
    [XmlChoiceIdentifier("ItemsElementName")]
    public AltoTag[] Items { get; set; }

    /// <summary>
    /// Choice identifier for <see cref="Items"/>; not itself written to the output.
    /// </summary>
    [XmlElement("ItemsElementName"), XmlIgnore]
    public AltoItemsChoice[] ItemsElementName { get; set; }
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,43 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] A block of text.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoTextBlock : AltoBlock
{
    /// <summary>
    /// The lines of this block; each entry is written as a <c>TextLine</c> child element.
    /// </summary>
    [XmlElement("TextLine")]
    public AltoTextBlockTextLine[] TextLines { get; set; }

    /// <summary>
    /// Attribute deprecated. LANG should be used instead.
    /// </summary>
    [XmlAttribute("language", DataType = "language")]
    public string Language { get; set; }

    /// <summary>
    /// Attribute to record language of the textblock.
    /// </summary>
    [XmlAttribute("LANG", DataType = "language")]
    public string Lang { get; set; }

    /// <summary>
    /// Returns the text lines joined by single spaces, or an empty string when
    /// <see cref="TextLines"/> has not been set.
    /// </summary>
    public override string ToString()
    {
        // string.Join throws ArgumentNullException for a null sequence; a block without
        // lines must still be printable (debugger, logging).
        if (TextLines == null)
        {
            return string.Empty;
        }

        return string.Join<AltoTextBlockTextLine>(" ", TextLines);
    }
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,101 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] A single line of text.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoTextBlockTextLine : AltoPositionedElement
{
    private float baseline;
    private bool correctionStatus;

    /// <summary>
    /// Shape.
    /// </summary>
    public AltoShape Shape { get; set; }

    /// <summary>
    /// The strings of this line; each entry is written as a <c>String</c> child element.
    /// </summary>
    [XmlElement("String")]
    public AltoString[] Strings { get; set; }

    /// <summary>
    /// The white spaces of this line; each entry is written as an <c>SP</c> child element.
    /// </summary>
    [XmlElement("SP")]
    public AltoSP[] Sp { get; set; }

    /// <summary>
    /// A hyphenation char. Can appear only at the end of a line.
    /// </summary>
    [XmlElement("HYP")]
    public AltoTextBlockTextLineHyp Hyp { get; set; }

    /// <summary>
    /// Identifier, written as the <c>ID</c> attribute (XML schema type <c>ID</c>).
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// Style references, written as the <c>STYLEREFS</c> attribute (XML schema type <c>IDREFS</c>).
    /// </summary>
    [XmlAttribute("STYLEREFS", DataType = "IDREFS")]
    public string StyleRefs { get; set; }

    /// <summary>
    /// Tag references, written as the <c>TAGREFS</c> attribute (XML schema type <c>IDREFS</c>).
    /// </summary>
    [XmlAttribute("TAGREFS", DataType = "IDREFS")]
    public string TagRefs { get; set; }

    /// <summary>
    /// Processing references, written as the <c>PROCESSINGREFS</c> attribute (XML schema type <c>IDREFS</c>).
    /// </summary>
    [XmlAttribute("PROCESSINGREFS", DataType = "IDREFS")]
    public string ProcessingRefs { get; set; }

    /// <summary>
    /// Baseline, written as the <c>BASELINE</c> attribute. <see cref="float.NaN"/> means "not set".
    /// </summary>
    [XmlAttribute("BASELINE")]
    public float BaseLine
    {
        get => baseline;
        set
        {
            baseline = value;
            // Track the current value: assigning NaN after a real value must clear the flag again.
            BaseLineSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether to include <see cref="BaseLine"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool BaseLineSpecified { get; private set; }

    /// <summary>
    /// Language of the line, written as the <c>LANG</c> attribute (XML schema type <c>language</c>).
    /// </summary>
    [XmlAttribute("LANG", DataType = "language")]
    public string Language { get; set; }

    /// <summary>
    /// Correction Status. Indicates whether manual correction has been done or not.
    /// The correction status should be recorded at the highest level possible (Block, TextLine, String).
    /// </summary>
    [XmlAttribute("CS")]
    public bool CorrectionStatus
    {
        get => correctionStatus;
        set
        {
            correctionStatus = value;
            CorrectionStatusSpecified = true;
        }
    }

    /// <summary>
    /// Whether to include <see cref="CorrectionStatus"/> in the output.
    /// </summary>
    [XmlIgnore]
    public bool CorrectionStatusSpecified { get; private set; }

    /// <summary>
    /// Returns the strings of this line joined by single spaces, in array order, or an empty
    /// string when <see cref="Strings"/> has not been set.
    /// </summary>
    public override string ToString()
    {
        // string.Join throws ArgumentNullException for a null sequence; a line without
        // strings must still be printable (debugger, logging).
        if (Strings == null)
        {
            return string.Empty;
        }

        return string.Join<AltoString>(" ", Strings); // take in account order?
    }
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,26 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Hyphenation character of a text line. It may only occur at the end of a line.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(AnonymousType = true, Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoTextBlockTextLineHyp : AltoPositionedElement
{
    /// <summary>
    /// The value of the <c>CONTENT</c> XML attribute.
    /// </summary>
    [XmlAttribute("CONTENT")]
    public string Content { get; set; }
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,96 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Font properties of text, grouped as a text style.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoTextStyle
{
    private AltoFontType fontType;
    private AltoFontWidth fontWidth;
    private AltoFontStyles fontStyle;

    /// <summary>
    /// The <c>ID</c> XML attribute of the style.
    /// </summary>
    [XmlAttribute("ID", DataType = "ID")]
    public string Id { get; set; }

    /// <summary>
    /// Name of the font.
    /// </summary>
    [XmlAttribute("FONTFAMILY")]
    public string FontFamily { get; set; }

    /// <summary>
    /// The <c>FONTTYPE</c> XML attribute. Assigning it flags <see cref="FontTypeSpecified"/>.
    /// </summary>
    [XmlAttribute("FONTTYPE")]
    public AltoFontType FontType
    {
        get { return fontType; }
        set
        {
            fontType = value;
            FontTypeSpecified = true;
        }
    }

    /// <summary>
    /// <c>true</c> once <see cref="FontType"/> has been assigned. Not written to the XML.
    /// </summary>
    [XmlIgnore]
    public bool FontTypeSpecified { get; private set; }

    /// <summary>
    /// The <c>FONTWIDTH</c> XML attribute. Assigning it flags <see cref="FontWidthSpecified"/>.
    /// </summary>
    [XmlAttribute("FONTWIDTH")]
    public AltoFontWidth FontWidth
    {
        get { return fontWidth; }
        set
        {
            fontWidth = value;
            FontWidthSpecified = true;
        }
    }

    /// <summary>
    /// <c>true</c> once <see cref="FontWidth"/> has been assigned. Not written to the XML.
    /// </summary>
    [XmlIgnore]
    public bool FontWidthSpecified { get; private set; }

    /// <summary>
    /// Size of the font in points (1/72 of an inch).
    /// </summary>
    [XmlAttribute("FONTSIZE")]
    public float FontSize { get; set; }

    /// <summary>
    /// Colour of the font as an RGB value, written as hexBinary.
    /// </summary>
    [XmlAttribute("FONTCOLOR", DataType = "hexBinary")]
    public byte[] FontColor { get; set; }

    /// <summary>
    /// Style of the font. Assigning it flags <see cref="FontStyleSpecified"/>.
    /// </summary>
    [XmlAttribute("FONTSTYLE")]
    public AltoFontStyles FontStyle
    {
        get { return fontStyle; }
        set
        {
            fontStyle = value;
            FontStyleSpecified = true;
        }
    }

    /// <summary>
    /// <c>true</c> once <see cref="FontStyle"/> has been assigned. Not written to the XML.
    /// </summary>
    [XmlIgnore]
    public bool FontStyleSpecified { get; private set; }
}
|
||||
}
|
||||
}
|
56
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoVariant.cs
Normal file
56
src/UglyToad.PdfPig/Export/Alto/AltoDocument.AltoVariant.cs
Normal file
@@ -0,0 +1,56 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
public partial class AltoDocument
|
||||
{
|
||||
/// <summary>
/// [Alto] Alternative (combined) character for the glyph, outlined by OCR engine or similar recognition processes.
/// In case the variant is two (combining) characters, both characters are outlined in one Variant element.
/// E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn".
/// For details on different use-cases, see the samples on GitHub.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
[Serializable]
[DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
public class AltoVariant
{
    private float vcField;

    /// <summary>
    /// Each Variant represents an option for the glyph that the OCR software detected as a possible alternative.
    /// In case the variant is two (combining) characters, both characters are outlined in one Variant element.
    /// E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn".
    ///
    /// <para>For details on different use-cases, see the samples on GitHub.</para>
    /// </summary>
    [XmlAttribute("CONTENT")]
    public string Content { get; set; }

    /// <summary>
    /// This VC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence
    /// for the variant, where 1 is certain.
    /// This attribute is optional. If it is not available, the default value for the variant is "0".
    /// The VC attribute semantic is the same as the GC attribute on the Glyph element.
    /// <para>
    /// Assigning a value also updates <see cref="VcSpecified"/>: any number marks the attribute
    /// as present, <see cref="float.NaN"/> marks it as absent.
    /// </para>
    /// </summary>
    [XmlAttribute("VC")]
    public float Vc
    {
        get => vcField;
        set
        {
            vcField = value;

            // Track the latest assignment in both directions so that a NaN written after
            // a real value clears the flag instead of leaving a stale "specified" state.
            VcSpecified = !float.IsNaN(value);
        }
    }

    /// <summary>
    /// Whether <see cref="Vc"/> currently holds a value other than NaN.
    /// Named after the XmlSerializer <c>*Specified</c> convention and excluded from the XML itself.
    /// </summary>
    [XmlIgnore]
    public bool VcSpecified { get; private set; }
}
|
||||
}
|
||||
}
|
51
src/UglyToad.PdfPig/Export/Alto/AltoDocument.cs
Normal file
51
src/UglyToad.PdfPig/Export/Alto/AltoDocument.cs
Normal file
@@ -0,0 +1,51 @@
|
||||
namespace UglyToad.PdfPig.Export.Alto
|
||||
{
|
||||
using System;
|
||||
using System.ComponentModel;
|
||||
using System.Diagnostics;
|
||||
using System.Xml.Serialization;
|
||||
|
||||
/// <summary>
/// [Alto] Root element of the Alto schema.
/// <para>Version 4.1</para>
/// See https://github.com/altoxml/schema
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never), Serializable, DebuggerStepThrough]
[XmlType(Namespace = "http://www.loc.gov/standards/alto/ns-v4#")]
[XmlRoot("alto", Namespace = "http://www.loc.gov/standards/alto/ns-v4#", IsNullable = false)]
public partial class AltoDocument
{
    /// <summary>
    /// General settings of the alto file, such as measurement units and metadata.
    /// </summary>
    public AltoDescription Description { get; set; }

    /// <summary>
    /// Styles define properties of layout elements. A style declared on a parent element
    /// acts as the default style for all of its child elements.
    /// </summary>
    public AltoStyles Styles { get; set; }

    /// <summary>
    /// Tags define properties of additional characteristics. Content elements (Block or String)
    /// point at a tag through its ID using their TAGREFS attribute.
    ///
    /// This container holds the individual elements for LayoutTags, StructureTags,
    /// RoleTags, NamedEntityTags and OtherTags.
    /// </summary>
    public AltoTags Tags { get; set; }

    /// <summary>
    /// Root of the layout elements.
    /// </summary>
    public AltoLayout Layout { get; set; }

    /// <summary>
    /// Version of the schema the ALTO file follows.
    /// </summary>
    [XmlAttribute("SCHEMAVERSION")]
    public string SchemaVersion { get; set; }
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@@ -126,6 +126,25 @@
|
||||
BottomRight = bottomRight;
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
/// Whether this rectangle and <paramref name="rectangle"/> overlap.
/// Only strict separation counts as a miss, so rectangles that merely share an edge are reported as intersecting.
/// </summary>
/// <param name="rectangle">The rectangle to test against this one.</param>
/// <returns><c>true</c> unless the two rectangles are separated along the horizontal or the vertical axis.</returns>
public bool IntersectsWith(PdfRectangle rectangle)
{
    // The rectangles intersect exactly when neither axis separates them: one lying entirely
    // to the side of the other (horizontal), or entirely above/below it (vertical).
    // NOTE(review): the vertical test presumes Top is not smaller than Bottom - confirm for every input.
    return !(Left > rectangle.Right || rectangle.Left > Right)
        && !(Top < rectangle.Bottom || rectangle.Top < Bottom);
}
|
||||
|
||||
/// <summary>
|
||||
/// To string override.
|
||||
/// </summary>
|
||||
|
Reference in New Issue
Block a user