Merge pull request #184 from BobLd/docstrum-v2.3

Fix DocstrumBoundingBoxes when dXj=0
This commit is contained in:
Eliot Jones
2020-06-20 15:27:05 +01:00
committed by GitHub
7 changed files with 16 additions and 22 deletions

View File

@@ -16,7 +16,7 @@
/// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc. /// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para> /// left or right edge of the page.</para>
/// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para> /// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
/// </summary> /// </summary>
public static class DecorationTextBlockClassifier public static class DecorationTextBlockClassifier
{ {

View File

@@ -13,7 +13,6 @@
using UglyToad.PdfPig.Graphics; using UglyToad.PdfPig.Graphics;
using Util; using Util;
/// <inheritdoc />
/// <summary> /// <summary>
/// Alto 4.1 (XML) text exporter. /// Alto 4.1 (XML) text exporter.
/// <para>See https://github.com/altoxml/schema </para> /// <para>See https://github.com/altoxml/schema </para>
@@ -66,7 +65,6 @@
return Serialize(altoDocument); return Serialize(altoDocument);
} }
/// <inheritdoc />
/// <summary> /// <summary>
/// Get the Alto (XML) string of the page layout. Excludes <see cref="T:UglyToad.PdfPig.Geometry.PdfSubpath" />s. /// Get the Alto (XML) string of the page layout. Excludes <see cref="T:UglyToad.PdfPig.Geometry.PdfSubpath" />s.
/// </summary> /// </summary>

View File

@@ -10,7 +10,6 @@
using Graphics.Colors; using Graphics.Colors;
using Graphics.Core; using Graphics.Core;
/// <inheritdoc />
/// <summary> /// <summary>
/// Exports a page as an SVG. /// Exports a page as an SVG.
/// </summary> /// </summary>

View File

@@ -6,7 +6,6 @@
using System.Collections.Generic; using System.Collections.Generic;
using System.Linq; using System.Linq;
/// <inheritdoc />
/// <summary> /// <summary>
/// Default Page Segmenter. All words are included in one block. /// Default Page Segmenter. All words are included in one block.
/// </summary> /// </summary>
@@ -17,7 +16,6 @@
/// </summary> /// </summary>
public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter(); public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter();
/// <inheritdoc />
/// <summary> /// <summary>
/// Get the blocks using default options values. /// Get the blocks using default options values.
/// </summary> /// </summary>

View File

@@ -9,7 +9,6 @@
using System.Threading.Tasks; using System.Threading.Tasks;
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
/// <inheritdoc />
/// <summary> /// <summary>
/// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood /// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
/// clustering of connected components extracted from the document. /// clustering of connected components extracted from the document.
@@ -23,7 +22,6 @@
/// </summary> /// </summary>
public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes(); public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes();
/// <inheritdoc />
/// <summary> /// <summary>
/// Get the blocks using default options values. /// Get the blocks using default options values.
/// </summary> /// </summary>
@@ -34,7 +32,6 @@
return GetBlocks(words, new DocstrumBoundingBoxesOptions()); return GetBlocks(words, new DocstrumBoundingBoxesOptions());
} }
/// <inheritdoc />
/// <summary> /// <summary>
/// Get the blocks using options values. /// Get the blocks using options values.
/// </summary> /// </summary>
@@ -320,7 +317,6 @@
/// <summary> /// <summary>
/// Perpendicular overlapping distance. /// Perpendicular overlapping distance.
/// TODO: describe checks done
/// </summary> /// </summary>
/// <param name="line1"></param> /// <param name="line1"></param>
/// <param name="line2"></param> /// <param name="line2"></param>
@@ -419,7 +415,6 @@
overlap = false; overlap = false;
} }
//double pj = Math.Sqrt((Dj.Y - Cj.Y) * (Dj.Y - Cj.Y) + (Dj.X - Cj.X) * (Dj.X - Cj.X));
double pj = Distances.Euclidean(Cj, Dj); double pj = Distances.Euclidean(Cj, Dj);
normalisedOverlap = (overlap ? pj : -pj) / j.Length; normalisedOverlap = (overlap ? pj : -pj) / j.Length;
@@ -448,21 +443,29 @@
double dYidYj = dYi * dYj; double dYidYj = dYi * dYj;
double dXidXj = dXi * dXj; double dXidXj = dXi * dXj;
double denominator = dYidYj + dXidXj; double denominator = dYidYj + dXidXj;
if (denominator.AlmostEqualsToZero(epsilon)) if (denominator.AlmostEqualsToZero(epsilon))
{ {
// The denominator is 0 when translating points, meaning the lines are perpendicular. // The denominator is 0 when translating points, meaning the lines are perpendicular.
return null; return null;
} }
double xTj = (xPi * dXidXj + xPj * dYidYj + dXj * dYi * (yPi - yPj)) / denominator; double xAj;
double yTj = yPj; // TODO: need to check that double yAj;
if (dXj > epsilon) if (!dXj.AlmostEqualsToZero(epsilon)) // dXj != 0
{ {
yTj = dYj / dXj * (xTj - xPj) + yPj; xAj = (xPi * dXidXj + xPj * dYidYj + dXj * dYi * (yPi - yPj)) / denominator;
yAj = dYj / dXj * (xAj - xPj) + yPj;
}
else // If dXj = 0, then yAj is calculated first, and xAj is calculated from that.
{
// TODO: verify the yAj formula used when dXj = 0
yAj = (yPi * dYidYj + yPj * dXidXj + dYj * dXi * (xPi - xPj)) / denominator;
xAj = xPj;
} }
return new PdfPoint(xTj, yTj); return new PdfPoint(xAj, yAj);
} }
/// <summary> /// <summary>
@@ -482,8 +485,7 @@
double dotProd1 = ax * bx + ay * by; double dotProd1 = ax * bx + ay * by;
if (dotProd1 < 0) return false; if (dotProd1 < 0) return false;
double dotProd2 = bx * bx + by * by; return dotProd1 <= (bx * bx + by * by);
return dotProd1 <= dotProd2;
} }
/// <summary> /// <summary>

View File

@@ -7,7 +7,6 @@
using System.Linq; using System.Linq;
using UglyToad.PdfPig.Geometry; using UglyToad.PdfPig.Geometry;
/// <inheritdoc />
/// <summary> /// <summary>
/// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
/// recursively into a set of rectangular blocks. This implementation leverages bounding boxes. /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
@@ -21,7 +20,6 @@
/// </summary> /// </summary>
public static RecursiveXYCut Instance { get; } = new RecursiveXYCut(); public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();
/// <inheritdoc />
/// <summary> /// <summary>
/// Get the blocks using default options values. /// Get the blocks using default options values.
/// </summary> /// </summary>
@@ -32,7 +30,6 @@
return GetBlocks(words, new RecursiveXYCutOptions()); return GetBlocks(words, new RecursiveXYCutOptions());
} }
/// <inheritdoc />
/// <summary> /// <summary>
/// Get the blocks using options values. /// Get the blocks using options values.
/// </summary> /// </summary>

View File

@@ -5,7 +5,7 @@
/// <summary> /// <summary>
/// Algorithm that retrieves the blocks' reading order using both (spatial) Allen's interval relations and rendering order (TextSequence). /// Algorithm that retrieves the blocks' reading order using both (spatial) Allen's interval relations and rendering order (TextSequence).
/// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.</para> /// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.</para>
/// </summary> /// </summary>
public class UnsupervisedReadingOrderDetector : IReadingOrderDetector public class UnsupervisedReadingOrderDetector : IReadingOrderDetector
{ {