Merge pull request #184 from BobLd/docstrum-v2.3

Fix DocstrumBoundingBoxes when dXj=0
2025-09-18 18:27:55 +08:00 · 2020-06-20 15:27:05 +01:00
parent 5fb04582a7 4b88f4adbe
commit 0f65397f48
7 changed files with 16 additions and 22 deletions
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DecorationTextBlockClassifier.cs
@@ -16,7 +16,7 @@
    /// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
    /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
    /// left or right edge of the page.</para>
-    /// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
+    /// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern.</para>
    /// </summary>
    public static class DecorationTextBlockClassifier
    {
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/AltoXmlTextExporter.cs
@@ -13,7 +13,6 @@
    using UglyToad.PdfPig.Graphics;
    using Util;

-    /// <inheritdoc />
    /// <summary>
    /// Alto 4.1 (XML) text exporter.
    /// <para>See https://github.com/altoxml/schema </para>
@@ -66,7 +65,6 @@
            return Serialize(altoDocument);
        }

-        /// <inheritdoc />
        /// <summary>
        /// Get the Alto (XML) string of the page layout. Excludes <see cref="T:UglyToad.PdfPig.Geometry.PdfSubpath" />s.
        /// </summary>
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/Export/SvgTextExporter.cs
@@ -10,7 +10,6 @@
    using Graphics.Colors;
    using Graphics.Core;

-    /// <inheritdoc />
    /// <summary>
    /// Exports a page as an SVG.
    /// </summary>
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DefaultPageSegmenter.cs
@@ -6,7 +6,6 @@
    using System.Collections.Generic;
    using System.Linq;

-    /// <inheritdoc />
    /// <summary>
    /// Default Page Segmenter. All words are included in one block.
    /// </summary>
@@ -17,7 +16,6 @@
        /// </summary>
        public static DefaultPageSegmenter Instance { get; } = new DefaultPageSegmenter();

-        /// <inheritdoc />
        /// <summary>
        /// Get the blocks using default options values.
        /// </summary>
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/DocstrumBoundingBoxes.cs
@@ -9,7 +9,6 @@
    using System.Threading.Tasks;
    using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;

-    /// <inheritdoc />
    /// <summary>
    /// The Document Spectrum (Docstrum) algorithm is a bottom-up page segmentation technique based on nearest-neighbourhood
    /// clustering of connected components extracted from the document.
@@ -23,7 +22,6 @@
        /// </summary>
        public static DocstrumBoundingBoxes Instance { get; } = new DocstrumBoundingBoxes();

-        /// <inheritdoc />
        /// <summary>
        /// Get the blocks using default options values.
        /// </summary>
@@ -34,7 +32,6 @@
            return GetBlocks(words, new DocstrumBoundingBoxesOptions());
        }

-        /// <inheritdoc />
        /// <summary>
        /// Get the blocks using options values.
        /// </summary>
@@ -320,7 +317,6 @@

        /// <summary>
        /// Perpendicular overlapping distance.
-        /// TODO: describe checks done
        /// </summary>
        /// <param name="line1"></param>
        /// <param name="line2"></param>
@@ -419,7 +415,6 @@
                overlap = false;
            }

-            //double pj = Math.Sqrt((Dj.Y - Cj.Y) * (Dj.Y - Cj.Y) + (Dj.X - Cj.X) * (Dj.X - Cj.X));
            double pj = Distances.Euclidean(Cj, Dj);

            normalisedOverlap = (overlap ? pj : -pj) / j.Length;
@@ -448,21 +443,29 @@
            double dYidYj = dYi * dYj;
            double dXidXj = dXi * dXj;
            double denominator = dYidYj + dXidXj;
+
            if (denominator.AlmostEqualsToZero(epsilon))
            {
                // The denominator is 0 when translating points, meaning the lines are perpendicular.
                return null;
            }

-            double xTj = (xPi * dXidXj + xPj * dYidYj + dXj * dYi * (yPi - yPj)) / denominator;
-            double yTj = yPj; // TODO: need to check that
+            double xAj;
+            double yAj;

-            if (dXj > epsilon)
+            if (!dXj.AlmostEqualsToZero(epsilon)) // dXj != 0
            {
-                yTj = dYj / dXj * (xTj - xPj) + yPj;
+                xAj = (xPi * dXidXj + xPj * dYidYj + dXj * dYi * (yPi - yPj)) / denominator;
+                yAj = dYj / dXj * (xAj - xPj) + yPj;
+            }
+            else // If dXj = 0, yAj is computed directly from the formula below and xAj is simply xPj.
+            {
+                // TODO: verify the yAj formula used in the dXj = 0 case
+                yAj = (yPi * dYidYj + yPj * dXidXj + dYj * dXi * (xPi - xPj)) / denominator;
+                xAj = xPj;
            }

-            return new PdfPoint(xTj, yTj);
+            return new PdfPoint(xAj, yAj);
        }

        /// <summary>
@@ -482,8 +485,7 @@
            double dotProd1 = ax * bx + ay * by;
            if (dotProd1 < 0) return false;

-            double dotProd2 = bx * bx + by * by;
-            return dotProd1 <= dotProd2;
+            return dotProd1 <= (bx * bx + by * by);
        }

        /// <summary>
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/PageSegmenter/RecursiveXYCut.cs
@@ -7,7 +7,6 @@
    using System.Linq;
    using UglyToad.PdfPig.Geometry;

-    /// <inheritdoc />
    /// <summary>
    /// The recursive X-Y cut is a top-down page segmentation technique that decomposes a document
    /// recursively into a set of rectangular blocks. This implementation leverages bounding boxes.
@@ -21,7 +20,6 @@
        /// </summary>
        public static RecursiveXYCut Instance { get; } = new RecursiveXYCut();

-        /// <inheritdoc />
        /// <summary>
        /// Get the blocks using default options values.
        /// </summary>
@@ -32,7 +30,6 @@
            return GetBlocks(words, new RecursiveXYCutOptions());
        }

-        /// <inheritdoc />
        /// <summary>
        /// Get the blocks using options values.
        /// </summary>
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/ReadingOrderDetector/UnsupervisedReadingOrderDetector.cs
@@ -5,7 +5,7 @@

    /// <summary>
    /// Algorithm that retrieve the blocks' reading order using both (spatial) Allen’s interval relations and rendering order (TextSequence).
-    /// <para>See section 5.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.</para>
+    /// <para>See section 4.1 of 'Unsupervised document structure analysis of digital scientific articles' by S. Klampfl, M. Granitzer, K. Jack, R. Kern and 'Document Understanding for a Broad Class of Documents' by L. Todoran, M. Worring, M. Aiello and C. Monz.</para>
    /// </summary>
    public class UnsupervisedReadingOrderDetector : IReadingOrderDetector
    {