diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DuplicateOverlappingTextProcessor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DuplicateOverlappingTextProcessor.cs
new file mode 100644
index 00000000..b3e3e2b1
--- /dev/null
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/DuplicateOverlappingTextProcessor.cs
@@ -0,0 +1,104 @@
+namespace UglyToad.PdfPig.DocumentLayoutAnalysis
+{
+    using System.Collections.Generic;
+    using System.Linq;
+    using UglyToad.PdfPig.Content;
+    using UglyToad.PdfPig.PdfFonts;
+
+    /// <summary>
+    /// Removes letters that duplicate and overlap an already seen letter, and flags the letter that remains as bold.
+    /// <para>Logic inspired from PdfBox's PDFTextStripper class.</para>
+    /// </summary>
+    public static class DuplicateOverlappingTextProcessor
+    {
+        /// <summary>
+        /// Removes every letter that duplicates (same value and same font name) and overlaps a previously seen letter,
+        /// and flags the letter that was kept as bold.
+        /// <para>Logic inspired from PdfBox's PDFTextStripper class.</para>
+        /// </summary>
+        /// <param name="letters">Letters to be processed. The sequence is enumerated exactly once.</param>
+        /// <returns>Letters with no duplicate overlapping, in their original order; <c>null</c> if <paramref name="letters"/> is <c>null</c>.</returns>
+        public static IReadOnlyList<Letter> Get(IEnumerable<Letter> letters)
+        {
+            if (letters == null)
+            {
+                return null;
+            }
+
+            var cleanLetters = new List<Letter>();
+
+            foreach (var letter in letters)
+            {
+                int keptIndex = IndexOfOverlappingDuplicate(cleanLetters, letter);
+
+                if (keptIndex < 0)
+                {
+                    cleanLetters.Add(letter);
+                }
+                else
+                {
+                    // duplicate overlapping letter was found: keep the existing one (flagged as bold) and drop this one.
+                    cleanLetters[keptIndex] = ToBold(cleanLetters[keptIndex]);
+                }
+            }
+
+            return cleanLetters;
+        }
+
+        /// <summary>
+        /// Finds the first kept letter that has the same value and font name as <paramref name="letter"/> and whose
+        /// glyph bottom left corner lies within a third of the letter's average character width of the letter's own corner.
+        /// </summary>
+        /// <param name="cleanLetters">Letters kept so far.</param>
+        /// <param name="letter">Candidate letter to compare against the kept letters.</param>
+        /// <returns>The index of the match in <paramref name="cleanLetters"/>, or -1 if there is none.</returns>
+        private static int IndexOfOverlappingDuplicate(List<Letter> cleanLetters, Letter letter)
+        {
+            // A null or empty value counts as a single character when averaging the glyph width.
+            int length = string.IsNullOrEmpty(letter.Value) ? 1 : letter.Value.Length;
+            double tolerance = letter.GlyphRectangle.Width / length / 3.0;
+
+            var bottomLeft = letter.GlyphRectangle.BottomLeft;
+            double minX = bottomLeft.X - tolerance;
+            double maxX = bottomLeft.X + tolerance;
+            double minY = bottomLeft.Y - tolerance;
+            double maxY = bottomLeft.Y + tolerance;
+
+            // Static string.Equals(a, b) is an ordinal comparison that tolerates null on either side.
+            return cleanLetters.FindIndex(l => string.Equals(l.Value, letter.Value) &&
+                                               string.Equals(l.FontName, letter.FontName) && // do other checks?
+                                               minX <= l.GlyphRectangle.BottomLeft.X &&
+                                               maxX >= l.GlyphRectangle.BottomLeft.X &&
+                                               minY <= l.GlyphRectangle.BottomLeft.Y &&
+                                               maxY >= l.GlyphRectangle.BottomLeft.Y);
+        }
+
+        /// <summary>
+        /// Creates a copy of <paramref name="letter"/> whose font details are flagged as bold.
+        /// </summary>
+        /// <param name="letter">Letter to copy; every property other than the font details is carried over unchanged.</param>
+        /// <returns>A new letter with bold font details.</returns>
+        private static Letter ToBold(Letter letter)
+        {
+            // TODO: need to update the bounding box
+            // TODO: need to update bottom left/right
+            // TODO: need to update width
+            // update textSequence?
+            // update markedContentStack?
+
+            // update font details to bold
+            var fontDetails = new FontDetails(letter.Font.Name, true, letter.Font.Weight, letter.Font.IsItalic);
+
+            return new Letter(letter.Value,
+                letter.GlyphRectangle,
+                letter.StartBaseLine,
+                letter.EndBaseLine,
+                letter.Width,
+                letter.FontSize,
+                fontDetails,
+                letter.Color,
+                letter.PointSize,
+                letter.TextSequence);
+        }
+    }
+}
diff --git a/src/UglyToad.PdfPig/Content/Letter.cs b/src/UglyToad.PdfPig/Content/Letter.cs
index 146b7d03..87d66c03 100644
--- a/src/UglyToad.PdfPig/Content/Letter.cs
+++ b/src/UglyToad.PdfPig/Content/Letter.cs
@@ -80,12 +80,12 @@
         ///
         /// Create a new letter to represent some text drawn by the Tj operator.
/// - internal Letter(string value, PdfRectangle glyphRectangle, - PdfPoint startBaseLine, + public Letter(string value, PdfRectangle glyphRectangle, + PdfPoint startBaseLine, PdfPoint endBaseLine, double width, double fontSize, - FontDetails font, + FontDetails font, IColor color, double pointSize, int textSequence)