diff --git a/examples/AdvancedTextExtraction.cs b/examples/AdvancedTextExtraction.cs new file mode 100644 index 00000000..ff605fb5 --- /dev/null +++ b/examples/AdvancedTextExtraction.cs @@ -0,0 +1,91 @@ +namespace UglyToad.Examples +{ + using PdfPig; + using System; + using System.Text; + using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; + using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; + using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; + + public static class AdvancedTextExtraction + { + public static void Run(string filePath) + { + var sb = new StringBuilder(); + + using (var document = PdfDocument.Open(filePath)) + { + foreach (var page in document.GetPages()) + { + // 0. Preprocessing + var letters = page.Letters; // no preprocessing + + // 1. Extract words + var wordExtractor = NearestNeighbourWordExtractor.Instance; + var wordExtractorOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions() + { + Filter = (pivot, candidate) => + { + // check if white space (default implementation of 'Filter') + if (string.IsNullOrWhiteSpace(candidate.Value)) + { + // pivot and candidate letters cannot belong to the same word + // if candidate letter is null or white space. + // ('FilterPivot' already checks if the pivot is null or white space by default) + return false; + } + + // check for height difference + var maxHeight = Math.Max(pivot.PointSize, candidate.PointSize); + var minHeight = Math.Min(pivot.PointSize, candidate.PointSize); + if (minHeight != 0 && maxHeight / minHeight > 2.0) + { + // pivot and candidate letters cannot belong to the same word + // if one letter is more than twice the size of the other. + return false; + } + + // check for colour difference + var pivotRgb = pivot.Color.ToRGBValues(); + var candidateRgb = candidate.Color.ToRGBValues(); + if (!pivotRgb.Equals(candidateRgb)) + { + // pivot and candidate letters cannot belong to the same word + // if they don't have the same colour. + return false; + } + + return true; + } + }; + + var words = wordExtractor.GetWords(letters, wordExtractorOptions); + + // 2. Segment page + var pageSegmenter = DocstrumBoundingBoxes.Instance; + var pageSegmenterOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions() + { + + }; + + var textBlocks = pageSegmenter.GetBlocks(words, pageSegmenterOptions); + + // 3. Postprocessing + var readingOrder = UnsupervisedReadingOrderDetector.Instance; + var orderedTextBlocks = readingOrder.Get(textBlocks); + + // 4. Extract text + foreach (var block in orderedTextBlocks) + { + sb.Append(block.Text.Normalize(NormalizationForm.FormKC)); // normalise text + sb.AppendLine(); + } + + sb.AppendLine(); + } + } + + Console.WriteLine(sb.ToString()); + } + } +} diff --git a/examples/Program.cs b/examples/Program.cs index dddb9d0b..5297a657 100644 --- a/examples/Program.cs +++ b/examples/Program.cs @@ -42,6 +42,10 @@ ("Generate PDF/A-2A compliant file", () => GeneratePdfA2AFile.Run(Path.Combine(filesDirectory, "..", "..", "Fonts", "TrueType", "Roboto-Regular.ttf"), Path.Combine(filesDirectory, "smile-250-by-160.jpg"))) + }, + {7, + ("Advance text extraction using layout analysis algorithms", + () => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf"))) } };