mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-19 10:47:56 +08:00
Merge pull request #187 from BobLd/dla-example-1
Add AdvancedTextExtraction example
This commit is contained in:
91
examples/AdvancedTextExtraction.cs
Normal file
91
examples/AdvancedTextExtraction.cs
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
namespace UglyToad.Examples
|
||||||
|
{
|
||||||
|
using PdfPig;
|
||||||
|
using System;
|
||||||
|
using System.Text;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||||
|
|
||||||
|
/// <summary>
/// Example showing text extraction that chains a word extractor, a page segmenter
/// and a reading order detector, then prints the collected text to the console.
/// </summary>
public static class AdvancedTextExtraction
{
    /// <summary>
    /// Opens the document at <paramref name="filePath"/>, extracts the text of every page
    /// block by block, and writes the accumulated result to the console.
    /// </summary>
    /// <param name="filePath">Path of the PDF document to open.</param>
    public static void Run(string filePath)
    {
        var output = new StringBuilder();

        using (var pdf = PdfDocument.Open(filePath))
        {
            foreach (var page in pdf.GetPages())
            {
                // Step 0 - preprocessing: the page letters are used untouched.
                var pageLetters = page.Letters;

                // Step 1 - group letters into words with a custom 'Filter'.
                var nearestNeighbour = NearestNeighbourWordExtractor.Instance;
                var nearestNeighbourOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions
                {
                    // Returns true only when the candidate letter is allowed to join the pivot's word.
                    Filter = (pivot, candidate) =>
                    {
                        // Rule 1 (the default 'Filter' behaviour): a null or white space candidate
                        // never shares a word with the pivot. The pivot itself is not re-checked
                        // here because 'FilterPivot' already does that by default.
                        var candidateIsBlank = string.IsNullOrWhiteSpace(candidate.Value);
                        if (candidateIsBlank)
                        {
                            return false;
                        }

                        // Rule 2: reject the pair when one letter is more than twice the size
                        // of the other. A zero size skips the ratio test altogether.
                        var smallerSize = Math.Min(pivot.PointSize, candidate.PointSize);
                        var largerSize = Math.Max(pivot.PointSize, candidate.PointSize);
                        var sizesTooDifferent = smallerSize != 0 && largerSize / smallerSize > 2.0;
                        if (sizesTooDifferent)
                        {
                            return false;
                        }

                        // Rule 3: both letters must have the same RGB colour values.
                        return pivot.Color.ToRGBValues().Equals(candidate.Color.ToRGBValues());
                    }
                };

                var pageWords = nearestNeighbour.GetWords(pageLetters, nearestNeighbourOptions);

                // Step 2 - segment the page into text blocks (options left at their defaults).
                var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(
                    pageWords,
                    new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions());

                // Step 3 - postprocessing: put the blocks in reading order.
                // Step 4 - extract the text, one normalised (FormKC) line per block.
                foreach (var orderedBlock in UnsupervisedReadingOrderDetector.Instance.Get(blocks))
                {
                    output.AppendLine(orderedBlock.Text.Normalize(NormalizationForm.FormKC));
                }

                // Blank line between pages.
                output.AppendLine();
            }
        }

        Console.WriteLine(output.ToString());
    }
}
|
||||||
|
}
|
@@ -42,6 +42,10 @@
|
|||||||
("Generate PDF/A-2A compliant file",
|
("Generate PDF/A-2A compliant file",
|
||||||
() => GeneratePdfA2AFile.Run(Path.Combine(filesDirectory, "..", "..", "Fonts", "TrueType", "Roboto-Regular.ttf"),
|
() => GeneratePdfA2AFile.Run(Path.Combine(filesDirectory, "..", "..", "Fonts", "TrueType", "Roboto-Regular.ttf"),
|
||||||
Path.Combine(filesDirectory, "smile-250-by-160.jpg")))
|
Path.Combine(filesDirectory, "smile-250-by-160.jpg")))
|
||||||
|
},
|
||||||
|
{7,
|
||||||
|
("Advance text extraction using layout analysis algorithms",
|
||||||
|
() => AdvancedTextExtraction.Run(Path.Combine(filesDirectory, "ICML03-081.pdf")))
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user