diff --git a/examples/ExtractTextWithNewlines.cs b/examples/ExtractTextWithNewlines.cs new file mode 100644 index 00000000..1fa86773 --- /dev/null +++ b/examples/ExtractTextWithNewlines.cs @@ -0,0 +1,22 @@ +namespace UglyToad.Examples +{ + using System; + using PdfPig; + using PdfPig.DocumentLayoutAnalysis.TextExtractor; + + internal static class ExtractTextWithNewlines + { + public static void Run(string filePath) + { + using (var document = PdfDocument.Open(filePath)) + { + foreach (var page in document.GetPages()) + { + var text = ContentOrderTextExtractor.GetText(page, true); + + Console.WriteLine(text); + } + } + } + } +} diff --git a/examples/Program.cs b/examples/Program.cs index caabfb6b..11989e41 100644 --- a/examples/Program.cs +++ b/examples/Program.cs @@ -17,12 +17,16 @@ var examples = new Dictionary { {1, - ("Extract Words with newline detection", + ("Extract Words with newline detection (example with algorithm)", () => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "Two Page Text Only - from libre office.pdf"))) }, {2, ("Extract images", () => ExtractImages.Run(Path.Combine(filesDirectory, "2006_Swedish_Touring_Car_Championship.pdf"))) + }, + {3, + ("Extract Text with newlines (using built-in content extractor)", + () => ExtractTextWithNewlines.Run(Path.Combine(filesDirectory, "Two Page Text Only - from libre office.pdf"))) } }; diff --git a/examples/UglyToad.Examples.csproj b/examples/UglyToad.Examples.csproj index 0ddd0028..8bc1ec4d 100644 --- a/examples/UglyToad.Examples.csproj +++ b/examples/UglyToad.Examples.csproj @@ -6,6 +6,7 @@ + diff --git a/examples/UglyToad.Examples.sln b/examples/UglyToad.Examples.sln index d1505dea..9dbe2655 100644 --- a/examples/UglyToad.Examples.sln +++ b/examples/UglyToad.Examples.sln @@ -15,6 +15,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UglyToad.PdfPig.Tokens", ". 
EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UglyToad.PdfPig", "..\src\UglyToad.PdfPig\UglyToad.PdfPig.csproj", "{75ED54D6-308F-44AD-B85E-C027F3AA80AE}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UglyToad.PdfPig.DocumentLayoutAnalysis", "..\src\UglyToad.PdfPig.DocumentLayoutAnalysis\UglyToad.PdfPig.DocumentLayoutAnalysis.csproj", "{70FEC330-CF3F-4815-9BA6-E622907086C9}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -45,6 +47,10 @@ Global {75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Debug|Any CPU.Build.0 = Debug|Any CPU {75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Release|Any CPU.ActiveCfg = Release|Any CPU {75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Release|Any CPU.Build.0 = Release|Any CPU + {70FEC330-CF3F-4815-9BA6-E622907086C9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {70FEC330-CF3F-4815-9BA6-E622907086C9}.Debug|Any CPU.Build.0 = Debug|Any CPU + {70FEC330-CF3F-4815-9BA6-E622907086C9}.Release|Any CPU.ActiveCfg = Release|Any CPU + {70FEC330-CF3F-4815-9BA6-E622907086C9}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs new file mode 100644 index 00000000..6371e143 --- /dev/null +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs @@ -0,0 +1,124 @@ +namespace UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor +{ + using System; + using System.Text; + using Content; + using Util; + + /// + /// Extracts text from a document based on the content order in the file. + /// + public static class ContentOrderTextExtractor + { + /// + /// Gets a human readable representation of the text from the page based on + /// the letter order of the original PDF document. 
+ /// + /// A page from the document. + /// Whether to include a double new-line when the text is likely to be a new paragraph. + public static string GetText(Page page, bool addDoubleNewline = false) + { + var sb = new StringBuilder(); + + var previous = default(Letter); + var hasJustAddedWhitespace = false; + for (var i = 0; i < page.Letters.Count; i++) + { + var letter = page.Letters[i]; + + if (string.IsNullOrEmpty(letter.Value)) + { + continue; + } + + if (letter.Value == " " && !hasJustAddedWhitespace) + { + if (previous != null && IsNewline(previous, letter, page, out _)) + { + continue; + } + + sb.Append(" "); + previous = letter; + hasJustAddedWhitespace = true; + continue; + } + + hasJustAddedWhitespace = false; + + if (previous != null && letter.Value != " ") + { + var nwPrevious = GetNonWhitespacePrevious(page, i); + + if (IsNewline(nwPrevious, letter, page, out var isDoubleNewline)) + { + if (previous.Value == " ") + { + sb.Remove(sb.Length - 1, 1); + } + + sb.AppendLine(); + if (addDoubleNewline && isDoubleNewline) + { + sb.AppendLine(); + } + + hasJustAddedWhitespace = true; + } + else if (previous.Value != " ") + { + var gap = letter.StartBaseLine.X - previous.EndBaseLine.X; + + if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous)) + { + sb.Append(" "); + hasJustAddedWhitespace = true; + } + } + } + + sb.Append(letter.Value); + previous = letter; + } + + return sb.ToString(); + } + + private static Letter GetNonWhitespacePrevious(Page page, int index) + { + for (var i = index - 1; i >= 0; i--) + { + var letter = page.Letters[i]; + if (!string.IsNullOrWhiteSpace(letter.Value)) + { + return letter; + } + } + + return null; + } + + private static bool IsNewline(Letter previous, Letter letter, Page page, out bool isDoubleNewline) + { + isDoubleNewline = false; + + if (previous == null) + { + return false; + } + + var ptSizePrevious = (int)Math.Round(page.ExperimentalAccess.GetPointSize(previous)); + var ptSize = 
(int)Math.Round(page.ExperimentalAccess.GetPointSize(letter)); + var minPtSize = ptSize < ptSizePrevious ? ptSize : ptSizePrevious; + + var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y); + + if (gap > minPtSize * 1.7 && previous.StartBaseLine.Y > letter.StartBaseLine.Y) + { + isDoubleNewline = true; + } + + return gap > minPtSize * 0.9; + } + } +} diff --git a/src/UglyToad.PdfPig.Fonts/TrueType/Tables/GlyphDataTable.cs b/src/UglyToad.PdfPig.Fonts/TrueType/Tables/GlyphDataTable.cs index 9dd168e0..822a636b 100644 --- a/src/UglyToad.PdfPig.Fonts/TrueType/Tables/GlyphDataTable.cs +++ b/src/UglyToad.PdfPig.Fonts/TrueType/Tables/GlyphDataTable.cs @@ -15,7 +15,7 @@ { private readonly IReadOnlyList glyphOffsets; private readonly PdfRectangle maxGlyphBounds; - private readonly TrueTypeDataBytes tableBytes; + private TrueTypeDataBytes tableBytes; /// public string Tag => TrueTypeHeaderTable.Glyf; @@ -98,6 +98,11 @@ private IReadOnlyList ReadGlyphs() { + if (tableBytes == null) + { + throw new InvalidOperationException("Bytes cache was discarded before lazy value evaluated."); + } + var data = tableBytes; var offsets = glyphOffsets; @@ -149,6 +154,8 @@ result[compositeLocation.Key] = ReadCompositeGlyph(data, compositeLocation.Value, compositeLocations, result, emptyGlyph); } + tableBytes = null; + return result; } diff --git a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs index 960f986d..9d104952 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs @@ -1,4 +1,6 @@ -namespace UglyToad.PdfPig.Tests.Integration +using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; + +namespace UglyToad.PdfPig.Tests.Integration { using System; using System.Collections.Generic; @@ -159,6 +161,19 @@ used per estimate, we introduce a “complement class” Naive Bayes is often us } } + [Fact] + public void CanExtractContentOrderText() + { + using 
(var document = PdfDocument.Open(GetFilename())) + { + foreach (var page in document.GetPages()) + { + var text = ContentOrderTextExtractor.GetText(page); + Assert.NotNull(text); + } + } + } + private static IReadOnlyList GetPdfBoxPositionData() { var path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Integration", "Documents", "ICML03-081.Page1.Positions.txt"); diff --git a/src/UglyToad.PdfPig/Util/WhitespaceSizeStatistics.cs b/src/UglyToad.PdfPig/Util/WhitespaceSizeStatistics.cs new file mode 100644 index 00000000..74b5aa49 --- /dev/null +++ b/src/UglyToad.PdfPig/Util/WhitespaceSizeStatistics.cs @@ -0,0 +1,20 @@ +namespace UglyToad.PdfPig.Util +{ + using Content; + + /// + /// Measures of whitespace size based on point size. + /// + public static class WhitespaceSizeStatistics + { + /// + /// Get the average whitespace size expected for a given letter. + /// + public static double GetExpectedWhitespaceSize(Letter letter) => letter.PointSize * 0.27; + + /// + /// Check if the measured gap is probably big enough to be a whitespace character based on the letter. + /// + public static bool IsProbablyWhitespace(double gap, Letter letter) => gap > (GetExpectedWhitespaceSize(letter) - (letter.PointSize * 0.05)); + } +}