diff --git a/src/UglyToad.Pdf.Tests/Integration/Documents/Two Page Text Only - from libre office.pdf b/src/UglyToad.Pdf.Tests/Integration/Documents/Two Page Text Only - from libre office.pdf new file mode 100644 index 00000000..526105bf Binary files /dev/null and b/src/UglyToad.Pdf.Tests/Integration/Documents/Two Page Text Only - from libre office.pdf differ diff --git a/src/UglyToad.Pdf.Tests/Integration/TwoPageTextOnlyLibreOfficeTests.cs b/src/UglyToad.Pdf.Tests/Integration/TwoPageTextOnlyLibreOfficeTests.cs new file mode 100644 index 00000000..3ba15361 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Integration/TwoPageTextOnlyLibreOfficeTests.cs @@ -0,0 +1,58 @@ +namespace UglyToad.Pdf.Tests.Integration +{ + using System; + using System.IO; + using Content; + using Xunit; + + public class TwoPageTextOnlyLibreOfficeTests + { + private static string GetFilename() + { + var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents")); + + return Path.Combine(documentFolder, "Two Page Text Only - from libre office.pdf"); + } + + [Fact] + public void HasCorrectNumberOfPages() + { + var file = GetFilename(); + + using (var document = PdfDocument.Open(File.ReadAllBytes(file))) + { + Assert.Equal(2, document.NumberOfPages); + } + } + + [Fact] + public void HasCorrectPageSize() + { + using (var document = PdfDocument.Open(GetFilename())) + { + var page = document.GetPage(1); + + Assert.Equal(PageSize.A4, page.Size); + + page = document.GetPage(2); + + Assert.Equal(PageSize.A4, page.Size); + } + } + + [Fact] + public void PagesStartWithCorrectText() + { + using (var document = PdfDocument.Open(GetFilename())) + { + var page = document.GetPage(1); + + Assert.StartsWith("Apache License", page.Text); + + page = document.GetPage(2); + + Assert.StartsWith("2. 
Grant of Copyright", page.Text); + } + } + } +} diff --git a/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj b/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj index fad17196..0a38ce05 100644 --- a/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj +++ b/src/UglyToad.Pdf.Tests/UglyToad.Pdf.Tests.csproj @@ -16,6 +16,7 @@ + @@ -43,6 +44,9 @@ PreserveNewest + + PreserveNewest + diff --git a/src/UglyToad.Pdf/Content/Page.cs b/src/UglyToad.Pdf/Content/Page.cs index b2621ac0..e694e54d 100644 --- a/src/UglyToad.Pdf/Content/Page.cs +++ b/src/UglyToad.Pdf/Content/Page.cs @@ -2,6 +2,7 @@ { using System; using System.Collections.Generic; + using System.Linq; public class Page { @@ -18,6 +19,8 @@ public IReadOnlyList<Letter> Letters => Content?.Letters ?? new Letter[0]; + public string Text { get; } + /// <summary> /// Gets the width of the page in points. /// </summary> @@ -44,11 +47,22 @@ MediaBox = mediaBox; CropBox = cropBox; Content = content; + Text = GetText(content); Width = mediaBox.Bounds.Width; Height = mediaBox.Bounds.Height; Size = mediaBox.Bounds.GetPageSize(); } + + private static string GetText(PageContent content) + { + if (content?.Letters == null) + { + return string.Empty; + } + + return string.Join(string.Empty, content.Letters.Select(x => x.Value)); + } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Content/Pages.cs b/src/UglyToad.Pdf/Content/Pages.cs index fbd9c7aa..33c3abe3 100644 --- a/src/UglyToad.Pdf/Content/Pages.cs +++ b/src/UglyToad.Pdf/Content/Pages.cs @@ -91,6 +91,7 @@ bool found = pageNumber == soughtPageNumber; locatedPages[pageNumber] = currentPageDictionary; + pageNumbersObserved.Add(pageNumber); return found; } @@ -117,6 +118,7 @@ if (thisPageMatches) { childFound = true; + break; } }