Files
PdfPig/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
davebrokit f3e37eafae
Some checks failed
Build, test and publish draft / build (push) Has been cancelled
Build and test [MacOS] / build (push) Has been cancelled
Run Common Crawl Tests / build (0000-0001) (push) Has been cancelled
Run Common Crawl Tests / build (0002-0003) (push) Has been cancelled
Run Common Crawl Tests / build (0004-0005) (push) Has been cancelled
Run Common Crawl Tests / build (0006-0007) (push) Has been cancelled
Run Common Crawl Tests / build (0008-0009) (push) Has been cancelled
Run Common Crawl Tests / build (0010-0011) (push) Has been cancelled
Run Common Crawl Tests / build (0012-0013) (push) Has been cancelled
Run Integration Tests / build (push) Has been cancelled
Tag Release / tag_if_version_changed (push) Has been cancelled
Nightly Release / Check if this commit has already been published (push) Has been cancelled
Nightly Release / tests (push) Has been cancelled
Nightly Release / build_and_publish_nightly (push) Has been cancelled
Introduce IBlock and ILettersBlock interfaces (Round 2) (#1263)
* Code review changes
- Keep the Bounds property on the image classes so this isn't a major breaking API change
- Don't expose letters collection

* Minor fix

* Switch to using BoundingBox in the library

---------

Co-authored-by: davmarksman <david@brokit.co.uk>
2026-02-28 16:25:51 +00:00

198 lines
6.6 KiB
C#

using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
namespace UglyToad.PdfPig.Tests.Integration
{
using PdfPig.Core;
using DocumentLayoutAnalysis.Export;
public class LaTexTests
{
/// <summary>
/// Returns the path of the "ICML03-081.pdf" test document as resolved by
/// IntegrationHelpers.GetDocumentPath.
/// </summary>
private static string GetFilename() => IntegrationHelpers.GetDocumentPath("ICML03-081.pdf");
/// <summary>
/// Opens the document with <c>ParsingOptions.LenientParsingOff</c> and checks that the
/// text of pages 1 and 2 contains known fragments.
/// </summary>
[Fact]
public void CanReadContent()
{
    using var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff);

    var firstPageText = document.GetPage(1).Text;
    Assert.Contains("TacklingthePoorAssumptionsofNaiveBayesTextClassifiers", firstPageText);

    var secondPageText = document.GetPage(2).Text;
    Assert.Contains("is~θc={θc1,θc2,...,θcn},", secondPageText);
}
/// <summary>
/// Checks that the bounding box of the first letter on page 1 has a non-zero height.
/// </summary>
[Fact]
public void LettersHaveHeight()
{
    using var document = PdfDocument.Open(GetFilename());

    var firstLetter = document.GetPage(1).Letters[0];

    Assert.NotEqual(0, firstLetter.BoundingBox.Height);
}
/// <summary>
/// Checks that the document reports exactly 8 pages.
/// </summary>
[Fact]
public void HasCorrectNumberOfPages()
{
    using var document = PdfDocument.Open(GetFilename());

    var pageCount = document.NumberOfPages;

    Assert.Equal(8, pageCount);
}
/// <summary>
/// Compares the leading letters of page 1 with the hard-coded expectations returned by
/// GetXfiniumPositionData, one expectation per letter, in order.
/// </summary>
/// <remarks>
/// The trailing <c>false</c> argument is forwarded to AssertWithinTolerance unchanged;
/// its meaning is defined by that helper and is not visible in this file.
/// </remarks>
[Fact]
public void LettersHaveCorrectPositionsXfinium()
{
    var positions = GetXfiniumPositionData();

    using (var document = PdfDocument.Open(GetFilename()))
    {
        var page = document.GetPage(1);

        // Without this guard the test would silently pass when the page yields fewer
        // letters than there are expectations (including no letters at all).
        Assert.True(page.Letters.Count >= positions.Count,
            $"Expected at least {positions.Count} letters on page 1 but found {page.Letters.Count}.");

        for (var i = 0; i < positions.Count; i++)
        {
            positions[i].AssertWithinTolerance(page.Letters[i], page, false);
        }
    }
}
/// <summary>
/// Compares the letters of page 1 with the expectations loaded by GetPdfBoxPositionData,
/// pairing them by index for as many entries as both sequences have in common.
/// </summary>
[Fact]
public void LettersHaveCorrectPositionsPdfBox()
{
    var positions = GetPdfBoxPositionData();

    // An empty expectation set would make the comparison loop below a no-op.
    Assert.NotEmpty(positions);

    using (var document = PdfDocument.Open(GetFilename()))
    {
        var page = document.GetPage(1);

        // Likewise, a page without letters would let the test pass without checking anything.
        Assert.NotEmpty(page.Letters);

        // Only the common prefix is compared; the two sequences may differ in length.
        var comparable = Math.Min(page.Letters.Count, positions.Count);

        for (var i = 0; i < comparable; i++)
        {
            positions[i].AssertWithinTolerance(page.Letters[i], page);
        }
    }
}
/// <summary>
/// Checks that the first words returned by <c>GetWords()</c> for page 1 match, in order and
/// using ordinal comparison, the whitespace-separated tokens of the expected text.
/// </summary>
[Fact]
public void Page1Words()
{
    const string expectedString = @"Tackling the Poor Assumptions of Naive Bayes Text Classifiers
Jason D. M. Rennie jrennie@mit.edu
Lawrence Shih kai@mit.edu
Jaime Teevan teevan@mit.edu
David R. Karger karger@mit.edu
Artificial Intelligence Laboratory; Massachusetts Institute of Technology; Cambridge, MA 02139
Abstract amples. To balance the amount of training examples
used per estimate, we introduce a “complement class”
Naive Bayes is often used as a baseline in";

    // Line breaks and spaces both delimit expected words; empty tokens are dropped.
    var expected = expectedString.Split(new[] {"\r", "\r\n", "\n", " "}, StringSplitOptions.RemoveEmptyEntries);

    using (var document = PdfDocument.Open(GetFilename()))
    {
        var page = document.GetPage(1);
        var words = page.GetWords().ToList();

        // Without this guard the test would silently pass when extraction returns fewer
        // words than expected, leaving the trailing expectations unchecked.
        Assert.True(words.Count >= expected.Length,
            $"Expected at least {expected.Length} words on page 1 but found {words.Count}.");

        for (var i = 0; i < expected.Length; i++)
        {
            Assert.True(string.Equals(expected[i], words[i].Text, StringComparison.Ordinal),
                $"Expected word {expected[i]} got word {words[i].Text} at index {i}.");
        }
    }
}
/// <summary>
/// Opens the document with <c>ParsingOptions.LenientParsingOff</c>, checks that XMP metadata
/// is present and yields a non-null XDocument, and checks that the raw XML bytes, read as
/// Latin-1, start with the expected xpacket header.
/// </summary>
[Fact]
public void CanGetMetadata()
{
    using var document = PdfDocument.Open(GetFilename(), ParsingOptions.LenientParsingOff);

    Assert.True(document.TryGetXmpMetadata(out var metadata));

    Assert.NotNull(metadata.GetXDocument());

    var xmlBytes = metadata.GetXmlBytes().ToArray();
    var text = OtherEncodings.BytesAsLatin1String(xmlBytes);
    Assert.StartsWith("<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'", text);
}
/// <summary>
/// Opens the document with <c>ClipPaths</c> enabled and checks that SvgTextExporter
/// returns a non-null result for page 1.
/// </summary>
[Fact]
public void CanExportSvg()
{
    var path = GetFilename();
    var options = new ParsingOptions { ClipPaths = true };

    using var document = PdfDocument.Open(path, options);

    var firstPage = document.GetPage(1);
    var exporter = new SvgTextExporter();

    Assert.NotNull(exporter.Get(firstPage));
}
/// <summary>
/// Runs ContentOrderTextExtractor.GetText over every page of the document and checks
/// that each call returns a non-null value.
/// </summary>
[Fact]
public void CanExtractContentOrderText()
{
    using var document = PdfDocument.Open(GetFilename());

    foreach (var currentPage in document.GetPages())
    {
        Assert.NotNull(ContentOrderTextExtractor.GetText(currentPage));
    }
}
/// <summary>
/// Reads "Integration/Documents/ICML03-081.Page1.Positions.txt" from the application base
/// directory and parses every non-empty line with AssertablePositionData.Parse.
/// </summary>
/// <returns>The parsed entries, in file order.</returns>
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
{
    var positionsFile = Path.Combine(
        AppDomain.CurrentDomain.BaseDirectory,
        "Integration",
        "Documents",
        "ICML03-081.Page1.Positions.txt");

    var lineBreaks = new[] { "\r", "\n", "\r\n" };
    var lines = File.ReadAllText(positionsFile).Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    var parsed = new List<AssertablePositionData>();
    foreach (var line in lines)
    {
        parsed.Add(AssertablePositionData.Parse(line));
    }

    return parsed;
}
/// <summary>
/// Parses the embedded five-line position listing with AssertablePositionData.Parse,
/// one entry per non-empty line.
/// </summary>
/// <returns>The parsed entries, in the order they appear in the listing.</returns>
private static IReadOnlyList<AssertablePositionData> GetXfiniumPositionData()
{
    const string data = @"75.731 83.12866 11.218572 T 14.346 WDKAAR+CMBX12 9.956124
85.6153934 83.123866 7.847262 a 11.218572 WDKAAR+CMBX12 9.956124
93.462656 83.123866 7.173 c 11.218572 WDKAAR+CMBX12 9.956124
100.176584 83.123866 8.521524 k 11.218572 WDKAAR+CMBX12 9.956124
108.698108 83.123866 4.490298 l 11.218572 WDKAAR+CMBX12 9.956124";

    var lineBreaks = new[] { "\r", "\n", "\r\n" };
    var rows = data.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    return rows.Select(AssertablePositionData.Parse).ToList();
}
}
}