Compare commits

...

5 Commits

Author SHA1 Message Date
Richard Flamsholt
d7d01f842e Update test Issue874: No longer missing a font
Some checks failed
Build, test and publish draft / build (push) Has been cancelled
Build and test [MacOS] / build (push) Has been cancelled
Run Common Crawl Tests / build (0000-0001) (push) Has been cancelled
Run Common Crawl Tests / build (0002-0003) (push) Has been cancelled
Run Common Crawl Tests / build (0004-0005) (push) Has been cancelled
Run Common Crawl Tests / build (0006-0007) (push) Has been cancelled
Run Integration Tests / build (push) Has been cancelled
Nightly Release / Check if this commit has already been published (push) Has been cancelled
Nightly Release / tests (push) Has been cancelled
Nightly Release / build_and_publish_nightly (push) Has been cancelled
Including the stream-xref means that the formerly missing font is no longer missing, so simply run the two test-cases under the (stricter) assumption of SkipMissingFonts=false.
2025-09-30 18:35:45 +01:00
Richard Flamsholt
33a8d829ee Update test Issue874: Also more text on page 2
Page two has had four more characters added, which is now detected by this xref-stream fix
2025-09-30 18:35:45 +01:00
Richard Flamsholt
57921c7e9b Update test Issue874: Now finds more text on page 1
With the fix for including associated streams, this test now finds more text on the first page. I've verified using Aspose.PDF and by viewing the ErcotFacts.pdf file being tested that yes, it was indeed missing part of the text before.
2025-09-30 18:35:45 +01:00
ricflams
5a6b3970f0 Add table-xref's associated stream-xrefs
- If an XrefTable has an associated stream, as indicated via the XrefStm-property, then read and add that XrefStream
- Any table can have 0 or 1 such associated streams
- A caveat: such an associated stream might also theoretically be part of the Parts-sequence in which case it would be encountered both by looping through all those parts along with all the regular tables and now also by association to any of those tables. It doesn't seem harmful since the offsets are flattened eventually anyway and stored by their offset-key into a mapping-table.
2025-09-30 18:35:45 +01:00
ricflams
397ccb15d6 Add xref-streams tied to any parts, not just the first
On a large sample of pdf-files PdfPig failed to read the correct StructTree-object for about 1% of them. The StructTree object was simply missing in the CrossReferenceTable.CrossReferenceTable.
It turned out that the constructed CrossReferenceTable could miss Stream-parts if there were multiple Table-parts because a stream will only be added if it's associated with the very first Table-part. The remedy would seem to be to check for and add streams that are associated with any of the Table-parts, not just the first one.
On a sample of 72 files where this failed, this change fixed the StructTree for all of them.
2025-09-30 18:35:45 +01:00
4 changed files with 40 additions and 19 deletions

View File

@@ -472,19 +472,13 @@
{
var doc = IntegrationHelpers.GetDocumentPath("ErcotFacts.pdf");
using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = true }))
{
var page1 = document.GetPage(1);
Assert.Equal(1788, page1.Letters.Count);
var page2 = document.GetPage(2);
Assert.Equal(2430, page2.Letters.Count);
}
using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = false }))
{
var ex = Assert.Throws<ArgumentNullException>(() => document.GetPage(1));
Assert.StartsWith("Value cannot be null.", ex.Message);
var page1 = document.GetPage(1);
Assert.Equal(1939, page1.Letters.Count);
var page2 = document.GetPage(2);
Assert.Equal(2434, page2.Letters.Count);
}
}

View File

@@ -56,16 +56,16 @@
// add this and follow chain defined by 'Prev' keys
xrefPartToBytePositionOrder.Add(firstCrossReferenceOffset);
// Get any streams that are tied to this table.
var activePart = currentPart;
var dependents = parts.Where(x => x.TiedToXrefAtOffset == activePart.Offset);
foreach (var dependent in dependents)
{
xrefPartToBytePositionOrder.Add(dependent.Offset);
}
while (currentPart.Dictionary != null)
{
// Get any streams that are tied to this table.
var activePart = currentPart;
var dependents = parts.Where(x => x.TiedToXrefAtOffset == activePart.Offset);
foreach (var dependent in dependents)
{
xrefPartToBytePositionOrder.Add(dependent.Offset);
}
long prevBytePos = currentPart.GetPreviousOffset();
if (prevBytePos == -1)
{

View File

@@ -153,6 +153,23 @@ internal static partial class FirstPassParser
{
results.Add(table);
nextLocation = table.GetPrevious();
// Also add any optional associated Stream
var xRefStm = table.GetXRefStm();
if (xRefStm is long xRefStmValue)
{
var stream = GetXrefStreamOrTable(
offset,
input,
scanner,
xRefStmValue,
log);
if (stream != null)
{
results.Add(stream);
}
}
}
else if (streamOrTable is XrefStream stream)
{

View File

@@ -44,4 +44,14 @@ internal sealed class XrefTable : IXrefSection
return null;
}
/// <summary>
/// Reads the <c>XRefStm</c> entry from this table's dictionary, if present.
/// </summary>
/// <returns>
/// The entry's value as a <see cref="long"/>, or <c>null</c> when there is no
/// dictionary or the dictionary yields no numeric <c>XRefStm</c> entry.
/// </returns>
public long? GetXRefStm()
{
    // Without a dictionary there is nothing to look up.
    if (Dictionary == null)
    {
        return null;
    }

    return Dictionary.TryGet(NameToken.XrefStm, out NumericToken streamOffset)
        ? streamOffset.Long
        : (long?)null;
}
}