diff --git a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs index dd446e96..c4f4af00 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs @@ -27,7 +27,7 @@ var content = File.ReadAllText(path); var input = StringBytesTestConverter.Convert(content, false); - var result = parser.Parse(input.Bytes); + var result = parser.Parse(1, input.Bytes); Assert.NotEmpty(result); } @@ -39,7 +39,7 @@ var content = File.ReadAllText(path); var input = StringBytesTestConverter.Convert(content, false); - var result = parser.Parse(input.Bytes); + var result = parser.Parse(1, input.Bytes); var replacementRegex = new Regex(@"\s(\.\d+)\b"); @@ -72,7 +72,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(input.Bytes); + var result = parser.Parse(1, input.Bytes); using (var stream = new MemoryStream()) { @@ -102,7 +102,7 @@ ET"; ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(input.Bytes); + var result = parser.Parse(1, input.Bytes); Assert.Equal(7, result.Count); @@ -138,7 +138,7 @@ ET"; var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(input.Bytes); + var result = parser.Parse(1, input.Bytes); Assert.Equal(4, result.Count); @@ -163,7 +163,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf ( \(sleep 1; printf ""QUIT\\r\\n""\ var input = StringBytesTestConverter.Convert(s, false); - var result = parser.Parse(input.Bytes); + var result = parser.Parse(1, input.Bytes); Assert.Equal(9, result.Count); diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs index 6b7302ca..8ccf7ec9 100644 --- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs @@ -4,14 +4,10 @@ using System.Collections.Generic; using System.Diagnostics; using System.Linq; - using Colors; using Content; using Core; - using Exceptions; using Filters; - using Fonts; using Geometry; - using IO; using Logging; using Operations; using Parser; @@ -19,7 +15,6 @@ using PdfPig.Core; using Tokenization.Scanner; using Tokens; - using Util; using XObjects; internal class ContentStreamProcessor : IOperationContext @@ -52,6 +47,7 @@ private IFont activeExtendedGraphicsStateFont; private InlineImageBuilder inlineImageBuilder; private bool currentPathAdded; + private int pageNumber; /// /// A counter to track individual calls to operations used to determine if letters are likely to be @@ -97,8 +93,9 @@ ColorSpaceContext = new ColorSpaceContext(GetCurrentState, resourceStore); } - public PageContent Process(IReadOnlyList operations) + public PageContent Process(int pageNumberCurrent, IReadOnlyList operations) { + pageNumber = pageNumberCurrent; CloneAllStates(); ProcessOperations(operations); @@ -364,7 +361,7 @@ var contentStream = formStream.Decode(filterProvider); - var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentStream)); + var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream)); // 3. We don't respect clipping currently. diff --git a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs index e278b0a6..4ad56ba9 100644 --- a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs +++ b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs @@ -1,7 +1,6 @@ namespace UglyToad.PdfPig.Graphics { using System.Collections.Generic; - using Geometry; using Tokens; using PdfPig.Core; using Util.JetBrains.Annotations; diff --git a/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs b/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs index 60dd51e2..f6db3f22 100644 --- a/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs +++ b/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs @@ -6,7 +6,6 @@ using Colors; using Content; using Core; - using Exceptions; using Filters; using PdfPig.Core; using Tokenization.Scanner; diff --git a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs index 92fc7336..3f011f7f 100644 --- a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs @@ -6,6 +6,6 @@ internal interface IPageContentParser { - IReadOnlyList Parse(IInputBytes inputBytes); + IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs index 45bfbf4f..c18cc2ff 100644 --- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs +++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs @@ -1,6 +1,8 @@ namespace UglyToad.PdfPig.Parser { + using System; using System.Collections.Generic; + using System.Linq; using Core; using Graphics; using Graphics.Operations; @@ -17,13 +19,15 @@ this.operationFactory = operationFactory; } - public IReadOnlyList Parse(IInputBytes inputBytes) + public IReadOnlyList Parse(int pageNumber, IInputBytes inputBytes) { var scanner = new CoreTokenScanner(inputBytes); var precedingTokens = new List(); var graphicsStateOperations = new List(); + var lastEndImageOffset = new long?(); + while (scanner.MoveNext()) { var token = scanner.CurrentToken; @@ -47,15 +51,62 @@ graphicsStateOperations.Add(new BeginInlineImageData(dictionary)); graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data)); + + lastEndImageOffset = scanner.CurrentPosition - 2; + precedingTokens.Clear(); } else if (token is OperatorToken op) { - var operation = operationFactory.Create(op, precedingTokens); - - if (operation != null) + // Handle an end image where the stream of image data contained EI but was not actually a real end image operator. + if (op.Data == "EI") { - graphicsStateOperations.Add(operation); + // Check an end image operation was the last thing that happened. + IGraphicsStateOperation lastOperation = graphicsStateOperations.Count > 0 + ? graphicsStateOperations[graphicsStateOperations.Count - 1] + : null; + + if (lastEndImageOffset == null || lastOperation == null || !(lastOperation is EndInlineImage lastEndImage)) + { + throw new PdfDocumentFormatException("Encountered End Image token outside an inline image on " + + $"page {pageNumber} at offset in content: {scanner.CurrentPosition}."); + } + + // Work out how much data we missed between the false EI operator and the actual one. + var actualEndImageOffset = scanner.CurrentPosition - 3; + + var gap = (int)(actualEndImageOffset - lastEndImageOffset); + + var from = inputBytes.CurrentOffset; + inputBytes.Seek(lastEndImageOffset.Value); + + // Recover the full image data. + { + var missingData = new byte[gap]; + var read = inputBytes.Read(missingData); + if (read != gap) + { + throw new InvalidOperationException($"Failed to read expected buffer length {gap} on page {pageNumber} " + + $"when reading inline image at offset in content: {lastEndImageOffset.Value}."); + } + + // Replace the last end image operator with one containing the full set of data. + graphicsStateOperations.Remove(lastEndImage); + graphicsStateOperations.Add(new EndInlineImage(lastEndImage.ImageData.Concat(missingData).ToArray())); + } + + lastEndImageOffset = actualEndImageOffset; + + inputBytes.Seek(from); + } + else + { + var operation = operationFactory.Create(op, precedingTokens); + + if (operation != null) + { + graphicsStateOperations.Add(operation); + } } precedingTokens.Clear(); diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs index 0637c1be..ea5ea9d0 100644 --- a/src/UglyToad.PdfPig/Parser/PageFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs @@ -109,7 +109,7 @@ } } - content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing); + content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing); } else { @@ -122,7 +122,7 @@ var bytes = contentStream.Decode(filterProvider); - content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing); + content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing); } var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, @@ -137,16 +137,18 @@ return page; } - private PageContent GetContent(IReadOnlyList contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, bool isLenientParsing) + private PageContent GetContent(int pageNumber, IReadOnlyList contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit, + PageRotationDegrees rotation, + bool isLenientParsing) { - var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentBytes)); + var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes)); var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, pageContentParser, filterProvider, log); - return context.Process(operations); + return context.Process(pageNumber, operations); } private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)