handle incorrectly detected end image operator

Since an inline image's data stream may contain the characters 'EI' as a result of compression, it is possible to read an end image operator mid-data. This results in the next operator also being an end image operator and leaves the content stream in an invalid state. To recover, when we detect this situation we remove the previous end image operation, read the data up to the current operator, and replace the operation in the list of operations with one containing the complete data bytes.
2025-10-14 02:44:58 +08:00 · 2020-01-08 12:17:30 +00:00
parent a083214da2
commit 4976fa1027
7 changed files with 74 additions and 26 deletions
--- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
@@ -27,7 +27,7 @@
            var content = File.ReadAllText(path);
            var input = StringBytesTestConverter.Convert(content, false);

-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);

            Assert.NotEmpty(result);
        }
@@ -39,7 +39,7 @@
            var content = File.ReadAllText(path);
            var input = StringBytesTestConverter.Convert(content, false);

-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);

            var replacementRegex = new Regex(@"\s(\.\d+)\b");

@@ -72,7 +72,7 @@
 ET";
            var input = StringBytesTestConverter.Convert(s, false);

-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);

            using (var stream = new MemoryStream())
            {
@@ -102,7 +102,7 @@ ET";
 ET";
            var input = StringBytesTestConverter.Convert(s, false);

-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);

            Assert.Equal(7, result.Count);

@@ -138,7 +138,7 @@ ET";

            var input = StringBytesTestConverter.Convert(s, false);

-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);

            Assert.Equal(4, result.Count);

@@ -163,7 +163,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf (   \(sleep 1; printf ""QUIT\\r\\n""\

            var input = StringBytesTestConverter.Convert(s, false);

-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);

            Assert.Equal(9, result.Count);

--- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
+++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
@@ -4,14 +4,10 @@
    using System.Collections.Generic;
    using System.Diagnostics;
    using System.Linq;
-    using Colors;
    using Content;
    using Core;
-    using Exceptions;
    using Filters;
-    using Fonts;
    using Geometry;
-    using IO;
    using Logging;
    using Operations;
    using Parser;
@@ -19,7 +15,6 @@
    using PdfPig.Core;
    using Tokenization.Scanner;
    using Tokens;
-    using Util;
    using XObjects;

    internal class ContentStreamProcessor : IOperationContext
@@ -52,6 +47,7 @@
        private IFont activeExtendedGraphicsStateFont;
        private InlineImageBuilder inlineImageBuilder;
        private bool currentPathAdded;
+        private int pageNumber;

        /// <summary>
        /// A counter to track individual calls to <see cref="ShowText"/> operations used to determine if letters are likely to be
@@ -97,8 +93,9 @@
            ColorSpaceContext = new ColorSpaceContext(GetCurrentState, resourceStore);
        }

-        public PageContent Process(IReadOnlyList<IGraphicsStateOperation> operations)
+        public PageContent Process(int pageNumberCurrent, IReadOnlyList<IGraphicsStateOperation> operations)
        {
+            pageNumber = pageNumberCurrent;
            CloneAllStates();

            ProcessOperations(operations);
@@ -364,7 +361,7 @@

            var contentStream = formStream.Decode(filterProvider);

-            var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentStream));
+            var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream));

            // 3. We don't respect clipping currently.

--- a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs
+++ b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs
@@ -1,7 +1,6 @@
 namespace UglyToad.PdfPig.Graphics
 {
    using System.Collections.Generic;
-    using Geometry;
    using Tokens;
    using PdfPig.Core;
    using Util.JetBrains.Annotations;
--- a/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs
+++ b/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs
@@ -6,7 +6,6 @@
    using Colors;
    using Content;
    using Core;
-    using Exceptions;
    using Filters;
    using PdfPig.Core;
    using Tokenization.Scanner;
--- a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
@@ -6,6 +6,6 @@

    internal interface IPageContentParser
    {
-        IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes);
+        IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes);
    }
 }
--- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
@@ -1,6 +1,8 @@
 namespace UglyToad.PdfPig.Parser
 {
+    using System;
    using System.Collections.Generic;
+    using System.Linq;
    using Core;
    using Graphics;
    using Graphics.Operations;
@@ -17,13 +19,15 @@
            this.operationFactory = operationFactory;
        }

-        public IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes)
+        public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes)
        {
            var scanner = new CoreTokenScanner(inputBytes);

            var precedingTokens = new List<IToken>();
            var graphicsStateOperations = new List<IGraphicsStateOperation>();

+            var lastEndImageOffset = new long?();
+
            while (scanner.MoveNext())
            {
                var token = scanner.CurrentToken;
@@ -47,15 +51,62 @@

                    graphicsStateOperations.Add(new BeginInlineImageData(dictionary));
                    graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data));
+
+                    lastEndImageOffset = scanner.CurrentPosition - 2;
+
                    precedingTokens.Clear();
                }
                else if (token is OperatorToken op)
                {
-                    var operation = operationFactory.Create(op, precedingTokens);
-
-                    if (operation != null)
+                    // Handle an end image where the stream of image data contained EI but was not actually a real end image operator.
+                    if (op.Data == "EI")
                    {
-                        graphicsStateOperations.Add(operation);
+                        // Check an end image operation was the last thing that happened.
+                        IGraphicsStateOperation lastOperation = graphicsStateOperations.Count > 0
+                            ? graphicsStateOperations[graphicsStateOperations.Count - 1]
+                            : null;
+
+                        if (lastEndImageOffset == null || lastOperation == null || !(lastOperation is EndInlineImage lastEndImage))
+                        {
+                            throw new PdfDocumentFormatException("Encountered End Image token outside an inline image on " +
+                                                                 $"page {pageNumber} at offset in content: {scanner.CurrentPosition}.");
+                        }
+
+                        // Work out how much data we missed between the false EI operator and the actual one.
+                        var actualEndImageOffset = scanner.CurrentPosition - 3;
+
+                        var gap = (int)(actualEndImageOffset - lastEndImageOffset);
+
+                        var from = inputBytes.CurrentOffset;
+                        inputBytes.Seek(lastEndImageOffset.Value);
+
+                        // Recover the full image data.
+                        {
+                            var missingData = new byte[gap];
+                            var read = inputBytes.Read(missingData);
+                            if (read != gap)
+                            {
+                                throw new InvalidOperationException($"Failed to read expected buffer length {gap} on page {pageNumber} " +
+                                                                    $"when reading inline image at offset in content: {lastEndImageOffset.Value}.");
+                            }
+                            
+                            // Replace the last end image operator with one containing the full set of data.
+                            graphicsStateOperations.Remove(lastEndImage);
+                            graphicsStateOperations.Add(new EndInlineImage(lastEndImage.ImageData.Concat(missingData).ToArray()));
+                        }
+
+                        lastEndImageOffset = actualEndImageOffset;
+
+                        inputBytes.Seek(from);
+                    }
+                    else
+                    {
+                        var operation = operationFactory.Create(op, precedingTokens);
+
+                        if (operation != null)
+                        {
+                            graphicsStateOperations.Add(operation);
+                        }
                    }

                    precedingTokens.Clear();
--- a/src/UglyToad.PdfPig/Parser/PageFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs
@@ -109,7 +109,7 @@
                    }
                }
                
-                content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
+                content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
            }
            else
            {
@@ -122,7 +122,7 @@

                var bytes = contentStream.Decode(filterProvider);

-                content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
+                content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
            }

            var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, 
@@ -137,16 +137,18 @@
            return page;
        }

-        private PageContent GetContent(IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, bool isLenientParsing)
+        private PageContent GetContent(int pageNumber, IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit,
+            PageRotationDegrees rotation,
+            bool isLenientParsing)
        {
-            var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentBytes));
+            var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes));

            var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, 
                pageContentParser,
                filterProvider, 
                log);

-            return context.Process(operations);
+            return context.Process(pageNumber, operations);
        }

        private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)