From 4976fa102776c9c0cef743310064689c0e1ce088 Mon Sep 17 00:00:00 2001
From: Eliot Jones <elioty@hotmail.co.uk>
Date: Wed, 8 Jan 2020 12:17:30 +0000
Subject: [PATCH] handle incorrect end image detected

Since an inline image's data stream may contain the characters 'EI' as a result of compression, it's possible to read an end image (EI) operator mid-data. This results in the next operator also being an end image operator and leaves the content stream in an invalid state. To recover when we detect this situation, we remove the previous end image operation, read the bytes that were missed between the false end image operator and the current one, and add a replacement end image operation containing the full image data to the list of operations.
---
 .../Parser/PageContentParserTests.cs          | 12 ++--
 .../Graphics/ContentStreamProcessor.cs        | 11 ++--
 .../Graphics/IOperationContext.cs             |  1 -
 .../Graphics/InlineImageBuilder.cs            |  1 -
 .../Parser/IPageContentParser.cs              |  2 +-
 .../Parser/PageContentParser.cs               | 61 +++++++++++++++++--
 src/UglyToad.PdfPig/Parser/PageFactory.cs     | 12 ++--
 7 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
index dd446e96..c4f4af00 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/PageContentParserTests.cs
@@ -27,7 +27,7 @@
             var content = File.ReadAllText(path);
             var input = StringBytesTestConverter.Convert(content, false);
 
-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);
 
             Assert.NotEmpty(result);
         }
@@ -39,7 +39,7 @@
             var content = File.ReadAllText(path);
             var input = StringBytesTestConverter.Convert(content, false);
 
-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);
 
             var replacementRegex = new Regex(@"\s(\.\d+)\b");
 
@@ -72,7 +72,7 @@
 ET";
             var input = StringBytesTestConverter.Convert(s, false);
 
-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);
 
             using (var stream = new MemoryStream())
             {
@@ -102,7 +102,7 @@ ET";
 ET";
             var input = StringBytesTestConverter.Convert(s, false);
 
-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);
 
             Assert.Equal(7, result.Count);
 
@@ -138,7 +138,7 @@ ET";
 
             var input = StringBytesTestConverter.Convert(s, false);
 
-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);
 
             Assert.Equal(4, result.Count);
 
@@ -163,7 +163,7 @@ cm BT 0.0001 Tc 19 0 0 19 0 0 Tm /Tc1 1 Tf (   \(sleep 1; printf ""QUIT\\r\\n""\
 
             var input = StringBytesTestConverter.Convert(s, false);
 
-            var result = parser.Parse(input.Bytes);
+            var result = parser.Parse(1, input.Bytes);
 
             Assert.Equal(9, result.Count);
 
diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
index 6b7302ca..8ccf7ec9 100644
--- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
+++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
@@ -4,14 +4,10 @@
     using System.Collections.Generic;
     using System.Diagnostics;
     using System.Linq;
-    using Colors;
     using Content;
     using Core;
-    using Exceptions;
     using Filters;
-    using Fonts;
     using Geometry;
-    using IO;
     using Logging;
     using Operations;
     using Parser;
@@ -19,7 +15,6 @@
     using PdfPig.Core;
     using Tokenization.Scanner;
     using Tokens;
-    using Util;
     using XObjects;
 
     internal class ContentStreamProcessor : IOperationContext
@@ -52,6 +47,7 @@
         private IFont activeExtendedGraphicsStateFont;
         private InlineImageBuilder inlineImageBuilder;
         private bool currentPathAdded;
+        private int pageNumber;
 
         /// <summary>
         /// A counter to track individual calls to <see cref="ShowText"/> operations used to determine if letters are likely to be
@@ -97,8 +93,9 @@
             ColorSpaceContext = new ColorSpaceContext(GetCurrentState, resourceStore);
         }
 
-        public PageContent Process(IReadOnlyList<IGraphicsStateOperation> operations)
+        public PageContent Process(int pageNumberCurrent, IReadOnlyList<IGraphicsStateOperation> operations)
         {
+            pageNumber = pageNumberCurrent;
             CloneAllStates();
 
             ProcessOperations(operations);
@@ -364,7 +361,7 @@
 
             var contentStream = formStream.Decode(filterProvider);
 
-            var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentStream));
+            var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream));
 
             // 3. We don't respect clipping currently.
 
diff --git a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs
index e278b0a6..4ad56ba9 100644
--- a/src/UglyToad.PdfPig/Graphics/IOperationContext.cs
+++ b/src/UglyToad.PdfPig/Graphics/IOperationContext.cs
@@ -1,7 +1,6 @@
 ﻿namespace UglyToad.PdfPig.Graphics
 {
     using System.Collections.Generic;
-    using Geometry;
     using Tokens;
     using PdfPig.Core;
     using Util.JetBrains.Annotations;
diff --git a/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs b/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs
index 60dd51e2..f6db3f22 100644
--- a/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs
+++ b/src/UglyToad.PdfPig/Graphics/InlineImageBuilder.cs
@@ -6,7 +6,6 @@
     using Colors;
     using Content;
     using Core;
-    using Exceptions;
     using Filters;
     using PdfPig.Core;
     using Tokenization.Scanner;
diff --git a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
index 92fc7336..3f011f7f 100644
--- a/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/IPageContentParser.cs
@@ -6,6 +6,6 @@
 
     internal interface IPageContentParser
     {
-        IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes);
+        IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes);
     }
 }
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Parser/PageContentParser.cs b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
index 45bfbf4f..c18cc2ff 100644
--- a/src/UglyToad.PdfPig/Parser/PageContentParser.cs
+++ b/src/UglyToad.PdfPig/Parser/PageContentParser.cs
@@ -1,6 +1,8 @@
 ﻿namespace UglyToad.PdfPig.Parser
 {
+    using System;
     using System.Collections.Generic;
+    using System.Linq;
     using Core;
     using Graphics;
     using Graphics.Operations;
@@ -17,13 +19,15 @@
             this.operationFactory = operationFactory;
         }
 
-        public IReadOnlyList<IGraphicsStateOperation> Parse(IInputBytes inputBytes)
+        public IReadOnlyList<IGraphicsStateOperation> Parse(int pageNumber, IInputBytes inputBytes)
         {
             var scanner = new CoreTokenScanner(inputBytes);
 
             var precedingTokens = new List<IToken>();
             var graphicsStateOperations = new List<IGraphicsStateOperation>();
 
+            var lastEndImageOffset = new long?();
+
             while (scanner.MoveNext())
             {
                 var token = scanner.CurrentToken;
@@ -47,15 +51,62 @@
 
                     graphicsStateOperations.Add(new BeginInlineImageData(dictionary));
                     graphicsStateOperations.Add(new EndInlineImage(inlineImageData.Data));
+
+                    lastEndImageOffset = scanner.CurrentPosition - 2;
+
                     precedingTokens.Clear();
                 }
                 else if (token is OperatorToken op)
                 {
-                    var operation = operationFactory.Create(op, precedingTokens);
-
-                    if (operation != null)
+                    // Handle an end image where the stream of image data contained EI but was not actually a real end image operator.
+                    if (op.Data == "EI")
                     {
-                        graphicsStateOperations.Add(operation);
+                        // Check an end image operation was the last thing that happened.
+                        IGraphicsStateOperation lastOperation = graphicsStateOperations.Count > 0
+                            ? graphicsStateOperations[graphicsStateOperations.Count - 1]
+                            : null;
+
+                        if (lastEndImageOffset == null || lastOperation == null || !(lastOperation is EndInlineImage lastEndImage))
+                        {
+                            throw new PdfDocumentFormatException("Encountered End Image token outside an inline image on " +
+                                                                 $"page {pageNumber} at offset in content: {scanner.CurrentPosition}.");
+                        }
+
+                        // Work out how much data we missed between the false EI operator and the actual one.
+                        var actualEndImageOffset = scanner.CurrentPosition - 3;
+
+                        var gap = (int)(actualEndImageOffset - lastEndImageOffset);
+
+                        var from = inputBytes.CurrentOffset;
+                        inputBytes.Seek(lastEndImageOffset.Value);
+
+                        // Recover the full image data.
+                        {
+                            var missingData = new byte[gap];
+                            var read = inputBytes.Read(missingData);
+                            if (read != gap)
+                            {
+                                throw new InvalidOperationException($"Failed to read expected buffer length {gap} on page {pageNumber} " +
+                                                                    $"when reading inline image at offset in content: {lastEndImageOffset.Value}.");
+                            }
+                            
+                            // Replace the last end image operator with one containing the full set of data.
+                            graphicsStateOperations.Remove(lastEndImage);
+                            graphicsStateOperations.Add(new EndInlineImage(lastEndImage.ImageData.Concat(missingData).ToArray()));
+                        }
+
+                        lastEndImageOffset = actualEndImageOffset;
+
+                        inputBytes.Seek(from);
+                    }
+                    else
+                    {
+                        var operation = operationFactory.Create(op, precedingTokens);
+
+                        if (operation != null)
+                        {
+                            graphicsStateOperations.Add(operation);
+                        }
                     }
 
                     precedingTokens.Clear();
diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs
index 0637c1be..ea5ea9d0 100644
--- a/src/UglyToad.PdfPig/Parser/PageFactory.cs
+++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs
@@ -109,7 +109,7 @@
                     }
                 }
                 
-                content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
+                content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
             }
             else
             {
@@ -122,7 +122,7 @@
 
                 var bytes = contentStream.Decode(filterProvider);
 
-                content = GetContent(bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
+                content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, isLenientParsing);
             }
 
             var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, 
@@ -137,16 +137,18 @@
             return page;
         }
 
-        private PageContent GetContent(IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit, PageRotationDegrees rotation, bool isLenientParsing)
+        private PageContent GetContent(int pageNumber, IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit,
+            PageRotationDegrees rotation,
+            bool isLenientParsing)
         {
-            var operations = pageContentParser.Parse(new ByteArrayInputBytes(contentBytes));
+            var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes));
 
             var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, isLenientParsing, pdfScanner, 
                 pageContentParser,
                 filterProvider, 
                 log);
 
-            return context.Process(operations);
+            return context.Process(pageNumber, operations);
         }
 
         private static UserSpaceUnit GetUserSpaceUnits(DictionaryToken dictionary)