- Fix the invalid stream Length issue (which caused stream data not to be fully read).

- Improve stream read performance by simplifying TryReadStream(): avoid the use of MemoryStream by taking advantage of the already existing memory span of "inputBytes".
2025-08-20 09:37:44 +08:00 · 2024-05-21 13:52:07 +02:00 · 2024-05-21 13:52:07 +02:00 · bb5a757e8c
commit bb5a757e8c
parent d86c2f44f0
1 changed files with 102 additions and 119 deletions
--- a/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
+++ b/src/UglyToad.PdfPig/Tokenization/Scanner/PdfTokenScanner.cs
@ -1,4 +1,7 @@
-namespace UglyToad.PdfPig.Tokenization.Scanner
+//TODO: https://blog.cerbero.io/cve-2010-0188-pdfformtiff/
+// https://beta-v1.malva.re/file/f1e966769e544b4b67f203ca51909b9d/report
+
+namespace UglyToad.PdfPig.Tokenization.Scanner
 {
    using System;
    using System.Collections.Generic;
@ -7,6 +10,7 @@
    using System.Globalization;
    using System.IO;
    using System.Linq;
+    using System.Text;
    using System.Text.RegularExpressions;
    using Core;
    using Encryption;
@ -320,7 +324,7 @@
            int endStreamPosition = 0;
            int commonPartPosition = 0;

-            const string commonPart = "end";
+            const string endWordPart = "end";
            const string streamPart = "stream";
            const string objPart = "obj";

@ -330,151 +334,130 @@
                return true;
            }

-            // Track any 'endobj' or 'endstream' operators we see.
-            var observedEndLocations = new List<PossibleStreamEndLocation>();
+            long streamDataStart = inputBytes.CurrentOffset;

-            // Begin reading the stream.
-            using (var memoryStream = new MemoryStream())
-            using (var binaryWrite = new BinaryWriter(memoryStream))
+            PossibleStreamEndLocation? possibleEndLocation = null;
+
+            while (inputBytes.MoveNext())
            {
-                while (inputBytes.MoveNext())
+                if (length.HasValue && read == length)
                {
-                    if (length.HasValue && read == length)
-                    {
-                        // TODO: read ahead and check we're at the end...
-                        // break;
-                    }
+                    // TODO: read ahead and check we're at the end...
+                    // break;
+                }

-                    // We are reading 'end' (possibly).
-                    if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
+                // We are reading 'end' (possibly).
+                if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
+                {
+                    commonPartPosition++;
+                }
+                else if (commonPartPosition == endWordPart.Length)
+                {
+                    // We are reading 'stream' after 'end'
+                    if (inputBytes.CurrentByte == streamPart[endStreamPosition])
                    {
-                        commonPartPosition++;
-                    }
-                    else if (commonPartPosition == commonPart.Length)
-                    {
-                        // We are reading 'stream' after 'end'
-                        if (inputBytes.CurrentByte == streamPart[endStreamPosition])
+                        endObjPosition = 0;
+                        endStreamPosition++;
+
+                        // We've finished reading 'endstream', add it to the end tokens we've seen.
+                        if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
                        {
-                            endObjPosition = 0;
-                            endStreamPosition++;
+                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);

-                            // We've finished reading 'endstream', add it to the end tokens we've seen.
-                            if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
+                            possibleEndLocation = token;
+                            //observedEndLocations.Add(token);
+
+                            if (length.HasValue && read > length)
                            {
-                                var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
-
-                                observedEndLocations.Add(token);
-
-                                if (length.HasValue && read > length)
-                                {
-                                    break;
-                                }
-
-                                endStreamPosition = 0;
+                                break;
                            }
-                        }
-                        else if (inputBytes.CurrentByte == objPart[endObjPosition])
-                        {
-                            // We are reading 'obj' after 'end'

                            endStreamPosition = 0;
-                            endObjPosition++;
-
-                            // We have finished reading 'endobj'.
-                            if (endObjPosition == objPart.Length)
-                            {
-                                // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
-                                if (observedEndLocations.Count > 0)
-                                {
-                                    var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];
-
-                                    inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
-
-                                    break;
-                                }
-
-                                var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
-                                observedEndLocations.Add(token);
-
-                                if (read > length)
-                                {
-                                    break;
-                                }
-                            }
                        }
-                        else
-                        {
-                            // We were reading 'end' but then we had a character mismatch.
-                            // Reset all the counters.
+                    }
+                    else if (inputBytes.CurrentByte == objPart[endObjPosition])
+                    {
+                        // We are reading 'obj' after 'end'

-                            endStreamPosition = 0;
-                            endObjPosition = 0;
-                            commonPartPosition = 0;
+                        endStreamPosition = 0;
+                        endObjPosition++;
+
+                        // We have finished reading 'endobj'.
+                        if (endObjPosition == objPart.Length)
+                        {
+                            // If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
+                            if (possibleEndLocation != null)
+                            {
+                                var lastEndToken = possibleEndLocation.Value; //observedEndLocations[observedEndLocations.Count - 1];
+
+                                inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
+                                //streamDataEnd = lastEndToken.Offset + lastEndToken.Type.Data.Length + 1;
+
+                                break;
+                            }
+
+                            var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
+                            possibleEndLocation = token;
+
+                            if (read > length)
+                            {
+                                break;
+                            }
                        }
                    }
                    else
                    {
-                        // For safety reset every counter in case we had a partial read.
+                        // We were reading 'end' but then we had a character mismatch.
+                        // Reset all the counters.

                        endStreamPosition = 0;
                        endObjPosition = 0;
-                        commonPartPosition = (inputBytes.CurrentByte == commonPart[0]) ? 1 : 0;
+                        commonPartPosition = 0;
                    }
-
-                    binaryWrite.Write(inputBytes.CurrentByte);
-
-                    read++;
-                }
-
-                binaryWrite.Flush();
-
-                if (observedEndLocations.Count == 0)
-                {
-                    return false;
-                }
-
-                memoryStream.Seek(0, SeekOrigin.Begin);
-                if (length.HasValue && memoryStream.Length >= length)
-                {
-                    // Use the declared length to copy just the data we want.
-                    byte[] data = new byte[length.Value];
-
-                    memoryStream.Read(data, 0, (int)length.Value);
-
-                    stream = new StreamToken(streamDictionaryToken, data);
                }
                else
                {
-                    // Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
-                    var lastEnd = observedEndLocations[observedEndLocations.Count - 1];
+                    // For safety reset every counter in case we had a partial read.

-                    var dataLength = lastEnd.Offset - startDataOffset;
-
-                    var current = inputBytes.CurrentOffset;
-
-                    // 3 characters, 'e', '\n' and possibly '\r'
-                    inputBytes.Seek(lastEnd.Offset - 3);
-                    inputBytes.MoveNext();
-
-                    if (inputBytes.CurrentByte == '\r')
-                    {
-                        dataLength -= 3;
-                    }
-                    else
-                    {
-                        dataLength -= 2;
-                    }
-
-                    inputBytes.Seek(current);
-
-                    byte[] data = new byte[dataLength];
-
-                    memoryStream.Read(data, 0, (int)dataLength);
-
-                    stream = new StreamToken(streamDictionaryToken, data);
+                    endStreamPosition = 0;
+                    endObjPosition = 0;
+                    commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
                }
+
+                read++;
            }

+            long streamDataEnd = inputBytes.CurrentOffset + 1;
+
+            if (possibleEndLocation == null)
+                return false;
+
+            var lastEnd = possibleEndLocation;
+
+            var dataLength = lastEnd.Value.Offset - startDataOffset;
+
+            // 3 characters, 'e', '\n' and possibly '\r'
+            inputBytes.Seek(lastEnd.Value.Offset - 3);
+            inputBytes.MoveNext();
+
+            if (inputBytes.CurrentByte == '\r')
+            {
+                dataLength -= 3;
+            }
+            else
+            {
+                dataLength -= 2;
+            }
+
+            Span<byte> data = new byte[dataLength];
+
+            inputBytes.Seek(streamDataStart);
+            inputBytes.Read(data);
+
+            inputBytes.Seek(streamDataEnd);
+
+            stream = new StreamToken(streamDictionaryToken, data.ToArray());
+
            return true;
        }