PdfPig/src/UglyToad.PdfPig/Parser/Parts/CrossReference/CrossReferenceStreamParser.cs

namespace UglyToad.PdfPig.Parser.Parts.CrossReference
{
    using System.Collections.Generic;
    using Core;
    using Filters;
    using PdfPig.CrossReference;
    using Tokens;
    using Util;

    /// <summary>
    /// Reads the fixed-width binary entries of a cross-reference stream and collects them
    /// into a <see cref="CrossReferenceTablePart"/>.
    /// </summary>
    internal class CrossReferenceStreamParser
    {
        private readonly IFilterProvider filterProvider;

        /// <summary>
        /// Creates a parser which decodes stream data using the supplied <paramref name="filterProvider"/>.
        /// </summary>
        public CrossReferenceStreamParser(IFilterProvider filterProvider)
        {
            this.filterProvider = filterProvider;
        }

        /// <summary>
        /// Decodes the cross-reference stream and records one entry per complete line of decoded data.
        /// </summary>
        /// <param name="streamOffset">Stored as the <c>Offset</c> of the resulting table part.</param>
        /// <param name="fromTableAtOffset">Stored as <c>TiedToPreviousAtOffset</c> of the resulting table part.</param>
        /// <param name="stream">The cross-reference stream; its dictionary supplies the field sizes, Size, Index and Prev entries.</param>
        /// <returns>The table part built from the in-use and compressed entries; free entries are skipped.</returns>
        /// <exception cref="PdfDocumentFormatException">The stream dictionary has no numeric Size entry.</exception>
        public CrossReferenceTablePart Parse(long streamOffset, long? fromTableAtOffset, StreamToken stream)
        {
            var decoded = stream.Decode(filterProvider).Span;

            var fieldSizes = new CrossReferenceStreamFieldSize(stream.StreamDictionary);

            // Integer division: trailing bytes which do not fill a whole line are ignored.
            var lineCount = decoded.Length / fieldSizes.LineLength;

            // -1 marks "no previous cross-reference section".
            long previousOffset = -1;
            if (stream.StreamDictionary.TryGet(NameToken.Prev, out var prevToken) && prevToken is NumericToken prevNumeric)
            {
                previousOffset = prevNumeric.Long;
            }

            var builder = new CrossReferenceTablePartBuilder
            {
                Offset = streamOffset,
                Previous = previousOffset,
                Dictionary = stream.StreamDictionary,
                XRefType = CrossReferenceType.Stream,
                TiedToPreviousAtOffset = fromTableAtOffset
            };

            // Validation happens here; the object numbers themselves are produced lazily so that a
            // dictionary declaring far more objects than the data contains costs nothing extra.
            var objectNumbers = GetObjectNumbers(stream.StreamDictionary);

            var lineNumber = 0;
            var lineBuffer = new byte[fieldSizes.LineLength];
            foreach (var objectNumber in objectNumbers)
            {
                if (lineNumber >= lineCount)
                {
                    // The data ran out before the declared object numbers did.
                    break;
                }

                var byteOffset = lineNumber * fieldSizes.LineLength;
                decoded.Slice(byteOffset, fieldSizes.LineLength).CopyTo(lineBuffer);

                // When the type field has zero width every entry is treated as type 1.
                var type = fieldSizes.Field1Size == 0
                    ? 1L
                    : ReadBigEndian(lineBuffer, 0, fieldSizes.Field1Size);

                ReadNextStreamObject(type, objectNumber, fieldSizes, builder, lineBuffer);

                lineNumber++;
            }

            return builder.Build();
        }

        /// <summary>
        /// Interprets a single decoded line according to its entry <paramref name="type"/> and adds the
        /// result to <paramref name="builder"/>. Types other than 1 and 2 add nothing.
        /// </summary>
        private static void ReadNextStreamObject(long type, long objectNumber, CrossReferenceStreamFieldSize fieldSizes,
            CrossReferenceTablePartBuilder builder, byte[] lineBuffer)
        {
            var secondFieldStart = fieldSizes.Field1Size;
            var thirdFieldStart = fieldSizes.Field1Size + fieldSizes.Field2Size;

            switch (type)
            {
                case 0:
                    // Ignore free objects.
                    break;
                case 1:
                    // Non object stream entries: field 2 is the byte offset, field 3 the generation number.
                    // The offset is accumulated as a long so fields wider than 4 bytes decode correctly.
                    var offset = ReadBigEndian(lineBuffer, secondFieldStart, fieldSizes.Field2Size);
                    var genNum = unchecked((int)ReadBigEndian(lineBuffer, thirdFieldStart, fieldSizes.Field3Size));

                    builder.Add(objectNumber, genNum, offset);

                    break;
                case 2:
                    /*
                     * Object stored in an object stream:
                     * field 2 is the object number of the containing object stream,
                     * field 3 (the index within that object stream) is not read here.
                     *
                     * The object stream number is stored in the normal mapping with a minus
                     * sign in order to distinguish it from a file offset.
                     */
                    var objstmObjNr = ReadBigEndian(lineBuffer, secondFieldStart, fieldSizes.Field2Size);

                    builder.Add(objectNumber, 0, -objstmObjNr);

                    break;
            }
        }

        /// <summary>
        /// Reads <paramref name="length"/> bytes starting at <paramref name="start"/> as an unsigned
        /// big-endian integer. A zero length yields 0; only the low 64 bits are retained.
        /// </summary>
        private static long ReadBigEndian(byte[] buffer, int start, int length)
        {
            long value = 0;
            for (var i = 0; i < length; i++)
            {
                value = (value << 8) | buffer[start + i];
            }

            return value;
        }

        /// <summary>
        /// Gets the object numbers described by the stream dictionary, in the order their entries
        /// appear in the stream data. The Size entry is validated immediately; the numbers are
        /// generated on demand.
        /// </summary>
        /// <exception cref="PdfDocumentFormatException">The dictionary has no numeric Size entry.</exception>
        private static IEnumerable<long> GetObjectNumbers(DictionaryToken dictionary)
        {
            //  The number one greater than the highest object number used in this section or in any section for which this is an update.
            if (!dictionary.TryGet(NameToken.Size, out var sizeToken) || !(sizeToken is NumericToken sizeNumeric))
            {
                throw new PdfDocumentFormatException($"The stream dictionary must contain a numeric size value: {dictionary}.");
            }

            if (dictionary.TryGet(NameToken.Index, out var indexToken) && indexToken is ArrayToken indexArrayToken)
            {
                return EnumerateIndexedObjectNumbers(indexArrayToken);
            }

            // Without an Index entry the section covers object numbers 0 to Size - 1.
            return EnumerateSequentialObjectNumbers(sizeNumeric);
        }

        /// <summary>
        /// Lazily yields the object numbers of every subsection listed in the Index array.
        /// </summary>
        private static IEnumerable<long> EnumerateIndexedObjectNumbers(ArrayToken indexArrayToken)
        {
            // An array containing a pair of integers for each subsection in this section.
            // Pair[0] is the first object number in the subsection; Pair[1] is the number of entries in the subsection.
            // A trailing entry without a partner is ignored rather than read past the end of the array.
            for (var i = 0; i + 1 < indexArrayToken.Length; i += 2)
            {
                // Widened to long so that first + j cannot overflow.
                long firstObjectNumber = indexArrayToken.GetNumeric(i).Int;
                var size = indexArrayToken.GetNumeric(i + 1).Int;

                for (var j = 0; j < size; j++)
                {
                    yield return firstObjectNumber + j;
                }
            }
        }

        /// <summary>
        /// Lazily yields 0, 1, ... up to one less than the value of <paramref name="sizeNumeric"/>.
        /// </summary>
        private static IEnumerable<long> EnumerateSequentialObjectNumbers(NumericToken sizeNumeric)
        {
            var size = sizeNumeric.Int;
            for (var i = 0; i < size; i++)
            {
                yield return i;
            }
        }
    }
}
-												change the project name to something silly

											
										
										
											2018-01-11 03:49:32 +08:00
+								namespace UglyToad.PdfPig.Parser.Parts.CrossReference
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								{
 								    using System.Collections.Generic;
-												complete move of truetype, afm and standard14 fonts

the 3 font types mentioned are moved to the new fonts project, any referenced types are moved to the core project. most truetype classes are made public #8.

											
										
										
											2020-01-05 06:39:13 +08:00
+								    using Core;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								    using Filters;
-												#10 make all token classes public and expose via a public structure member on pdf document

											
										
										
											2018-11-25 03:02:06 +08:00
+								    using PdfPig.CrossReference;
-												#10 move tokens to the root namespace for discoverability. upgrade xunit versions. there is a bug with test discovery for stringtokenizertests

											
										
										
											2018-11-17 04:00:12 +08:00
+								    using Tokens;
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								    using Util;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
 								    internal class CrossReferenceStreamParser
 								    {
 								        private readonly IFilterProvider filterProvider;
 								        public CrossReferenceStreamParser(IFilterProvider filterProvider)
 								        {
 								            this.filterProvider = filterProvider;
 								        }
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        /// <summary>
 								        /// Parses through the unfiltered stream and populates the xrefTable HashMap.
 								        /// </summary>
-												#434 ensure companion stream is added to cross reference on building

											
										
										
											2022-04-03 03:58:22 +08:00
+								        public CrossReferenceTablePart Parse(long streamOffset, long? fromTableAtOffset, StreamToken stream)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        {
-												Spanify filters

											
										
										
											2024-04-02 07:04:54 +08:00
+								            var decoded = stream.Decode(filterProvider).Span;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								            var fieldSizes = new CrossReferenceStreamFieldSize(stream.StreamDictionary);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												Spanify filters

											
										
										
											2024-04-02 07:04:54 +08:00
+								            var lineCount = decoded.Length / fieldSizes.LineLength;
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
 								            long previousOffset = -1;
 								            if (stream.StreamDictionary.TryGet(NameToken.Prev, out var prevToken) && prevToken is NumericToken prevNumeric)
 								            {
 								                previousOffset = prevNumeric.Long;
 								            }
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
 								            var builder = new CrossReferenceTablePartBuilder
 								            {
 								                Offset = streamOffset,
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                Previous = previousOffset,
 								                Dictionary = stream.StreamDictionary,
-												#434 ensure companion stream is added to cross reference on building

											
										
										
											2022-04-03 03:58:22 +08:00
+								                XRefType = CrossReferenceType.Stream,
 								                TiedToPreviousAtOffset = fromTableAtOffset
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            };
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								            var objectNumbers = GetObjectNumbers(stream.StreamDictionary);
 								            var lineNumber = 0;
 								            var lineBuffer = new byte[fieldSizes.LineLength];
 								            foreach (var objectNumber in objectNumbers)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            {
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                if (lineNumber >= lineCount)
 								                {
 								                    break;
 								                }
 								                var byteOffset = lineNumber * fieldSizes.LineLength;
 								                for (var i = 0; i < fieldSizes.LineLength; i++)
 								                {
 								                    lineBuffer[i] = decoded[byteOffset + i];
 								                }
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                int type;
 								                if (fieldSizes.Field1Size == 0)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                {
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                    type = 1;
 								                }
 								                else
 								                {
 								                    type = 0;
 								                    for (var i = 0; i < fieldSizes.Field1Size; i++)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    {
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                        type += (lineBuffer[i] & 0x00ff) << ((fieldSizes.Field1Size - i - 1) * 8);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    }
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                }
 								                ReadNextStreamObject(type, objectNumber, fieldSizes, builder, lineBuffer);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                lineNumber++;
 								            }
 								            return builder.Build();
 								        }
 								        private static void ReadNextStreamObject(int type, long objectNumber, CrossReferenceStreamFieldSize fieldSizes,
 								            CrossReferenceTablePartBuilder builder, byte[] lineBuffer)
 								        {
 								            switch (type)
 								            {
 								                case 0:
 								                    // Ignore free objects.
 								                    break;
 								                case 1:
 								                    // Non object stream entries.
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								                    var offset = 0;
 								                    for (var i = 0; i < fieldSizes.Field2Size; i++)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    {
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                        offset += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    }
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								                    var genNum = 0;
 								                    for (var i = 0; i < fieldSizes.Field3Size; i++)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    {
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                        genNum += (lineBuffer[i + fieldSizes.Field1Size + fieldSizes.Field2Size] & 0x00ff) << ((fieldSizes.Field3Size - i - 1) * 8);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    }
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
 								                    builder.Add(objectNumber, genNum, offset);
 								                    break;
 								                case 2:
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    /*
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                     * object stored in object stream:
 								                     * 2nd argument is object number of object stream
 								                     * 3rd argument is index of object within object stream
 								                     *
 								                     * For sequential PDFParser we do not need this information
 								                     * because
 								                     * These objects are handled by the dereferenceObjects() method
 								                     * since they're only pointing to object numbers
 								                     *
 								                     * However for XRef aware parsers we have to know which objects contain
 								                     * object streams. We will store this information in normal xref mapping
 								                     * table but add object stream number with minus sign in order to
 								                     * distinguish from file offsets
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                     */
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								                    var objstmObjNr = 0;
 								                    for (var i = 0; i < fieldSizes.Field2Size; i++)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    {
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                        objstmObjNr += (lineBuffer[i + fieldSizes.Field1Size] & 0x00ff) << ((fieldSizes.Field2Size - i - 1) * 8);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								                    }
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                    builder.Add(objectNumber, 0, -objstmObjNr);
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                    break;
 								            }
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        }
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								        private static IEnumerable<long> GetObjectNumbers(DictionaryToken dictionary)
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								        {
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								            //  The number one greater than the highest object number used in this section or in any section for which this is an update.
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								            if (!dictionary.TryGet(NameToken.Size, out var sizeToken) || !(sizeToken is NumericToken sizeNumeric))
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            {
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                throw new PdfDocumentFormatException($"The stream dictionary must contain a numeric size value: {dictionary}.");
 								            }
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
 								            var objNums = new List<long>();
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
 								            if (dictionary.TryGet(NameToken.Index, out var indexToken) && indexToken is ArrayToken indexArrayToken)
 								            {
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								                // An array containing a pair of integers for each subsection in this section.
 								                // Pair[0] is the first object number in the subsection; Pair[1] is the number of entries in the subsection.
 								                for (var i = 0; i < indexArrayToken.Length; i += 2)
-												continue migrating code to tokenizer

											
										
										
											2018-01-21 02:42:29 +08:00
+								                {
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								                    var firstObjectNumber = indexArrayToken.GetNumeric(i).Int;
 								                    var size = indexArrayToken.GetNumeric(i + 1).Int;
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								                    for (var j = 0; j < size; j++)
 								                    {
 								                        objNums.Add(firstObjectNumber + j);
 								                    }
 								                }
 								            }
 								            else
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            {
-												fix bug where cross reference stream subsections were skipped

a single cross-reference stream may contain multiple disjoint runs of object numbers, previously we only took the first now we load all objects.

adds indexer to array token for ease-of-use.

adds page number and bounds information to all form fields.

											
										
										
											2019-10-10 23:05:21 +08:00
+								                for (var i = 0; i < sizeNumeric.Int; i++)
 								                {
 								                    objNums.Add(i);
 								                }
-												Move the code from the forked version to clean repository

											
										
										
											2017-11-10 03:14:09 +08:00
+								            }
 								            return objNums;
 								        }
 								    }
 								}