PdfPig/src/UglyToad.PdfPig/Parser/FileStructure/FileHeaderParser.cs
Eliot Jones d98b8b43c1 small performance tweaks and remove package license expression
package license url is deprecated in favour of package license expression but nuget doesn't seem to support expressions properly for published packages yet so we'll keep the deprecated url for the time being. having both url and expression causes the build to fail.

Small, obvious performance improvements for file header parsing and for getting the encoding information using the existing reverse name-to-code map.
2019-08-18 13:47:01 +01:00

97 lines
3.4 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Globalization;
using Content;
using Exceptions;
using Logging;
using Tokenization.Scanner;
using Tokens;
using Util.JetBrains.Annotations;
/// <summary>
/// Used to retrieve the version header from the PDF file.
/// </summary>
/// <remarks>
/// The first line of a PDF file should be a header consisting of the 5 characters %PDF- followed by a version number of the form 1.N, where N is a digit between 0 and 7.
/// A conforming reader should accept files with any of the following headers:
/// %PDF-1.0
/// %PDF-1.1
/// %PDF-1.2
/// %PDF-1.3
/// %PDF-1.4
/// %PDF-1.5
/// %PDF-1.6
/// %PDF-1.7
/// This parser allows versions up to 1.9.
/// For versions equal to or greater than PDF 1.4, the optional Version entry in the document's catalog dictionary should be used instead of the header version.
/// </remarks>
internal static class FileHeaderParser
{
    /// <summary>
    /// Reads the version header comment (for example <c>PDF-1.7</c> or <c>FDF-1.2</c>) from the start of the document.
    /// </summary>
    /// <param name="scanner">
    /// The token scanner to read from; must not be null. It is repositioned with <c>Seek(0)</c> only when a
    /// well-formed header was parsed; on the lenient fallback path it is left where scanning stopped.
    /// </param>
    /// <param name="isLenientParsing">
    /// When true, up to two non-comment tokens preceding the header are skipped and a malformed header
    /// yields a default version of 1.4 instead of an exception.
    /// </param>
    /// <param name="log">Receives a warning when the lenient default is used; may be null.</param>
    /// <returns>The parsed header version, or the 1.4 default when lenient parsing recovers from a bad header.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="scanner"/> is null.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// No token could be read, no comment token was found within the allowed number of tokens,
    /// or the header was malformed and <paramref name="isLenientParsing"/> is false.
    /// </exception>
    [NotNull]
    public static HeaderVersion Parse([NotNull]ISeekableTokenScanner scanner, bool isLenientParsing, ILog log)
    {
        const string headerNotFoundMessage = "Could not find the version header comment at the start of the document.";

        // Length of the "PDF-" / "FDF-" prefix; the numeric version starts right after it.
        const int versionStartIndex = 4;

        if (scanner == null)
        {
            throw new ArgumentNullException(nameof(scanner));
        }

        // Read the first token.
        if (!scanner.MoveNext())
        {
            throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
        }

        var comment = scanner.CurrentToken as CommentToken;

        // Lenient parsing tolerates up to two junk tokens before the header comment; strict parsing tolerates none.
        var junkSkip = isLenientParsing ? 2 : 0;
        var attempts = 0;

        while (comment == null)
        {
            if (attempts == junkSkip || !scanner.MoveNext())
            {
                throw new PdfDocumentFormatException(headerNotFoundMessage);
            }

            comment = scanner.CurrentToken as CommentToken;
            attempts++;
        }

        // The comment text is expected to begin with the marker itself
        // (presumably the token data excludes the leading '%' - confirm against CommentToken).
        var hasVersionPrefix = comment.Data.StartsWith("PDF-1.", StringComparison.OrdinalIgnoreCase)
                               || comment.Data.StartsWith("FDF-1.", StringComparison.OrdinalIgnoreCase);

        if (!hasVersionPrefix)
        {
            return HandleMissingVersion(comment, isLenientParsing, log);
        }

        // The header always uses '.' as the decimal separator, so parse with the invariant culture.
        // Parsing with the current culture misreads "1.4" (e.g. as 14) where '.' is a group separator.
        if (!decimal.TryParse(comment.Data.Substring(versionStartIndex),
            NumberStyles.Number,
            CultureInfo.InvariantCulture,
            out var version))
        {
            return HandleMissingVersion(comment, isLenientParsing, log);
        }

        // Rewind so that subsequent readers start from the beginning of the document.
        scanner.Seek(0);

        return new HeaderVersion(version, comment.Data);
    }

    /// <summary>
    /// Handles a header comment which does not contain a usable version:
    /// returns a default of 1.4 when lenient, otherwise throws.
    /// </summary>
    /// <param name="comment">The comment token which was expected to contain the version.</param>
    /// <param name="isLenientParsing">Whether to recover with the default version rather than throw.</param>
    /// <param name="log">Receives the warning describing the fallback; may be null.</param>
    /// <returns>A header version of 1.4 when <paramref name="isLenientParsing"/> is true.</returns>
    /// <exception cref="PdfDocumentFormatException"><paramref name="isLenientParsing"/> is false.</exception>
    private static HeaderVersion HandleMissingVersion(CommentToken comment, bool isLenientParsing, ILog log)
    {
        if (!isLenientParsing)
        {
            throw new PdfDocumentFormatException($"The comment which should have provided the version was in the wrong format: {comment.Data}.");
        }

        // The log is optional; a missing logger must not turn a recoverable header into a crash.
        log?.Warn($"Did not find a version header of the correct format, defaulting to 1.4 since lenient. Header was: {comment.Data}.");

        return new HeaderVersion(1.4m, "PDF-1.4");
    }
}
}