I'm using iTextSharp to extract images from PDF files.
I used this code as the basis: link
Here is my modified version to support files in memory instead of having to work with files on disk:
/// <summary>Helper class to extract images from a PDF file held in memory. Works with the most
/// common image types embedded in PDF files, as far as I can tell.</summary>
/// <example>
/// Usage example:
/// <code>
/// foreach (var filename in Directory.GetFiles(searchPath, "*.pdf", SearchOption.TopDirectoryOnly))
/// {
///     var images = PdfImageExtractor.ExtractImages(File.ReadAllBytes(filename));
///     var directory = Path.GetDirectoryName(filename);
///     var baseName = Path.GetFileNameWithoutExtension(filename);
///
///     for (var i = 0; i &lt; images.Count; i++)
///     {
///         var name = string.Format("{0}_{1}.png", baseName, i + 1);
///         images[i].Save(Path.Combine(directory, name), System.Drawing.Imaging.ImageFormat.Png);
///     }
/// }
/// </code></example>
public static class PdfImageExtractor
{
    #region Methods
    #region Public Methods
    /// <summary>Checks whether a specified page of a PDF file contains images.</summary>
    /// <param name="pdfFile">The complete PDF document as a byte array.</param>
    /// <param name="pageNumber">The page to inspect. Pages are numbered from 1, the same
    /// numbering the all-pages overload of ExtractImages iterates over.</param>
    /// <returns>True if the page contains at least one image; false otherwise.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfFile"/> is null.</exception>
    public static bool PageContainsImages(byte[] pdfFile, int pageNumber)
    {
        if (pdfFile == null)
        {
            throw new ArgumentNullException("pdfFile");
        }

        using (var reader = new PdfReader(pdfFile))
        {
            var parser = new PdfReaderContentParser(reader);
            return CollectPageImages(parser, pageNumber).Count > 0;
        }
    }

    /// <summary>Extracts all images (of types that iTextSharp knows how to decode) from a PDF file.</summary>
    /// <param name="pdfFile">The complete PDF document as a byte array.</param>
    /// <returns>The images of every page, ordered by page number and, within a page, in the
    /// order the render listener collected them. Empty when the document has no images.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfFile"/> is null.</exception>
    public static List<System.Drawing.Image> ExtractImages(byte[] pdfFile)
    {
        if (pdfFile == null)
        {
            throw new ArgumentNullException("pdfFile");
        }

        var images = new List<System.Drawing.Image>();
        using (var reader = new PdfReader(pdfFile))
        {
            var parser = new PdfReaderContentParser(reader);
            for (var page = 1; page <= reader.NumberOfPages; page++)
            {
                images.AddRange(ExtractPageImages(parser, page));
            }
        }

        return images;
    }

    /// <summary>Extracts all images (of types that iTextSharp knows how to decode)
    /// from a specified page of a PDF file.</summary>
    /// <param name="pdfFile">The complete PDF document as a byte array.</param>
    /// <param name="pageNumber">The page to extract from, numbered from 1.</param>
    /// <returns>A list of the images found on the page, in the order the render listener
    /// collected them. Empty when the page has no images.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfFile"/> is null.</exception>
    public static List<System.Drawing.Image> ExtractImages(byte[] pdfFile, int pageNumber)
    {
        if (pdfFile == null)
        {
            throw new ArgumentNullException("pdfFile");
        }

        using (var reader = new PdfReader(pdfFile))
        {
            var parser = new PdfReaderContentParser(reader);
            return ExtractPageImages(parser, pageNumber);
        }
    }
    #endregion Public Methods
    #region Private Methods
    /// <summary>Runs a fresh <see cref="ImageRenderListener"/> over one page and returns
    /// the images it collected, without any console output.</summary>
    private static List<System.Drawing.Image> CollectPageImages(PdfReaderContentParser parser, int pageNumber)
    {
        // A new listener per page keeps each page's images in their own list.
        var listener = new ImageRenderListener();
        parser.ProcessContent(pageNumber, listener);
        return listener.Images;
    }

    /// <summary>Collects the images of one page and reports the count on the console
    /// when the page contains at least one image.</summary>
    private static List<System.Drawing.Image> ExtractPageImages(PdfReaderContentParser parser, int pageNumber)
    {
        var pageImages = CollectPageImages(parser, pageNumber);
        if (pageImages.Count > 0)
        {
            Console.WriteLine("Found {0} images on page {1}.", pageImages.Count, pageNumber);
        }

        return pageImages;
    }
    #endregion Private Methods
    #endregion Methods
}
/// <summary>Render listener that decodes every image it is notified about into a
/// <see cref="System.Drawing.Image"/> and collects the results. Text events are ignored.</summary>
internal class ImageRenderListener : IRenderListener
{
    #region Fields
    private readonly List<System.Drawing.Image> images = new List<System.Drawing.Image>();
    #endregion Fields
    #region Properties
    /// <summary>Gets the images decoded so far, in the order they were reported
    /// to <see cref="RenderImage"/>.</summary>
    public List<System.Drawing.Image> Images
    {
        get { return images; }
    }
    #endregion Properties
    #region Methods
    #region Public Methods
    /// <summary>Ignored; this listener only handles images.</summary>
    public void BeginTextBlock() { }

    /// <summary>Ignored; this listener only handles images.</summary>
    public void EndTextBlock() { }

    /// <summary>Decodes the reported image and appends it to <see cref="Images"/>.</summary>
    /// <param name="renderInfo">Render information for the image being reported.</param>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        PdfImageObject image = renderInfo.GetImage();
        var imageBytes = image == null ? null : image.GetImageAsBytes();
        if (imageBytes == null)
        {
            // NOTE(review): presumably a null result means iTextSharp could not decode this
            // image type; it is skipped instead of failing the whole page. Confirm.
            return;
        }

        // The stream is intentionally NOT disposed here: GDI+ requires the source stream
        // to stay open for the lifetime of an Image created with Image.FromStream, and the
        // image outlives this method because it is stored in Images. A MemoryStream holds
        // only managed memory, so it is reclaimed together with the image.
        var memoryStream = new MemoryStream(imageBytes);
        var drawingImage = System.Drawing.Image.FromStream(memoryStream);

        // Resolution as reported by GDI+ for the decoded bytes; kept as a local so it can be
        // inspected while debugging.
        // NOTE(review): presumably the bytes carry no resolution metadata when a fixed default
        // is reported here; the size at which the image is placed on the page would have to
        // come from renderInfo's transformation matrix. Confirm against the iTextSharp API.
        var dpiX = drawingImage.HorizontalResolution;

        this.Images.Add(drawingImage);
    }

    /// <summary>Ignored; this listener only handles images.</summary>
    public void RenderText(TextRenderInfo renderInfo) { }
    #endregion Public Methods
    #endregion Methods
}
The problem is that in the RenderImage method, the value of the dpiX variable is always 96. I would like to get the original DPI resolution of the image; is there any way to do this?
To use the method, a call like this suffices:
// Extract the images of a single page (pageNumber) from the PDF bytes held in fileBytes.
var pageImages = PdfImageExtractor.ExtractImages(fileBytes, pageNumber);
I also tried converting each image to a Bitmap, but I got the same result:
// Copy the extracted image into a new Bitmap and read back the resolution it reports.
// NOTE(review): pageImage is presumably one element of pageImages — confirm.
var bitmapImage = new Bitmap(pageImage);
var dpiX = bitmapImage.HorizontalResolution;
var dpiY = bitmapImage.VerticalResolution;
I do not want to set a resolution on my own; I'd like to get the original resolution of the image.