lib/thingfish/processor/pdf.rb
author Mahlon E. Smith <mahlon@laika.com>
Wed, 16 Nov 2016 13:22:21 -0800
changeset 1 70dba2d6deb8
parent 0 266fe36d11dd
permissions -rw-r--r--
Added signature for changeset 266fe36d11dd

# -*- ruby -*-
#encoding: utf-8

require 'pdf-reader'

require 'strelka'
require 'thingfish' unless defined?( Thingfish )
require 'thingfish/processor' unless defined?( Thingfish::Processor )


# A Thingfish processor that extracts metadata from uploaded Adobe Portable
# Document Format (PDF) files and attaches it to the upload request.
class Thingfish::Processor::PDF < Thingfish::Processor
	extend Loggability

	# Package version
	VERSION = '0.1.0'

	# Version control revision
	REVISION = %q$Revision$


	# Loggability API -- log to the :thingfish logger
	log_to :thingfish

	# The list of handled types
	handled_types 'application/pdf'


	### Synchronous processor API -- parse the body of the given +request+ as a
	### PDF, extract its metadata, and add that metadata to the +request+.
	def on_request( request )
		reader   = ::PDF::Reader.new( request.body )
		metadata = self.extract_pdf_metadata( reader )

		request.add_metadata( metadata )
	end


	#########
	protected
	#########

	### Normalize metadata from the PDF::Reader +reader+ and return it as a Hash
	### whose keys are prefixed with "pdf:". The PDF version and page count are
	### included when they are non-nil, followed by every entry of the document's
	### Info dictionary whose value is not an unresolved PDF::Reader::Reference.
	def extract_pdf_metadata( reader )
		self.log.debug "Extracting PDF metadata..."

		pdf_metadata = {
			'pdf:version'   => reader.pdf_version,
			'pdf:pagecount' => reader.page_count,
		}.reject {|_,v| v.nil? }

		# The Info dictionary is optional in a PDF, and depending on the
		# pdf-reader version #info may return nil when it is absent. Fall back
		# to an empty Hash so such documents still get version/pagecount
		# metadata instead of raising a NoMethodError.
		info = reader.info || {}

		info.each_pair do |key, val|
			# Skip indirect object references; they are pointers, not values.
			next if val.is_a?( ::PDF::Reader::Reference )
			pdf_metadata[ "pdf:#{key}" ] = val
		end

		self.log.debug "  raw PDF metadata: %p" % [ pdf_metadata ]
		return pdf_metadata
	end

end # class Thingfish::Processor::PDF