lib/thingfish/processor/pdf.rb
changeset 0 266fe36d11dd
equal deleted inserted replaced
-1:000000000000 0:266fe36d11dd
       
     1 # -*- ruby -*-
       
     2 #encoding: utf-8
       
     3 
       
     4 require 'pdf-reader'
       
     5 
       
     6 require 'strelka'
       
     7 require 'thingfish' unless defined?( Thingfish )
       
     8 require 'thingfish/processor' unless defined?( Thingfish::Processor )
       
     9 
       
    10 
       
    11 # Attach PDF metadata to the Adobe Portable Document Format.
       
    12 class Thingfish::Processor::PDF < Thingfish::Processor
       
    13 	extend Loggability
       
    14 
       
    15 	# Package version
       
    16 	VERSION = '0.1.0'
       
    17 
       
    18 	# Version control revision
       
    19 	REVISION = %q$Revision$
       
    20 
       
    21 
       
    22 	# Loggability API -- log to the :thingfish logger
       
    23 	log_to :thingfish
       
    24 
       
    25 	# The list of handled types
       
    26 	handled_types 'application/pdf'
       
    27 
       
    28 
       
    29 	### Synchronous processor API -- extract metadata from uploaded PDFs
       
    30 	###
       
    31 	def on_request( request )
       
    32 		reader   = ::PDF::Reader.new( request.body )
       
    33 		metadata = self.extract_pdf_metadata( reader )
       
    34 
       
    35 		request.add_metadata( metadata )
       
    36 	end
       
    37 
       
    38 
       
    39 	#########
       
    40 	protected
       
    41 	#########
       
    42 
       
    43 	### Normalize metadata from the PDFReader object and return it as a hash.
       
    44 	###
       
    45 	def extract_pdf_metadata( reader )
       
    46 		self.log.debug "Extracting PDF metadata..."
       
    47 
       
    48 		pdf_metadata = {
       
    49 			'pdf:version'   => reader.pdf_version,
       
    50 			'pdf:pagecount' => reader.page_count,
       
    51 		}.reject {|_,v| v.nil? }
       
    52 
       
    53 		reader.info.each_pair do |key, val|
       
    54 			pdf_metadata[ "pdf:#{key}" ] = val unless val.is_a?( ::PDF::Reader::Reference )
       
    55 		end
       
    56 
       
    57 		self.log.debug "  raw PDF metadata: %p" % [ pdf_metadata ]
       
    58 		return pdf_metadata
       
    59 	end
       
    60 
       
    61 end # class Thingfish::Processor::PDF
       
    62