lib/thingfish/processor/pdf.rb
author Mahlon E. Smith <mahlon@laika.com>
Wed, 16 Nov 2016 13:22:21 -0800
changeset 1 70dba2d6deb8
parent 0 266fe36d11dd
permissions -rw-r--r--
Added signature for changeset 266fe36d11dd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     1
# -*- ruby -*-
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     2
#encoding: utf-8
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     3
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     4
require 'pdf-reader'
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     5
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     6
require 'strelka'
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     7
require 'thingfish' unless defined?( Thingfish )
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     8
require 'thingfish/processor' unless defined?( Thingfish::Processor )
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
     9
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    10
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    11
# Extract metadata from Adobe Portable Document Format (PDF) uploads and attach it to the request.
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    12
class Thingfish::Processor::PDF < Thingfish::Processor
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    13
	extend Loggability
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    14
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    15
	# Package version
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    16
	VERSION = '0.1.0'
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    17
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    18
	# Version control revision
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    19
	REVISION = %q$Revision$
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    20
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    21
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    22
	# Loggability API -- log to the :thingfish logger
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    23
	log_to :thingfish
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    24
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    25
	# The list of handled types
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    26
	handled_types 'application/pdf'
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    27
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    28
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    29
	### Synchronous processor API -- extract metadata from uploaded PDFs
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    30
	###
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    31
	def on_request( request )
		# Parse the request body as a PDF, then hand the extracted metadata
		# hash to the request. The value of #add_metadata is returned.
		pdf = ::PDF::Reader.new( request.body )
		request.add_metadata( self.extract_pdf_metadata(pdf) )
	end
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    37
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    38
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    39
	#########
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    40
	protected
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    41
	#########
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    42
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    43
	### Normalize metadata from the PDFReader object and return it as a hash.
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    44
	###
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    45
	def extract_pdf_metadata( reader )
		# Builds a flat Hash of "pdf:"-prefixed keys from +reader+, which must
		# respond to #pdf_version, #page_count and #info.
		#
		# Returns the Hash. A nil version or page count is left out of it;
		# entries from #info are copied as-is, except Reference values.
		self.log.debug "Extracting PDF metadata..."

		pdf_metadata = {
			'pdf:version'   => reader.pdf_version,
			'pdf:pagecount' => reader.page_count,
		}.reject {|_,v| v.nil? }

		# #info can come back nil instead of a Hash, which used to raise
		# NoMethodError here and lose the version/pagecount as well.
		# NOTE(review): presumably this happens for documents without an Info
		# dictionary on older pdf-reader releases -- confirm against the
		# version in use.
		info = reader.info || {}

		info.each_pair do |key, val|
			# Reference objects are skipped rather than stored.
			next if val.is_a?( ::PDF::Reader::Reference )
			pdf_metadata[ "pdf:#{key}" ] = val
		end

		self.log.debug "  raw PDF metadata: %p" % [ pdf_metadata ]
		return pdf_metadata
	end
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    60
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    61
end # class Thingfish::Processor::PDF
266fe36d11dd Initial release.
Mahlon E. Smith <mahlon@laika.com>
parents:
diff changeset
    62