equal
deleted
inserted
replaced
|
1 # -*- ruby -*- |
|
2 #encoding: utf-8 |
|
3 |
|
4 require 'pdf-reader' |
|
5 |
|
6 require 'strelka' |
|
7 require 'thingfish' unless defined?( Thingfish ) |
|
8 require 'thingfish/processor' unless defined?( Thingfish::Processor ) |
|
9 |
|
10 |
|
# Thingfish processor that extracts metadata from uploaded Adobe Portable
# Document Format (PDF) files and attaches it to the request.
class Thingfish::Processor::PDF < Thingfish::Processor
  extend Loggability

  # Package version
  VERSION = '0.1.0'

  # Version control revision
  REVISION = %q$Revision$


  # Loggability API -- log to the :thingfish logger
  log_to :thingfish

  # The list of handled types
  handled_types 'application/pdf'


  ### Synchronous processor API -- extract metadata from uploaded PDFs.
  ### Builds a PDF::Reader from the body of the given +request+, extracts
  ### its metadata, and adds that metadata to the +request+.
  def on_request( request )
    reader = ::PDF::Reader.new( request.body )
    metadata = self.extract_pdf_metadata( reader )

    request.add_metadata( metadata )
  end


  #########
  protected
  #########

  ### Normalize metadata from the PDF::Reader object +reader+ and return it
  ### as a Hash. The hash holds 'pdf:version' and 'pdf:pagecount' (each
  ### omitted when nil), plus one "pdf:<key>" entry for every pair in the
  ### reader's info whose value is not a PDF::Reader::Reference.
  def extract_pdf_metadata( reader )
    self.log.debug "Extracting PDF metadata..."

    pdf_metadata = {
      'pdf:version'   => reader.pdf_version,
      'pdf:pagecount' => reader.page_count,
    }.reject {|_,v| v.nil? }

    # Guard against a nil info hash so a document without an Info dictionary
    # still yields the version/pagecount metadata instead of raising.
    # NOTE(review): nil appears to be returned by older pdf-reader releases
    # for such documents -- confirm against the pinned gem version.
    info = reader.info || {}

    info.each_pair do |key, val|
      # Unresolved indirect references carry no usable value; skip them.
      next if val.is_a?( ::PDF::Reader::Reference )
      pdf_metadata[ "pdf:#{key}" ] = val
    end

    self.log.debug " raw PDF metadata: %p" % [ pdf_metadata ]
    return pdf_metadata
  end

end # class Thingfish::Processor::PDF
|
62 |