@@ -29,18 +29,23 @@ def initialize
2929    def  extract ( pdfs ,  opts ) 
3030      extract_options  opts 
3131      FileUtils . mkdir_p  @output  unless  File . exists? ( @output ) 
32+       pdfs  =  Array ( pdfs ) 
33+       paths  =  [ ] 
3234      [ pdfs ] . flatten . each  do  |pdf |
3335        @pdf_name  =  File . basename ( pdf ,  File . extname ( pdf ) ) 
3436        pages  =  ( @pages  == 'all' )  ? 1 ..Docsplit . extract_length ( pdf )  : @pages 
35-         if  @force_ocr  || ( !@forbid_ocr  && !contains_text? ( pdf ) ) 
36-           extract_from_ocr ( pdf ,  pages ) 
37-         else 
38-           extract_from_pdf ( pdf ,  pages ) 
39-           if  !@forbid_ocr  && DEPENDENCIES [ :tesseract ]  && !@pages_to_ocr . empty? 
40-             extract_from_ocr ( pdf ,  @pages_to_ocr ) 
41-           end 
42-         end 
37+         return_value  =  if  @force_ocr  || ( !@forbid_ocr  && !contains_text? ( pdf ) ) 
38+                          extract_from_ocr ( pdf ,  pages ) 
39+                        else 
40+                          if  !@forbid_ocr  && DEPENDENCIES [ :tesseract ]  && !@pages_to_ocr . empty? 
41+                            extract_from_ocr ( pdf ,  @pages_to_ocr ) 
42+                          else 
43+                            extract_from_pdf ( pdf ,  pages ) 
44+                          end 
45+                        end 
46+         paths  << return_value 
4347      end 
48+       return  paths . flatten . compact 
4449    end 
4550
4651    # Does a PDF have any text embedded? 
@@ -52,31 +57,37 @@ def contains_text?(pdf)
5257    # Extract a page range worth of text from a PDF, directly. 
5358    def  extract_from_pdf ( pdf ,  pages ) 
5459      return  extract_full ( pdf )  unless  pages 
55-       pages . each  { |page | extract_page ( pdf ,  page )  } 
60+       pages . map  { |page | extract_page ( pdf ,  page )  } 
5661    end 
5762
5863    # Extract a page range worth of text from a PDF via OCR. 
5964    def  extract_from_ocr ( pdf ,  pages ) 
      # When `pages` is truthy each page is processed on its own; otherwise the
      # whole document is processed in a single pass. Every output name
      # (`... + '.txt'`) is appended to `paths`, which is the return value.
      # Scratch files are built from a directory created by `Dir.mktmpdir`; the
      # `ensure` clause removes that directory when it exists.
      # NOTE(review): this text is a diff rendering -- each `-` line is the
      # removed version of the `+` lines that follow it -- and the interpolated
      # string literals appear to have lost their literal characters and
      # closing quotes. Confirm the exact commands against the real file.
6065      tempdir  =  Dir . mktmpdir 
6166      base_path  =  File . join ( @output ,  @pdf_name ) 
6267      escaped_pdf  =  ESCAPE [ pdf ] 
68+       paths  =  [ ] 
6369      if  pages 
6470        pages . each  do  |page |
6571          tiff  =  "#{ tempdir } #{ @pdf_name } #{ page }  
6672          escaped_tiff  =  ESCAPE [ tiff ] 
6773          file  =  "#{ base_path } #{ page }  
          # Two external commands per page: the first starts with
          # `MAGICK_TMPDIR=` and is handed `page - 1` (presumably a zero-based
          # page index for the converter -- TODO confirm); the second runs
          # `tesseract` on the escaped `tiff` path.
6874          run  "MAGICK_TMPDIR=#{ tempdir } #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ escaped_pdf } #{ page  - 1 } #{ escaped_tiff }  
6975          run  "tesseract #{ escaped_tiff } #{ ESCAPE [ file ] } #{ @language }  
70-           clean_text ( file  + '.txt' )  if  @clean_ocr 
76+           file_name  =  file  + '.txt' 
77+           paths  << file_name 
78+           clean_text ( file_name )  if  @clean_ocr 
          # The per-page `tiff` is removed as soon as the page is done.
7179          FileUtils . remove_entry_secure  tiff 
7280        end 
7381      else 
7482        tiff  =  "#{ tempdir } #{ @pdf_name }  
7583        escaped_tiff  =  ESCAPE [ tiff ] 
7684        run  "MAGICK_TMPDIR=#{ tempdir } #{ MEMORY_ARGS } #{ OCR_FLAGS } #{ escaped_pdf } #{ escaped_tiff }  
        # NOTE(review): `base_path` is interpolated without `ESCAPE[...]` here,
        # unlike `ESCAPE [ file ]` in the per-page branch -- verify.
7785        run  "tesseract #{ escaped_tiff } #{ base_path } #{ @language }  
78-         clean_text ( base_path  + '.txt' )  if  @clean_ocr 
86+         file_name  =  base_path  + '.txt' 
87+         paths  << file_name 
88+         clean_text ( file_name )  if  @clean_ocr 
7989      end 
90+       return  paths 
8091    ensure 
      # NOTE(review): `tempdir` is still nil here if `Dir.mktmpdir` itself
      # raised; check how `File.exists?` behaves with nil on the target Ruby.
8192      FileUtils . remove_entry_secure  tempdir  if  File . exists? ( tempdir ) 
8293    end 
@@ -104,16 +115,19 @@ def run(command)
104115    def  extract_full ( pdf ) 
      # Runs `pdftotext -enc UTF-8` on the whole of `pdf`, writing to a path
      # built from `@output` and `@pdf_name`, and returns that path.
      # NOTE(review): the string literals on the next two lines appear
      # truncated in this rendering (literal suffix and closing quote are
      # missing); confirm the exact file name against the real file.
105116      text_path  =  File . join ( @output ,  "#{ @pdf_name }  ) 
106117      run  "pdftotext -enc UTF-8 #{ ESCAPE [ pdf ] } #{ ESCAPE [ text_path ] }  
118+       return  text_path 
107119    end 
108120
109121    # Extract the contents of a single page of text, directly, adding it to 
110122    # the `@pages_to_ocr` list if the text length is inadequate. 
111123    def  extract_page ( pdf ,  page ) 
      # Runs `pdftotext -enc UTF-8 -f <page> ...` for a single page, writing to
      # a path built from `@output`, `@pdf_name` and `page`, and returns that
      # path.
      # NOTE(review): the string literals on the next two lines appear
      # truncated in this rendering; confirm the exact file name and flags
      # against the real file.
112124      text_path  =  File . join ( @output ,  "#{ @pdf_name } #{ page }  ) 
113125      run  "pdftotext -enc UTF-8 -f #{ page } #{ page } #{ ESCAPE [ pdf ] } #{ ESCAPE [ text_path ] }  
126+ 
      # Unless OCR is forbidden, read the output back and queue the page in
      # `@pages_to_ocr` when it holds fewer than MIN_TEXT_PER_PAGE characters.
114127      unless  @forbid_ocr 
115128        @pages_to_ocr . push ( page )  if  File . read ( text_path ) . length  < MIN_TEXT_PER_PAGE 
116129      end 
130+       return  text_path 
117131    end 
118132
119133    def  extract_options ( options ) 
@@ -127,4 +141,4 @@ def extract_options(options)
127141
128142  end 
129143
130- end 
144+ end 
0 commit comments