@@ -272,55 +272,68 @@ def automated_validation
272272 end
273273
274274 ##
275- # Check if the PDF file(s) include accession and all list names, and report
276- # results as register list notes
275+ # Check if the PDF and XLSX file(s) include accession and all list names, and
276+ # report results as register list notes
277277 #
278278 # Returns boolean, with true indicating all checks passed and false otherwise
279279 #
280280 # IMPORTANT: Notes are soft-registered, remember to +save+ to make them
281- # persistent
281+ # persistent, but the individual +Check+ entries are actually saved
def check_pdf_files
  # Scans the attached publication and supplementary files (PDF or XLSX) for
  # the accession and for every name in +names+, stores one +Check+ per name
  # and kind with the outcome, and adds a note describing the result.
  #
  # Returns true only if every name passed both checks, false otherwise.
  # Re-raises any error hit while parsing, after adding an ERROR note.

  # Per-name flags: [accession or name URL found, name variant found]
  xnames = Hash[names.map { |n| [n, [false, false]] }]

  # The block variable is deliberately not called +file+, so that it does not
  # shadow the +file(...)+ helper invoked below
  %i[publication supplementary].each do |source|
    next unless file?(source)
    break if xnames.values.flatten.all?

    file(source).open do |fh|
      if file_is_pdf?(source)
        PDF::Reader.new(fh.path).pages.each do |page|
          break if _search_names_in_text(
            xnames, names, accession,
            page.text.unicode_normalize(:nfkc)
          )
        end
      elsif file_is_xlsx?(source)
        xlsx = Roo::Spreadsheet.open(fh.path)
        # Visit every worksheet: iterating the workbook directly only covers
        # the default (first) sheet
        xlsx.sheets.each do |sheet_name|
          sheet = xlsx.sheet(sheet_name)
          # Assumes Roo reports a nil +last_row+ for an empty sheet, which
          # cannot be iterated -- TODO confirm against the Roo version in use
          next if sheet.last_row.nil?

          sheet.each do |row|
            # Normalize exactly like the PDF text so that compatibility
            # characters (e.g., non-breaking spaces) do not hide a match.
            # The explicit UTF-8 encode covers rows made only of numbers or
            # dates, whose joined text may not be tagged as UTF-8
            text = row.select(&:present?).join(' ')
                      .encode(Encoding::UTF_8)
                      .unicode_normalize(:nfkc)
            break if _search_names_in_text(xnames, names, accession, text)
          end
          break if xnames.values.flatten.all?
        end
      end
    end
  end

  # Persist one check per name and kind; these records are saved right away
  names.each do |n|
    accession_found, name_found = xnames[n]
    {
      effective_publication_missing_accession: accession_found,
      name_missing_in_effective_publication: name_found
    }.each do |check_kind, passed|
      par = { pass: passed, user: nil }
      Check.create_with(par).find_or_create_by(
        name: n, kind: check_kind
      ).update(par)
    end
  end

  add_note('The effective publication files have been parsed')
  xnames.values.flatten.all?
rescue StandardError
  add_note('ERROR: The effective publication files could not be parsed')
  raise
end
327+
328+ private
329+
def _search_names_in_text(xnames, names, accession, txt)
  # Searches +txt+ for the accession and for each name, updating +xnames+
  # in place.
  #
  # xnames    - Hash mapping each name to a two-element Array of flags:
  #             [accession or name URL found, name variant found]
  # names     - Collection of names; each must respond to +seqcode_url+ and
  #             +pdf_variants+
  # accession - Accession to look for; when found, it satisfies the first
  #             flag of every name at once
  # txt       - Text to search
  #
  # Returns true once every flag of every name is set, so callers can stop
  # scanning early.

  # A nil needle makes String#index raise, and an empty needle matches at
  # offset 0 of any text, so blank needles must never count as a hit
  found = lambda do |needle|
    next false if needle.nil?
    next false if needle.is_a?(String) && needle.strip.empty?

    !txt.index(needle).nil?
  end

  xnames.each_value { |flags| flags[0] = true } if found.call(accession)
  names.each do |n|
    flags = xnames[n]
    flags[0] ||= found.call(n.seqcode_url(false))
    flags[1] ||= n.pdf_variants.any? { |variant| found.call(variant) }
  end
  xnames.values.flatten.all?
end
326338end
339+
0 commit comments