Skip to content

Commit 59e777c

Browse files
committed
Refactor genome sample set
1 parent d466948 commit 59e777c

4 files changed

Lines changed: 327 additions & 203 deletions

File tree

app/models/genome.rb

Lines changed: 1 addition & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class Genome < ApplicationRecord
3131

3232
include HasExternalResources
3333
include Genome::ExternalResources
34+
include Genome::SampleSet
3435

3536
attr_accessor :queue_for_source_update
3637

@@ -77,34 +78,6 @@ def required
7778
kind? source_database? source_accession?
7879
]
7980
end
80-
81-
##
# Returns the sample attributes highlighted for a genome, as a Hash that
# maps each group (+:date+, +:location+, +:toponym+, +:environment+,
# +:other+ and +:package+) to the Array of attribute names (as Symbols)
# that belong to it. A new Hash is built on every call
def important_sample_attributes
  {
    date: %i[collection_date event_date_time_start event_date_time_end],
    location: %i[
      lat_lon lat lon
      geographic_location_latitude geographic_location_longitude
      latitude_start latitude_end longitude_start longitude_end
    ],
    toponym: %i[
      geo_loc_name geographic_location_country_and_or_sea marine_region
    ],
    environment: %i[
      env_material sample_type env_biome isolation_source analyte_type
      env_broad_scale env_local_scale env_medium
      environment_biome environment_feature gold_ecosystem_classification
      broad_scale_environmental_context local_environmental_context
      environmental_medium
    ],
    other: %i[
      host ph depth temp temperature rel_to_oxygen geographic_location_depth
      chlorophyll isol_growth_condt
    ],
    package: %i[
      ncbi_package ena_checklist ncbi_submission_package biosamplemodel
    ]
  }
end
10881
end
10982

11083
@@FIELDS_WITH_AUTO = %i[
@@ -218,174 +191,6 @@ def source_links
218191
end
219192
end
220193

221-
def source_extra_biosamples
222-
return [] unless source_hash
223-
return @source_extra_biosamples if @source_extra_biosamples
224-
225-
@source_extra_biosamples = []
226-
%i[derived_from sample_derived_from].each do |attribute|
227-
next unless attr = source_attributes[attribute]
228-
229-
attr.each do |i|
230-
@source_extra_biosamples +=
231-
i.gsub(/.*: */, '').gsub(/[\.]/, '').split(/ *,(?: and)? */)
232-
end
233-
end
234-
@source_extra_biosamples.uniq!
235-
@source_extra_biosamples -= source_hash[:samples].keys.map(&:to_s)
236-
@source_extra_biosamples -= source_accessions
237-
@source_extra_biosamples
238-
end
239-
240-
def source_attribute_groups
241-
return {} unless source_hash
242-
return @source_attribute_groups if @source_attribute_groups
243-
244-
@source_attribute_groups = {}
245-
self.class.important_sample_attributes.each do |group, attributes|
246-
@source_attribute_groups[group] = {}
247-
attributes.each do |attribute|
248-
attr = source_attributes[attribute]
249-
@source_attribute_groups[group][attribute] = attr if attr.present?
250-
end
251-
end
252-
@source_attribute_groups
253-
end
254-
255-
##
256-
# Finds the locations of all source samples associated to this genome, and
257-
# returns them as an Array of 2-element Arrays ([lat, lon]) or +nil+
258-
def source_sample_locations
259-
coord = /([-+] *)?(\d+(?:[\.\,]\d+)?|\d+°(?:\d+['"])*)( *[NSEW])?/
260-
keys = {
261-
lat: %i[lat geographic_location_latitude latitude_start latitude_end],
262-
lon: %i[lon geographic_location_longitude longitude_start longitude_end]
263-
}
264-
265-
coords = { lat: nil, lon: nil }
266-
@_source_sample_locations ||=
267-
source_cannonical_samples.map do |sample|
268-
# Try joint keys
269-
if sample[:lat_lon]
270-
m = sample[:lat_lon].match(/^ *(#{coord})[ ,;\/\-]+(#{coord}) *$/i)
271-
m ||= []
272-
coords[:lat] = m[2..4]
273-
coords[:lon] = m[6..8]
274-
end
275-
276-
# Try individual keys
277-
if coords.values.any?(&:nil?)
278-
keys.each do |dim, list|
279-
list.each do |key|
280-
if sample[key]
281-
m = sample[key].match(/^#{coord}$/i) || []
282-
coords[dim] = m[1..3]
283-
end
284-
break unless coords[dim].nil?
285-
end
286-
end
287-
end
288-
289-
# Parse each coordinate
290-
if coords.values.any?(&:nil?)
291-
nil
292-
else
293-
coords.map do |k, v|
294-
v.map!(&:to_s).map!(&:strip)
295-
decimal =
296-
if m = v[1].match(/^(\d) *°(?: *(\d+) *'(?: *(\d+) *(?:"|''))?)?/)
297-
m[1].to_f + (m[2].to_f + m[3].to_f / 60) / 60
298-
else
299-
v[1].gsub(',', '.').to_f
300-
end
301-
302-
if %w[S s W w].include?(v[2]) || v[0] == '-'
303-
-decimal
304-
else
305-
decimal
306-
end
307-
end
308-
end
309-
end
310-
end
311-
312-
##
313-
# Finds the rectangular bounds of all sample locations, with a minimum range
314-
# of latitudes and longitudes of +min+ after expanding both by a factor of
315-
# +pad+. Since +pad+ is a multiplicative factor, no padding is added if only
316-
# one location is found (but the +min+ is still applied). It returns the
317-
# bounds as an Array in the [south, west, north, east] order
318-
def source_sample_area(min = 0.1, pad = 0.5)
319-
loc = source_sample_locations.compact
320-
return unless loc.present?
321-
322-
rng = {
323-
lat: loc.map { |i| i[0] }.minmax,
324-
lon: loc.map { |i| i[1] }.minmax
325-
}
326-
327-
rng.each do |k, v|
328-
width = v.inject(:-).abs
329-
v[0] -= width * pad / 2
330-
v[1] += width * pad / 2
331-
width = v.inject(:-).abs
332-
if width < min
333-
pad_extra = (min - width) / 2
334-
rng[k][0] -= pad_extra
335-
rng[k][1] += pad_extra
336-
end
337-
end
338-
339-
[rng[:lat][0], rng[:lon][0], rng[:lat][1], rng[:lon][1]]
340-
end
341-
342-
##
343-
# TODO
344-
# Use source_cannonical_samples instead!
345-
def source_attributes
346-
return unless source_hash
347-
return @source_attributes if @source_attributes
348-
349-
not_provided = [
350-
'not provided', 'not collected', 'unavailable', 'not applicable',
351-
'missing', '-', 'n/a', 'null'
352-
]
353-
@source_attributes = {}
354-
source_hash[:samples].each_value do |sample|
355-
sample[:attributes].each do |key, value|
356-
value.strip!
357-
nice_key = key.to_s.downcase.gsub(/[^A-Za-z0-9]/, '_')
358-
.gsub(/_+/, '_').gsub(/^_|_$/, '').to_sym
359-
if value.present? && !not_provided.include?(value.downcase)
360-
@source_attributes[nice_key] ||= []
361-
@source_attributes[nice_key] << value
362-
end
363-
end if sample[:attributes].present?
364-
end
365-
@source_attributes.each_value(&:uniq!)
366-
@source_attributes
367-
end
368-
369-
def source_cannonical_samples
370-
not_provided = [
371-
'not provided', 'unavailable', 'missing', 'not applicable',
372-
'-', 'n/a', 'null'
373-
]
374-
@_source_cannonical_samples ||=
375-
source_hash[:samples].each_value.map do |sample|
376-
Hash[
377-
sample[:attributes].map do |key, value|
378-
value.strip!
379-
nice_key = key.to_s.downcase.gsub(/[^A-Za-z0-9]/, '_')
380-
.gsub(/_+/, '_').gsub(/^_|_$/, '').to_sym
381-
if value.present? && !not_provided.include?(value.downcase)
382-
[nice_key, value]
383-
end
384-
end.compact
385-
]
386-
end
387-
end
388-
389194
##
390195
# Returns registered BioSample accessions, directly from the database
391196
# if the source database is +:biosample+, or through the external links

0 commit comments

Comments
 (0)