-
Notifications
You must be signed in to change notification settings - Fork 27
Open
Description
A bug for --geno is a bgen file with code like below:
python /home/xutingfeng/github_code/others/polyfun/finemapper.py --geno data.bgen --sumstats polyfun_sumstats.tsv --chr 1 --start 15333350 --verbose --allow-missing \
--end 15833356 \
--method susie \
--max-num-causal 5 \
--out output/test.gz \
--n 383290 \
--non-funct \
--ldstore2 $(which ldstore2) \
--sample-file data.sample --threads 4 --memory 10240bug source
the bug is caused by compute_ld_bgen at df_snp = df_z.query('SNP == "%s"'%(rsid))
def compute_ld_bgen(self, locus_start, locus_end, verbose=False):
#create df_z
df_z = self.df_sumstats_locus[['SNP', 'CHR', 'BP', 'A1', 'A2']].copy()
#add leading zeros to chromosome numbers if needed
try:
import bgen
except (ImportError, ModuleNotFoundError):
raise ValueError('\n\nPlease install the bgen package (using "pip install bgen")')
from bgen.reader import BgenFile
bfile = BgenFile(self.genotypes_file)
bgen_chromosomes = bfile.chroms()
if bgen_chromosomes[0].startswith('0'):
df_z['CHR'] = '0' + df_z['CHR'].astype(str)
#sync the order of the alleles between the sumstats and the bgen file
list_bgen = []
#rsids = bfile.rsids()
#small change reduces the time for bgen processing
#the previous implementation would iterate through all the SNPs in the bgen file
#this implementation loops over just the snps in the locus
rsids = bfile.fetch(self.chr, locus_start, locus_end)
for snp_i, rsid in enumerate(rsids):
# if rsid not in df_z['SNP'].values: continue
# snp_alleles = bfile[snp_i].alleles
# snp_chrom = bfile[snp_i].chrom
# snp_pos = bfile[snp_i].pos
if rsid.rsid not in df_z['SNP'].values: continue
snp_alleles = rsid.alleles
snp_chrom = rsid.chrom
snp_pos = rsid.pos
assert len(snp_alleles) == 2, 'cannot handle SNPs with more than two alleles'
df_snp = df_z.query('SNP == "%s"'%(rsid)) # NOTE: bug is here
assert df_snp.shape[0]==1
a1, a2 = df_snp['A1'].iloc[0], df_snp['A2'].iloc[0]
snp_a1, snp_a2 = snp_alleles[0], snp_alleles[1]
if set([a1,a2]) != set([snp_a1, snp_a2]):
raise ValueError('The alleles for SNP %s are different in the sumstats and in the bgen file:\n \
bgen: A1=%s A2=%s\n \
sumstats: A1=%s A2=%s \
'%(rsid, snp_alleles[0], snp_alleles[1], a1, a2))
d = {'SNP':rsid, 'CHR':snp_chrom, 'BP':snp_pos, 'A1':snp_a1, 'A2':snp_a2}
list_bgen.append(d)
df_bgen = pd.DataFrame(list_bgen)
df_bgen = set_snpid_index(df_bgen, allow_swapped_indel_alleles=self.allow_swapped_indel_alleles)
df_z = set_snpid_index(df_z, allow_swapped_indel_alleles=self.allow_swapped_indel_alleles)
df_z = df_z[[]].merge(df_bgen, left_index=True, right_index=True)
df_z = df_z[['SNP', 'CHR', 'BP', 'A1', 'A2']]
#rename columns
df_z.rename(columns={'SNP':'rsid', 'CHR':'chromosome', 'BP':'position', 'A1':'allele1', 'A2':'allele2'}, inplace=True)
#Create LDstore input files
temp_dir = tempfile.mkdtemp()
incl_file = os.path.join(temp_dir, 'incl.incl')
master_file = os.path.join(temp_dir, 'master.master')
z_file = os.path.join(temp_dir, 'chr%s.%s_%s.z'%(self.chr, locus_start, locus_end))
dose_file = os.path.join(temp_dir, 'dosages.bdose')
df_z.to_csv(z_file, sep=' ', index=False)
#find number of samples
if self.incl_samples is None:
num_samples = pd.read_table(self.sample_file).shape[0]-1
else:
num_samples = pd.read_table(self.incl_samples, header=None).shape[0]
#get output file name
bcor_file = self.get_ld_output_file_prefix(locus_start, locus_end, temp_dir) + '.bcor'
#Create LDstore master file
df_master = pd.DataFrame(columns=['z','bgen','bgi','bcor','dose','sample','n_samples'])
df_master['z'] = [z_file]
df_master['bgen'] = [self.genotypes_file]
df_master['bgi'] = [self.genotypes_file+'.bgi']
df_master['bcor'] = [bcor_file]
df_master['bdose'] = [dose_file]
df_master['sample'] = [self.sample_file]
df_master['n_samples'] = num_samples
if self.incl_samples is not None:
df_master['incl'] = self.incl_samples
df_master.to_csv(master_file, sep=';', header=True, index=False)
#run LDstore
ldstore_cmd = [self.ldstore_exe, '--in-files', master_file, '--write-bcor', '--write-bdose', '--bdose-version', '1.0']
if self.memory is not None:
ldstore_cmd += ['--memory', str(self.memory)]
if self.n_threads is not None:
ldstore_cmd += ['--n-threads', str(self.n_threads)]
run_executable(ldstore_cmd, 'LDStore', measure_time=True, show_output=verbose, show_command=verbose)
if not os.path.exists(bcor_file):
raise IOError('Could not find output BCOR file')
return bcor_filethe reason is
rsid is kind of BgenVar("", "1:15333718:C:T", "1", 15333718, ['C', 'T']) ;while the truth variable should used here is rsid.rsid (which is correct used at above code, but not used at below).
SOLUTION
Let rsid = rsid.rsid or change the variable Name can solve the problem. code like below
test_polyfun.py doesn't have a test code for bgen file input
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels

