Merge branch 'release/v0.9' into develop

moonso · moonso · commit c328aafd4a81 · 2014-12-11T17:56:16.000+01:00
diff --git a/README.md b/README.md
@@ -5,21 +5,35 @@ Small library for parsing vcf files. Based on [PyVCF](https://github.com/jamesca
 
 ```python3
     from vcf_parser import parser
-    my_parser = parser.VCFParser('infile.vcf')
+    my_parser = parser.VCFParser(infile='infile.vcf')
     for variant in my_parser:
         print(variant)
 ```
 
-**vcf_parser also works on streams now.**
+**vcf_parser can split multi allelic calls in vcf now.**
 
 Vcf parser is really a lightweight version of [PyVCF](https://github.com/jamescasbon/PyVCF) with most of it's code borrowed and modified from there.
 
-The idea was to make a faster and more flexible tool that mostly work with python dictionarys.
-The drawback is inacurracy, while **PyVCF** tests if each row in the vcf is on the correct format vcf_parser is much more sloppier.
+The idea was to make a faster and more flexible tool that mostly works with Python dictionaries.
+The drawback is inaccuracy: while **PyVCF** tests that each row in the vcf is in the correct format, vcf_parser is much sloppier.
 
 It is easy to access information for each variant, edit the information and edit the headers.
 
+## Basic function ##
+
+
 Returns dictionary with the vcf info for each variant.
+To split the multiallelic calls (with accurate splitting of the INFO field, including the VEP CSQ field) use:
+    
+    my_parser = parser.VCFParser(infile='infile.vcf', split_variants=True)
+
+The ordinary vcf entries are stored by their header names, like
+    
+    variant['CHROM']
+    variant['ALT']
+
+etc.
+
 The genotype information is converted to a genotype object and stored in a dictionary
 
     variant['genotypes']
@@ -53,7 +67,8 @@ Vep information, if present, is parsed into
 
 and looks like:
 
-    'vep_info': {'NOC2L': {'Allele': 'G',
+    'vep_info': {<alternative_allele>: {
+                            'Allele': 'G',
                             'Amino_acids': '',
                             'CDS_position': '',
                             'Codons': '',
@@ -74,7 +89,8 @@ and looks like:
                             'SYMBOL': 'NOC2L',
                             'SYMBOL_SOURCE': '',
                             'cDNA_position': ''},
-                  'SAMD11': {'Allele': 'G',
+                  <alternative_allele>: {
+                              'Allele': 'G',
                              'Amino_acids': '',
                              'CDS_position': '',
                              'Codons': '',
@@ -94,36 +110,43 @@ and looks like:
                              'STRAND': '1',
                              'SYMBOL': 'SAMD11',
                              'SYMBOL_SOURCE': '',
-                             'cDNA_position': ''}}
+                             'cDNA_position': ''
+                         },
+                    'gene_ids': set(['SAMD11', 'NOC2L'])
+                     }
 
-INFO field is parsed into 
+The INFO field is parsed into the dictionary shown below, where the keys are the names of the info fields. Values are lists; if there is no value in the vcf, the value in info_dict is False.
 
     variant['info_dict]
 
 and looks like
 
-    'info_dict': {'AC': '1',
-                   'AF': '0.167',
-                   'AN': '6',
-                   'BaseQRankSum': '2.286',
-                   'DB': True,
-                   'DP': '1306',
-                   'FS': '1.539',
-                   'InbreedingCoeff': '0.1379',
-                   'MQ': '39.83',
-                   'MQ0': '0',
-                   'MQRankSum': '-2.146',
-                   'POSITIVE_TRAIN_SITE': True,
-                   'QD': '29.57',
-                   'ReadPosRankSum': '0.897',
-                   'VQSLOD': '4.52',
-                   'culprit': 'FS',
-                   'set': 'variant'}
-
-
-###Print a variant in it´s original format:###
-
-	print '\t'.join([[variant[head] for head in my_parser.header])
+    'info_dict': {'AC': ['1'],
+                   'AF': ['0.167'],
+                   'AN': ['6'],
+                   'BaseQRankSum': ['2.286'],
+                   'DB': False,
+                   'DP': ['1306'],
+                   'FS': ['1.539'],
+                   'InbreedingCoeff': ['0.1379'],
+                   'MQ': ['39.83'],
+                   'MQ0': ['0'],
+                   'MQRankSum': ['-2.146'],
+                   'POSITIVE_TRAIN_SITE': False,
+                   'QD': ['29.57'],
+                   'ReadPosRankSum': ['0.897'],
+                   'VQSLOD': ['4.52'],
+                   'culprit': ['FS'],
+                   'set': ['variant']}
+
+
+### Print a vcf in its original format: ###
+
+    my_parser = parser.VCFParser(infile='infile.vcf')
+    for line in my_parser.metadata.print_header():
+        print(line)
+    for variant in my_parser:
+        print('\t'.join([variant[head] for head in my_parser.header]))
 
 ###Add metadata information:###
 
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 long_description = 'Tool for parsing Variant Call Format (VCF) files. Works like a lightweight version of PyVCF.'
 
 setup(name='vcf_parser',
-    version='0.8.3',
+    version='0.9',
     description='Parsing vcf files',
     author = 'Mans Magnusson',
     author_email = 'mans.magnusson@scilifelab.se',