Skip to content

Commit

Permalink
Merge pull request #14 from krassowski/get-coding-consequence
Browse files Browse the repository at this point in the history
Get coding consequence, add range example
  • Loading branch information
krassowski committed Jan 19, 2023
2 parents 50b1e4a + 7c1909b commit 6cd14fb
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 3 deletions.
65 changes: 65 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,71 @@ The base position should use the latest genome assembly (GRCh38 at the time of w
you can use the position in previous assembly coordinates by replacing `POSITION` with `POSITION_GRCH37`.
For more information of the arguments accepted by the SNP database see the [entrez help page](https://www.ncbi.nlm.nih.gov/snp/docs/entrez_help/) on NCBI website.

#### Obtaining amino acids change information for variants in given range

First we search for dbSNP rs identifiers for variants in given region:

```python
dbsnp_ids = (
entrez_api
.search(
'12[CHROMOSOME] AND human[ORGANISM] AND 21178600:21178720[POSITION]',
database='snp',
max_results=100
)
.data
['esearchresult']
['idlist']
)
```

Then fetch the variant data for identifiers:

```python
variant_data = entrez_api.fetch(
['rs' + rs_id for rs_id in dbsnp_ids],
max_results=10,
database='snp'
)
```

And parse the data, extracting the HGVS out of summary:

```python
from easy_entrez.parsing import parse_dbsnp_variants
from pandas import Series


def select_protein_hgvs(items):
return [
[sequence, hgvs]
for entry in items
for sequence, hgvs in [entry.split(':')]
if hgvs.startswith('p.')
]


protein_hgvs = (
parse_dbsnp_variants(variant_data)
.summary
.HGVS
.apply(select_protein_hgvs)
.explode()
.dropna()
.apply(Series)
.rename(columns={0: 'sequence', 1: 'hgvs'})
)
protein_hgvs.head()
```

> | rs_id | sequence | hgvs |
> |:-------------|:------------|:------------|
> | rs1940853486 | NP_006437.3 | p.Gly203Ter |
> | rs1940853414 | NP_006437.3 | p.Glu202Gly |
> | rs1940853378 | NP_006437.3 | p.Glu202Lys |
> | rs1940853299 | NP_006437.3 | p.Lys201Thr |
> | rs1940852987 | NP_006437.3 | p.Asp198Glu |
#### Find PubMed ID from DOI

When searching GWAS catalog PMID is needed over DOI. You can covert one to the other using:
Expand Down
28 changes: 27 additions & 1 deletion easy_entrez/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,25 @@ class VariantSet:
alt_frequencies: DataFrame
#: Preferred identifiers map (old → new); old != new for merged variants.
preferred_ids: dict
#: Data from DOCSUM field including GENE, HGVS, etc.
summary: DataFrame

def __repr__(self):
return f'<VariantSet with {len(self.coordinates)} variants>'


def parse_docsum(docsum: str) -> dict:
result = {}
for entry in docsum.split('|'):
key, value = entry.split('=', maxsplit=1)
result[key] = value
if 'HGVS' in result:
result['HGVS'] = result['HGVS'].replace('&gt;', '>').split(',')
if 'LEN' in result:
result['LEN'] = float(result['LEN'])
return result


def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) -> VariantSet:
"""Parse coordinates, frequencies and preferred IDs of dbSNP variants.
Expand All @@ -62,6 +76,7 @@ def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) ->
results = []
alt_frequencies = []
preferred_id = {}
summaries = []

for i, snp in enumerate(snps):
error = snp.find('.//ns0:error', namespaces)
Expand All @@ -80,6 +95,16 @@ def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) ->
chrom_prev, pos_prev = snp.find('.//ns0:CHRPOS_PREV_ASSM', namespaces).text.split(':')
sig_class = snp.find('.//ns0:FXN_CLASS', namespaces).text

doc_sum = snp.find('.//ns0:DOCSUM', namespaces).text
try:
doc_sum = parse_docsum(doc_sum)
summaries.append({
**doc_sum,
'rs_id': f'rs{rs_id}'
})
except Exception as e:
warn(f'Failed to parse DOCSUM: {e}')

merged_into = snp.find('.//ns0:SNP_ID', namespaces).text
if rs_id != merged_into:
was_merged = snp.find('.//ns0:MERGED_SORT', namespaces).text
Expand Down Expand Up @@ -138,7 +163,8 @@ def parse_dbsnp_variants(snps_result: EntrezResponse, verbose: bool = False) ->
return VariantSet(
coordinates=DataFrame(results).set_index('rs_id'),
alt_frequencies=DataFrame(alt_frequencies),
preferred_ids=preferred_id
preferred_ids=preferred_id,
summary=DataFrame(summaries).set_index('rs_id')
)


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def get_long_description(file_name):
package_data={'easy_entrez': ['data/*.tsv', 'py.typed']},
# required for mypy to work
zip_safe=False,
version='0.3.4',
version='0.3.5',
license='MIT',
description='Python REST API for Entrez E-Utilities: stateless, easy to use, reliable.',
long_description=get_long_description('README.md'),
Expand Down
26 changes: 25 additions & 1 deletion tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Dict, Union
from dataclasses import dataclass
from xml.etree.ElementTree import Element, fromstring
from easy_entrez.parsing import parse_dbsnp_variants, VariantSet
from easy_entrez.parsing import parse_dbsnp_variants, VariantSet, parse_docsum
from easy_entrez.queries import FetchQuery
try:
from typing import Literal
Expand All @@ -17,6 +17,25 @@ class DummyResponse:
data: Union[Element, Dict]


DOCSUM_CODING = "HGVS=NC_000012.12:g.21178699A&gt;G,NC_000012.11:g.21331633A&gt;G,NG_011745.1:g.52506A&gt;G,NM_006446.5:c.605A&gt;G,NM_006446.4:c.605A&gt;G,NP_006437.3:p.Glu202Gly|SEQ=[A/G]|LEN=1|GENE=SLCO1B1:10599"


def test_docsum():
assert parse_docsum(DOCSUM_CODING) == {
'HGVS': [
'NC_000012.12:g.21178699A>G',
'NC_000012.11:g.21331633A>G',
'NG_011745.1:g.52506A>G',
'NM_006446.5:c.605A>G',
'NM_006446.4:c.605A>G',
'NP_006437.3:p.Glu202Gly'
],
'SEQ': '[A/G]',
'LEN': 1,
'GENE': 'SLCO1B1:10599'
}


@pytest.mark.optional
def test_parse_two_snps():
response = DummyResponse(
Expand Down Expand Up @@ -50,6 +69,11 @@ def test_parse_two_snps():
assert frequencies.total_count.min() > 0
assert '1000Genomes' in set(frequencies.study)

summary = variant_set.summary
assert len(summary) == 2
assert set(summary.index) == {'rs6311', 'rs662138'}
assert set(summary.columns) == {'HGVS', 'SEQ', 'LEN', 'GENE'}


@pytest.mark.optional
def test_merged_variant_solving():
Expand Down

0 comments on commit 6cd14fb

Please sign in to comment.