Skip to content

Commit

Permalink
Merge pull request #4 from MikeDacre/dev
Browse files Browse the repository at this point in the history
Converting to the Plink many-to-many model for SNP LD Calculations

Results in a substantial speed improvement
  • Loading branch information
MikeDacre authored May 13, 2017
2 parents 3466121 + e033f07 commit ec403fa
Show file tree
Hide file tree
Showing 6 changed files with 1,091 additions and 520 deletions.
174 changes: 78 additions & 96 deletions LD_Direction/LDpair.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,18 @@
"""
import os
import sys
import json
import math
import argparse
import subprocess as _sub

# https://github.com/MikeDacre/dbSNP
import dbSNP

DB_SNP_LOC = '/godot/dbsnp'
DB_SNP_VER = 150
from . import _run

###############################################################################
# User Defined Globals #
###############################################################################

POPULATIONS = ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
"CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
Expand All @@ -25,7 +27,13 @@

# Set data directories
DATA_DIR = "/godot/1000genomes/1000GP_Phase3"
SNP_DB = "/godot/dbsnp/dbsnp144.db"
DB_SNP_LOC = '/godot/dbsnp'
DB_SNP_VER = 150


###############################################################################
# Helper Functions #
###############################################################################


class SNP_Lookup_Failure(Exception):
Expand All @@ -35,42 +43,60 @@ class SNP_Lookup_Failure(Exception):
pass


def output_json(output):
"""Return JSON formated string."""
return json.dumps(output, sort_keys=True, indent=2)


def run_cmnd(cmnd):
"""Run a command and return the output split into a list by newline."""
output, _, _ = run(cmnd, raise_on_error=True)
return output.strip().split('\n')


def run(command, raise_on_error=False):
"""Run a command with subprocess the way it should be.
def print_summary(json_dict, outfile=None):
"""Write a formatted summary to file from a dictionary.
Parameters
----------
command : str
A command to execute, piping is fine.
raise_on_error : bool
Raise a subprocess.CalledProcessError on exit_code != 0
json_dict : dict
A dictionary of the kind generated by calculate_pair()
filehandle : IOBase, optional
An open writable file handle
Returns
-------
stdout : str
stderr : str
exit_code : int
str
A formatted string, the same one that is written to outfile if outfile
is provided.
"""
pp = _sub.Popen(command, shell=True, universal_newlines=True,
stdout=_sub.PIPE, stderr=_sub.PIPE)
out, err = pp.communicate()
code = pp.returncode
if raise_on_error and code != 0:
raise _sub.CalledProcessError(
returncode=code, cmd=command, output=out, stderr=err
)
return out, err, code
outstr = "Query SNPs:\n"
outstr += json_dict["snp1"]["rsnum"] + " (" + json_dict["snp1"]["coord"] + ")\n"
outstr += json_dict["snp2"]["rsnum"] + " (" + json_dict["snp2"]["coord"] + ")\n"
outstr += "\n"
outstr += ','.join(json_dict['populations']) + " Haplotypes:\n"
outstr += " " * 15 + json_dict["snp2"]["rsnum"] + '\n'
outstr += " " * 15 + json_dict["snp2"]["allele_1"]["allele"] + " " * 7 + json_dict["snp2"]["allele_2"]["allele"] + '\n'
outstr += " " * 13 + "-" * 17 + '\n'
outstr += " " * 11 + json_dict["snp1"]["allele_1"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c11"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c11"])) + " | " + json_dict["two_by_two"]["cells"]["c12"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c12"])) + " | " + json_dict["snp1"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_1"]["count"])) + " (" + json_dict["snp1"]["allele_1"]["frequency"] + ")\n"
outstr += json_dict["snp1"]["rsnum"] + " " * (10 - len(json_dict["snp1"]["rsnum"])) + " " * 3 + "-" * 17 + '\n'
outstr += " " * 11 + json_dict["snp1"]["allele_2"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c21"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c21"])) + " | " + json_dict["two_by_two"]["cells"]["c22"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c22"])) + " | " + json_dict["snp1"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_2"]["count"])) + " (" + json_dict["snp1"]["allele_2"]["frequency"] + ")\n"
outstr += " " * 13 + "-" * 17 + '\n'
outstr += " " * 15 + json_dict["snp2"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_1"]["count"])) + " " * 3 + json_dict["snp2"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_2"]["count"])) + " " * 3 + json_dict["two_by_two"]["total"] + '\n'
outstr += " " * 14 + "(" + json_dict["snp2"]["allele_1"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_1"]["frequency"])) + " (" + json_dict["snp2"]["allele_2"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_2"]["frequency"])) + '\n'
outstr += "\n"
outstr += " " + json_dict["haplotypes"]["hap1"]["alleles"] + ": " + json_dict["haplotypes"]["hap1"]["count"] + " (" + json_dict["haplotypes"]["hap1"]["frequency"] + ")\n"
outstr += " " + json_dict["haplotypes"]["hap2"]["alleles"] + ": " + json_dict["haplotypes"]["hap2"]["count"] + " (" + json_dict["haplotypes"]["hap2"]["frequency"] + ")\n"
outstr += " " + json_dict["haplotypes"]["hap3"]["alleles"] + ": " + json_dict["haplotypes"]["hap3"]["count"] + " (" + json_dict["haplotypes"]["hap3"]["frequency"] + ")\n"
outstr += " " + json_dict["haplotypes"]["hap4"]["alleles"] + ": " + json_dict["haplotypes"]["hap4"]["count"] + " (" + json_dict["haplotypes"]["hap4"]["frequency"] + ")\n"
outstr += "\n"
outstr += " D': " + json_dict["statistics"]["d_prime"] + '\n'
outstr += " R\u00b2: " + json_dict["statistics"]["r2"] + '\n'
outstr += " Chi-sq: " + json_dict["statistics"]["chisq"] + '\n'
outstr += " p-value: " + json_dict["statistics"]["p"] + '\n'
outstr += "\n"
if len(json_dict["corr_alleles"]) == 2:
outstr += json_dict["corr_alleles"][0] + '\n'
outstr += json_dict["corr_alleles"][1] + '\n'
else:
outstr += json_dict["corr_alleles"][0] + '\n'

if 'warning' in json_dict:
outstr += "WARNING: " + json_dict["warning"] + "!\n"

if outfile:
outfile.write(outstr)

return outstr


def get_snp_info(snp):
Expand Down Expand Up @@ -142,7 +168,7 @@ def get_snp_info(snp):
)
err = None
try:
vcf = run_cmnd(tabix_cmd)
vcf = _run.run_cmnd(tabix_cmd)
except _sub.CalledProcessError as e:
err = e
raise
Expand All @@ -153,9 +179,11 @@ def get_snp_info(snp):
)
vcf = [i for i in vcf if 'END' not in i]
# Import SNP VCF files
if len(vcf) == 0:
if not vcf:
raise SNP_Lookup_Failure(snp + " is not in 1000G reference panel.")
elif len(vcf) == 1:
if not vcf[0].strip():
raise SNP_Lookup_Failure(snp + " is not in 1000G reference panel.")
geno = vcf[0].strip().split()
else:
geno = []
Expand Down Expand Up @@ -196,11 +224,16 @@ def get_snp_info(snp):

# Get headers
tabix_header_cmd = "tabix -H {} | grep CHROM".format(vcf_file)
head = run_cmnd(tabix_header_cmd)[0].strip().split()
head = _run.run_cmnd(tabix_header_cmd)[0].strip().split()

return snp, snp_chrom, snp_loc, geno, allele, head, snp_a1, snp_a2


###############################################################################
# Primary Function Adapted from LDpair #
###############################################################################


def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
"""Find LD information for any two SNPs.
Expand Down Expand Up @@ -235,7 +268,7 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
except SNP_Lookup_Failure as e:
if return_json:
output["error"] = str(e)
return output_json(output)
return _run.output_json(output)
else:
raise

Expand Down Expand Up @@ -263,14 +296,14 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
err = "VCF File does not match variant coordinates for SNP1."
if return_json:
output["error"] = err
return output_json(output)
return _run.output_json(output)
else:
raise SNP_Lookup_Failure(err)
if geno2[1] != str(snp2_loc):
err = "VCF File does not match variant coordinates for SNP2."
if return_json:
output["error"] = err
return output_json(output)
return _run.output_json(output)
else:
raise SNP_Lookup_Failure(err)

Expand Down Expand Up @@ -309,7 +342,7 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
).format(bad_pops, POPULATIONS)
if return_json:
output["error"] = err
return output_json(output)
return _run.output_json(output)
else:
raise SNP_Lookup_Failure(err)

Expand Down Expand Up @@ -559,64 +592,13 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):

# Return output
if return_json:
return output_json(output)
else:
return output

def print_summary(json_dict, outfile=None):
"""Write a formatted summary to file from a dictionary.
Parameters
----------
json_dict : dict
A dictionary of the kind generated by calculate_pair()
filehandle : IOBase, optional
An open writable file handle
Returns
-------
str
A formatted string, the same one that is written to outfile if outfile
is provided.
"""
outstr = "Query SNPs:\n"
outstr += json_dict["snp1"]["rsnum"] + " (" + json_dict["snp1"]["coord"] + ")\n"
outstr += json_dict["snp2"]["rsnum"] + " (" + json_dict["snp2"]["coord"] + ")\n"
outstr += "\n"
outstr += ','.join(json_dict['populations']) + " Haplotypes:\n"
outstr += " " * 15 + json_dict["snp2"]["rsnum"] + '\n'
outstr += " " * 15 + json_dict["snp2"]["allele_1"]["allele"] + " " * 7 + json_dict["snp2"]["allele_2"]["allele"] + '\n'
outstr += " " * 13 + "-" * 17 + '\n'
outstr += " " * 11 + json_dict["snp1"]["allele_1"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c11"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c11"])) + " | " + json_dict["two_by_two"]["cells"]["c12"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c12"])) + " | " + json_dict["snp1"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_1"]["count"])) + " (" + json_dict["snp1"]["allele_1"]["frequency"] + ")\n"
outstr += json_dict["snp1"]["rsnum"] + " " * (10 - len(json_dict["snp1"]["rsnum"])) + " " * 3 + "-" * 17 + '\n'
outstr += " " * 11 + json_dict["snp1"]["allele_2"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c21"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c21"])) + " | " + json_dict["two_by_two"]["cells"]["c22"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c22"])) + " | " + json_dict["snp1"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_2"]["count"])) + " (" + json_dict["snp1"]["allele_2"]["frequency"] + ")\n"
outstr += " " * 13 + "-" * 17 + '\n'
outstr += " " * 15 + json_dict["snp2"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_1"]["count"])) + " " * 3 + json_dict["snp2"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_2"]["count"])) + " " * 3 + json_dict["two_by_two"]["total"] + '\n'
outstr += " " * 14 + "(" + json_dict["snp2"]["allele_1"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_1"]["frequency"])) + " (" + json_dict["snp2"]["allele_2"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_2"]["frequency"])) + '\n'
outstr += "\n"
outstr += " " + json_dict["haplotypes"]["hap1"]["alleles"] + ": " + json_dict["haplotypes"]["hap1"]["count"] + " (" + json_dict["haplotypes"]["hap1"]["frequency"] + ")\n"
outstr += " " + json_dict["haplotypes"]["hap2"]["alleles"] + ": " + json_dict["haplotypes"]["hap2"]["count"] + " (" + json_dict["haplotypes"]["hap2"]["frequency"] + ")\n"
outstr += " " + json_dict["haplotypes"]["hap3"]["alleles"] + ": " + json_dict["haplotypes"]["hap3"]["count"] + " (" + json_dict["haplotypes"]["hap3"]["frequency"] + ")\n"
outstr += " " + json_dict["haplotypes"]["hap4"]["alleles"] + ": " + json_dict["haplotypes"]["hap4"]["count"] + " (" + json_dict["haplotypes"]["hap4"]["frequency"] + ")\n"
outstr += "\n"
outstr += " D': " + json_dict["statistics"]["d_prime"] + '\n'
outstr += " R\u00b2: " + json_dict["statistics"]["r2"] + '\n'
outstr += " Chi-sq: " + json_dict["statistics"]["chisq"] + '\n'
outstr += " p-value: " + json_dict["statistics"]["p"] + '\n'
outstr += "\n"
if len(json_dict["corr_alleles"]) == 2:
outstr += json_dict["corr_alleles"][0] + '\n'
outstr += json_dict["corr_alleles"][1] + '\n'
else:
outstr += json_dict["corr_alleles"][0] + '\n'
return _run.output_json(output)
return output

if 'warning' in json_dict:
outstr += "WARNING: " + json_dict["warning"] + "!\n"

if outfile:
outfile.write(outstr)

return outstr
###############################################################################
# Run as a Script #
###############################################################################


def get_arg_parser():
Expand Down
44 changes: 31 additions & 13 deletions LD_Direction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,51 @@
Organization: Stanford University
License: MIT License, property of Stanford and NCI, use as you wish
Created: 2017-21-21 10:04
Version: 0.1.0a
Version: 0.2.0b1
Modules
-------
snp_link : provides functions and classes to do one-to-one LD comparisons on
either just two SNPs, or a list of SNP pairs
compare_snp_lists : provides functions to do a one-to-many comparison for
arbitrarily large lists of SNPs (limited only by memory)
compare_snp_lists
provides functions to do a one-to-many comparison for arbitrarily large
lists of SNPs (limited only by memory)
snp_link
provides functions and classes to do one-to-one LD comparisons on either
just two SNPs, or a list of SNP pairs
LDpair
Adapted from LDlink, provides LD calculation tools directly from the VCF
file without the need for plink
PLINK
-----
https://www.cog-genomics.org/plink
https://github.com/chrchang/plink-ng/
compare_snp_lists depends entirely upon plink and could not function without
it. Many, many thanks to Christopher Chang for creating it, and for his help
with getting this package working.
LDLink
------
https://analysistools.nci.nih.gov/LDlink
https://github.com/CBIIT/nci-webtools-dceg-linkage
This tool uses the above API entirely, thanks to Mitchell Machiela and the
Chanock lab for writing that tool!
This package makes extensive use of the LDlink API entirely, thanks to Mitchell
Machiela and the Chanock lab for writing that tool!
Citation
~~~~~~~~
Citations
---------
Chang, Christopher C and Chow, Carson C and Tellier, Laurent C A M and
Vattikuti, Shashaank and Purcell, Shaun M and Lee, James J. Second-
generation {PLINK}: {R}ising to the challenge of larger and richer
datasets. GigaScience. 2015. DOI 10.1186/s13742-015-0047-8
Machiela MJ, Chanock SJ. LDlink a web-based application for exploring
population-specific haplotype structure and linking correlated alleles of
possible functional variants. Bioinformatics. 2015 Jul 2. PMID: 26139635.
population-specific haplotype structure and linking correlated alleles of
possible functional variants. Bioinformatics. 2015 Jul 2. PMID: 26139635.
"""
__version__ = '0.1.0a'
__version__ = '0.2.0b1'

__all__ = ['snp_link', 'compare_snp_lists']

# Make core functions easily available
from . import snp_link
from . import compare_snp_lists
from . import snp_link
Loading

0 comments on commit ec403fa

Please sign in to comment.