Merge pull request #4 from MikeDacre/dev

Converting to the Plink many-to-many model for SNP LD Calculations Results in a substantial speed improvement
MikeDacre · May 13, 2017 · ec403fa · ec403fa
2 parents 3466121 + e033f07
commit ec403fa
Show file tree

Hide file tree

Showing 6 changed files with 1,091 additions and 520 deletions.
diff --git a/LD_Direction/LDpair.py b/LD_Direction/LDpair.py
@@ -7,16 +7,18 @@
 """
 import os
 import sys
-import json
 import math
 import argparse
 import subprocess as _sub
 
 # https://github.com/MikeDacre/dbSNP
 import dbSNP
 
-DB_SNP_LOC = '/godot/dbsnp'
-DB_SNP_VER = 150
+from . import _run
+
+###############################################################################
+#                            User Defined Globals                             #
+###############################################################################
 
 POPULATIONS = ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
                "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
@@ -25,7 +27,13 @@
 
 # Set data directories
 DATA_DIR = "/godot/1000genomes/1000GP_Phase3"
-SNP_DB   = "/godot/dbsnp/dbsnp144.db"
+DB_SNP_LOC = '/godot/dbsnp'
+DB_SNP_VER = 150
+
+
+###############################################################################
+#                              Helper Functions                               #
+###############################################################################
 
 
 class SNP_Lookup_Failure(Exception):
@@ -35,42 +43,60 @@ class SNP_Lookup_Failure(Exception):
     pass
 
 
-def output_json(output):
-    """Return JSON formated string."""
-    return json.dumps(output, sort_keys=True, indent=2)
-
-
-def run_cmnd(cmnd):
-    """Run a command and return the output split into a list by newline."""
-    output, _, _ = run(cmnd, raise_on_error=True)
-    return output.strip().split('\n')
-
-
-def run(command, raise_on_error=False):
-    """Run a command with subprocess the way it should be.
+def print_summary(json_dict, outfile=None):
+    """Write a formatted summary to file from a dictionary.
 
     Parameters
     ----------
-    command : str
-        A command to execute, piping is fine.
-    raise_on_error : bool
-        Raise a subprocess.CalledProcessError on exit_code != 0
+    json_dict : dict
+        A dictionary of the kind generated by calculate_pair()
+    filehandle : IOBase, optional
+        An open writable file handle
 
     Returns
     -------
-    stdout : str
-    stderr : str
-    exit_code : int
+    str
+        A formatted string, the same one that is written to outfile if outfile
+        is provided.
     """
-    pp = _sub.Popen(command, shell=True, universal_newlines=True,
-                    stdout=_sub.PIPE, stderr=_sub.PIPE)
-    out, err = pp.communicate()
-    code = pp.returncode
-    if raise_on_error and code != 0:
-        raise _sub.CalledProcessError(
-            returncode=code, cmd=command, output=out, stderr=err
-        )
-    return out, err, code
+    outstr = "Query SNPs:\n"
+    outstr += json_dict["snp1"]["rsnum"] + " (" + json_dict["snp1"]["coord"] + ")\n"
+    outstr += json_dict["snp2"]["rsnum"] + " (" + json_dict["snp2"]["coord"] + ")\n"
+    outstr += "\n"
+    outstr += ','.join(json_dict['populations']) + " Haplotypes:\n"
+    outstr += " " * 15 + json_dict["snp2"]["rsnum"] + '\n'
+    outstr += " " * 15 + json_dict["snp2"]["allele_1"]["allele"] + " " * 7 + json_dict["snp2"]["allele_2"]["allele"] + '\n'
+    outstr += " " * 13 + "-" * 17 + '\n'
+    outstr += " " * 11 + json_dict["snp1"]["allele_1"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c11"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c11"])) + " | " + json_dict["two_by_two"]["cells"]["c12"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c12"])) + " | " + json_dict["snp1"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_1"]["count"])) + " (" + json_dict["snp1"]["allele_1"]["frequency"] + ")\n"
+    outstr += json_dict["snp1"]["rsnum"] + " " * (10 - len(json_dict["snp1"]["rsnum"])) + " " * 3 + "-" * 17 + '\n'
+    outstr += " " * 11 + json_dict["snp1"]["allele_2"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c21"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c21"])) + " | " + json_dict["two_by_two"]["cells"]["c22"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c22"])) + " | " + json_dict["snp1"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_2"]["count"])) + " (" + json_dict["snp1"]["allele_2"]["frequency"] + ")\n"
+    outstr += " " * 13 + "-" * 17 + '\n'
+    outstr += " " * 15 + json_dict["snp2"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_1"]["count"])) + " " * 3 + json_dict["snp2"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_2"]["count"])) + " " * 3 + json_dict["two_by_two"]["total"] + '\n'
+    outstr += " " * 14 + "(" + json_dict["snp2"]["allele_1"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_1"]["frequency"])) + " (" + json_dict["snp2"]["allele_2"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_2"]["frequency"])) + '\n'
+    outstr += "\n"
+    outstr += "          " + json_dict["haplotypes"]["hap1"]["alleles"] + ": " + json_dict["haplotypes"]["hap1"]["count"] + " (" + json_dict["haplotypes"]["hap1"]["frequency"] + ")\n"
+    outstr += "          " + json_dict["haplotypes"]["hap2"]["alleles"] + ": " + json_dict["haplotypes"]["hap2"]["count"] + " (" + json_dict["haplotypes"]["hap2"]["frequency"] + ")\n"
+    outstr += "          " + json_dict["haplotypes"]["hap3"]["alleles"] + ": " + json_dict["haplotypes"]["hap3"]["count"] + " (" + json_dict["haplotypes"]["hap3"]["frequency"] + ")\n"
+    outstr += "          " + json_dict["haplotypes"]["hap4"]["alleles"] + ": " + json_dict["haplotypes"]["hap4"]["count"] + " (" + json_dict["haplotypes"]["hap4"]["frequency"] + ")\n"
+    outstr += "\n"
+    outstr += "          D': " + json_dict["statistics"]["d_prime"] + '\n'
+    outstr += "          R\u00b2: " + json_dict["statistics"]["r2"] + '\n'
+    outstr += "      Chi-sq: " + json_dict["statistics"]["chisq"] + '\n'
+    outstr += "     p-value: " + json_dict["statistics"]["p"] + '\n'
+    outstr += "\n"
+    if len(json_dict["corr_alleles"]) == 2:
+        outstr += json_dict["corr_alleles"][0] + '\n'
+        outstr += json_dict["corr_alleles"][1] + '\n'
+    else:
+        outstr += json_dict["corr_alleles"][0] + '\n'
+
+    if 'warning' in json_dict:
+        outstr += "WARNING: " + json_dict["warning"] + "!\n"
+
+    if outfile:
+        outfile.write(outstr)
+
+    return outstr
 
 
 def get_snp_info(snp):
@@ -142,7 +168,7 @@ def get_snp_info(snp):
     )
     err = None
     try:
-        vcf = run_cmnd(tabix_cmd)
+        vcf = _run.run_cmnd(tabix_cmd)
     except _sub.CalledProcessError as e:
         err = e
         raise
@@ -153,9 +179,11 @@ def get_snp_info(snp):
         )
     vcf = [i for i in vcf if 'END' not in i]
     # Import SNP VCF files
-    if len(vcf) == 0:
+    if not vcf:
         raise SNP_Lookup_Failure(snp + " is not in 1000G reference panel.")
     elif len(vcf) == 1:
+        if not vcf[0].strip():
+            raise SNP_Lookup_Failure(snp + " is not in 1000G reference panel.")
         geno = vcf[0].strip().split()
     else:
         geno = []
@@ -196,11 +224,16 @@ def get_snp_info(snp):
 
     # Get headers
     tabix_header_cmd = "tabix -H {} | grep CHROM".format(vcf_file)
-    head = run_cmnd(tabix_header_cmd)[0].strip().split()
+    head = _run.run_cmnd(tabix_header_cmd)[0].strip().split()
 
     return snp, snp_chrom, snp_loc, geno, allele, head, snp_a1, snp_a2
 
 
+###############################################################################
+#                    Primary Function Adapted from LDpair                     #
+###############################################################################
+
+
 def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
     """Find LD information for any two SNPs.
 
@@ -235,7 +268,7 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
     except SNP_Lookup_Failure as e:
         if return_json:
             output["error"] = str(e)
-            return output_json(output)
+            return _run.output_json(output)
         else:
             raise
 
@@ -263,14 +296,14 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
         err = "VCF File does not match variant coordinates for SNP1."
         if return_json:
             output["error"] = err
-            return output_json(output)
+            return _run.output_json(output)
         else:
             raise SNP_Lookup_Failure(err)
     if geno2[1] != str(snp2_loc):
         err = "VCF File does not match variant coordinates for SNP2."
         if return_json:
             output["error"] = err
-            return output_json(output)
+            return _run.output_json(output)
         else:
             raise SNP_Lookup_Failure(err)
 
@@ -309,7 +342,7 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
         ).format(bad_pops, POPULATIONS)
         if return_json:
             output["error"] = err
-            return output_json(output)
+            return _run.output_json(output)
         else:
             raise SNP_Lookup_Failure(err)
 
@@ -559,64 +592,13 @@ def calculate_pair(snp1, snp2, pops, write_summary=None, return_json=False):
 
     # Return output
     if return_json:
-        return output_json(output)
-    else:
-        return output
-
-def print_summary(json_dict, outfile=None):
-    """Write a formatted summary to file from a dictionary.
-
-    Parameters
-    ----------
-    json_dict : dict
-        A dictionary of the kind generated by calculate_pair()
-    filehandle : IOBase, optional
-        An open writable file handle
-
-    Returns
-    -------
-    str
-        A formatted string, the same one that is written to outfile if outfile
-        is provided.
-    """
-    outstr = "Query SNPs:\n"
-    outstr += json_dict["snp1"]["rsnum"] + " (" + json_dict["snp1"]["coord"] + ")\n"
-    outstr += json_dict["snp2"]["rsnum"] + " (" + json_dict["snp2"]["coord"] + ")\n"
-    outstr += "\n"
-    outstr += ','.join(json_dict['populations']) + " Haplotypes:\n"
-    outstr += " " * 15 + json_dict["snp2"]["rsnum"] + '\n'
-    outstr += " " * 15 + json_dict["snp2"]["allele_1"]["allele"] + " " * 7 + json_dict["snp2"]["allele_2"]["allele"] + '\n'
-    outstr += " " * 13 + "-" * 17 + '\n'
-    outstr += " " * 11 + json_dict["snp1"]["allele_1"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c11"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c11"])) + " | " + json_dict["two_by_two"]["cells"]["c12"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c12"])) + " | " + json_dict["snp1"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_1"]["count"])) + " (" + json_dict["snp1"]["allele_1"]["frequency"] + ")\n"
-    outstr += json_dict["snp1"]["rsnum"] + " " * (10 - len(json_dict["snp1"]["rsnum"])) + " " * 3 + "-" * 17 + '\n'
-    outstr += " " * 11 + json_dict["snp1"]["allele_2"]["allele"] + " | " + json_dict["two_by_two"]["cells"]["c21"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c21"])) + " | " + json_dict["two_by_two"]["cells"]["c22"] + " " * (5 - len(json_dict["two_by_two"]["cells"]["c22"])) + " | " + json_dict["snp1"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp1"]["allele_2"]["count"])) + " (" + json_dict["snp1"]["allele_2"]["frequency"] + ")\n"
-    outstr += " " * 13 + "-" * 17 + '\n'
-    outstr += " " * 15 + json_dict["snp2"]["allele_1"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_1"]["count"])) + " " * 3 + json_dict["snp2"]["allele_2"]["count"] + " " * (5 - len(json_dict["snp2"]["allele_2"]["count"])) + " " * 3 + json_dict["two_by_two"]["total"] + '\n'
-    outstr += " " * 14 + "(" + json_dict["snp2"]["allele_1"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_1"]["frequency"])) + " (" + json_dict["snp2"]["allele_2"]["frequency"] + ")" + " " * (5 - len(json_dict["snp2"]["allele_2"]["frequency"])) + '\n'
-    outstr += "\n"
-    outstr += "          " + json_dict["haplotypes"]["hap1"]["alleles"] + ": " + json_dict["haplotypes"]["hap1"]["count"] + " (" + json_dict["haplotypes"]["hap1"]["frequency"] + ")\n"
-    outstr += "          " + json_dict["haplotypes"]["hap2"]["alleles"] + ": " + json_dict["haplotypes"]["hap2"]["count"] + " (" + json_dict["haplotypes"]["hap2"]["frequency"] + ")\n"
-    outstr += "          " + json_dict["haplotypes"]["hap3"]["alleles"] + ": " + json_dict["haplotypes"]["hap3"]["count"] + " (" + json_dict["haplotypes"]["hap3"]["frequency"] + ")\n"
-    outstr += "          " + json_dict["haplotypes"]["hap4"]["alleles"] + ": " + json_dict["haplotypes"]["hap4"]["count"] + " (" + json_dict["haplotypes"]["hap4"]["frequency"] + ")\n"
-    outstr += "\n"
-    outstr += "          D': " + json_dict["statistics"]["d_prime"] + '\n'
-    outstr += "          R\u00b2: " + json_dict["statistics"]["r2"] + '\n'
-    outstr += "      Chi-sq: " + json_dict["statistics"]["chisq"] + '\n'
-    outstr += "     p-value: " + json_dict["statistics"]["p"] + '\n'
-    outstr += "\n"
-    if len(json_dict["corr_alleles"]) == 2:
-        outstr += json_dict["corr_alleles"][0] + '\n'
-        outstr += json_dict["corr_alleles"][1] + '\n'
-    else:
-        outstr += json_dict["corr_alleles"][0] + '\n'
+        return _run.output_json(output)
+    return output
 
-    if 'warning' in json_dict:
-        outstr += "WARNING: " + json_dict["warning"] + "!\n"
 
-    if outfile:
-        outfile.write(outstr)
-
-    return outstr
+###############################################################################
+#                               Run as a Script                               #
+###############################################################################
 
 
 def get_arg_parser():

diff --git a/LD_Direction/__init__.py b/LD_Direction/__init__.py
@@ -15,33 +15,51 @@
 Organization: Stanford University
 License: MIT License, property of Stanford and NCI, use as you wish
 Created: 2017-21-21 10:04
-Version: 0.1.0a
+Version: 0.2.0b1
 
 Modules
 -------
-snp_link : provides functions and classes to do one-to-one LD comparisons on
-either just two SNPs, or a list of SNP pairs
-compare_snp_lists : provides functions to do a one-to-many comparison for
-arbitrarily large lists of SNPs (limited only by memory)
+compare_snp_lists
+    provides functions to do a one-to-many comparison for arbitrarily large
+    lists of SNPs (limited only by memory)
+snp_link
+    provides functions and classes to do one-to-one LD comparisons on either
+    just two SNPs, or a list of SNP pairs
+LDpair
+    Adapted from LDlink, provides LD calculation tools directly from the VCF
+    file without the need for plink
+
+PLINK
+-----
+https://www.cog-genomics.org/plink
+https://github.com/chrchang/plink-ng/
+
+compare_snp_lists depends entirely upon plink and could not function without
+it. Many, many thanks to Christopher Chang for creating it, and for his help
+with getting this package working.
 
 LDLink
 ------
 https://analysistools.nci.nih.gov/LDlink
 https://github.com/CBIIT/nci-webtools-dceg-linkage
 
-This tool uses the above API entirely, thanks to Mitchell Machiela and the
-Chanock lab for writing that tool!
+This package makes extensive use of the LDlink API entirely, thanks to Mitchell
+Machiela and the Chanock lab for writing that tool!
 
-Citation
-~~~~~~~~
+Citations
+---------
+Chang, Christopher C and Chow, Carson C and Tellier, Laurent C A M and
+    Vattikuti, Shashaank and Purcell, Shaun M and Lee, James J. Second-
+    generation {PLINK}: {R}ising to the challenge of larger and richer
+    datasets. GigaScience. 2015. DOI 10.1186/s13742-015-0047-8
 Machiela MJ, Chanock SJ. LDlink a web-based application for exploring
-population-specific haplotype structure and linking correlated alleles of
-possible functional variants. Bioinformatics. 2015 Jul 2. PMID: 26139635.
+    population-specific haplotype structure and linking correlated alleles of
+    possible functional variants. Bioinformatics. 2015 Jul 2. PMID: 26139635.
 """
-__version__ = '0.1.0a'
+__version__ = '0.2.0b1'
 
 __all__ = ['snp_link', 'compare_snp_lists']
 
 # Make core functions easily available
-from . import snp_link
 from . import compare_snp_lists
+from . import snp_link