create _fit_simple and _fit_ci helpers

phobson · phobson · commit 3a3b401f53e2 · 2016-07-13T22:15:01.000-07:00
diff --git a/probscale/tests/test_viz.py b/probscale/tests/test_viz.py
@@ -393,6 +393,42 @@ def plot_data():
     return data
 
 
+@pytest.mark.parametrize(('fitlogs', 'known_yhat'), [
+    (None, numpy.array([0.7887, 3.8946, 7.0005, 10.1065, 13.2124, 16.3183])),
+    ('x', numpy.array([0.2711, 1.2784, 1.5988, 1.7953, 1.9373, 2.0487])),
+    ('y', numpy.array([2.2006e+00, 4.9139e+01, 1.0972e+03, 2.4501e+04, 5.4711e+05, 1.2217e+07])),
+    ('both', numpy.array([1.3114, 3.5908, 4.9472, 6.0211, 6.9402, 7.7577])),
+])
+def test__fit_simple(plot_data, fitlogs, known_yhat):
+    x = numpy.arange(1, len(plot_data)+1)
+    known_results = {'slope': 0.5177, 'intercept': 0.2711}
+    xhat = x[::6]
+    yhat, results = viz._fit_simple(x, plot_data, xhat, fitlogs=fitlogs)
+    assert abs(results['intercept'] - known_results['intercept']) < 0.0001
+    assert abs(results['slope'] - known_results['slope']) < 0.0001
+    nptest.assert_allclose(yhat, known_yhat, rtol=0.0001)
+
+
+@seed
+@pytest.mark.parametrize(('fitlogs', 'known_lo', 'known_hi'), [
+    (None, numpy.array([-0.7944, 2.7051, 6.1974,  9.2612, 11.9382, 14.4290]),
+           numpy.array([ 2.1447, 4.8360, 7.7140, 10.8646, 14.1014, 17.4432])),
+    ('x', numpy.array([-1.4098, -0.2210, 0.1387, 0.3585, 0.5147, 0.6417]),
+          numpy.array([ 1.7067,  2.5661, 2.8468, 3.0169, 3.1400, 3.2341])),
+    ('y', numpy.array([4.5187e-01, 1.4956e+01, 4.9145e+02, 1.0522e+04, 1.5299e+05, 1.8468e+06]),
+          numpy.array([8.5396e+00, 1.2596e+02, 2.2396e+03, 5.2290e+04, 1.3310e+06, 3.7627e+07])),
+    ('both', numpy.array([0.2442,  0.8017,  1.1488,  1.4312,  1.6731,  1.8997]),
+             numpy.array([5.5107, 13.0148 , 17.232, 20.4285, 23.1035, 25.3843])),
+])
+def test__fit_ci(plot_data, fitlogs, known_lo, known_hi):
+    x = numpy.arange(1, len(plot_data)+1)
+    xhat = x[::6]
+    yhat_lo, yhat_hi = viz._fit_ci(x, plot_data, xhat, fitlogs=fitlogs, niter=1000)
+
+    nptest.assert_allclose(yhat_lo, known_lo, rtol=0.001)
+    nptest.assert_allclose(yhat_hi, known_hi, rtol=0.001)
+
+
 @pytest.mark.mpl_image_compare(baseline_dir=BASELINE_DIR, tolerance=10)
 def test_probplot_prob(plot_data):
     fig, ax = plt.subplots()
diff --git a/probscale/viz.py b/probscale/viz.py
@@ -1,4 +1,6 @@
-﻿import numpy
+﻿import copy
+
+import numpy
 from matplotlib import pyplot
 
 from .probscale import _minimal_norm
@@ -387,17 +389,21 @@ def fit_line(x, y, xhat=None, fitprobs=None, fitlogs=None, dist=None,
     -------
     xhat, yhat : numpy arrays
         Linear model estimates of ``x`` and ``y``.
-    results : a statmodels result object
-        The object returned by numpy.polyfit
+    results : dict
+        Dictionary of linear fit results. Keys include:
 
-    """
+          - slope
+          - intercept
+          - yhat_lo (lower confidence interval of the estimated y-vals)
+          - yhat_hi (upper confidence interval of the estimated y-vals)
 
+    """
     fitprobs = validate.fit_argument(fitprobs, "fitprobs")
     fitlogs = validate.fit_argument(fitlogs, "fitlogs")
 
     # maybe set xhat to default values
     if xhat is None:
-        xhat = numpy.array([numpy.min(x), numpy.max(x)])
+        xhat = copy.copy(x)
 
     # maybe set dist to default value
     if dist is None:
@@ -420,23 +426,109 @@ def fit_line(x, y, xhat=None, fitprobs=None, fitlogs=None, dist=None,
     if fitlogs in ['y', 'both']:
         y = numpy.log(y)
 
-    # do the best-fit
-    coeffs = numpy.polyfit(x, y, 1)
+    yhat, results =  _fit_simple(x, y, xhat, fitlogs=fitlogs)
 
-    # estimate y values
-    yhat = _estimate_from_fit(xhat, coeffs[0], coeffs[1],
-                                  xlog=fitlogs in ['x', 'both'],
-                                  ylog=fitlogs in ['y', 'both'])
+    if estimate_ci:
+        yhat_lo, yhat_hi = _fit_ci(x, y, xhat, fitlogs=fitlogs,
+                                   niter=niter, alpha=alpha)
+    else:
+        yhat_lo, yhat_hi = None, None
 
     # maybe undo the ppf transform
     if fitprobs in ['y', 'both']:
-        yhat = 100.* dist.cdf(yhat)
+        yhat = 100. * dist.cdf(yhat)
+        if yhat_lo is not None:
+            yhat_lo = 100. * dist.cdf(yhat_lo)
+            yhat_hi = 100. * dist.cdf(yhat_hi)
 
     # maybe undo ppf transform
     if fitprobs in ['x', 'both']:
-        xhat = 100.* dist.cdf(xhat)
+        xhat = 100. * dist.cdf(xhat)
+
+    results['yhat_lo'] = yhat_lo
+    results['yhat_hi'] = yhat_hi
+
+    return xhat, yhat, results
+
+
+def _fit_simple(x, y, xhat, fitlogs=None):
+    """
+    Simple linear fit of x and y data using ``numpy.polyfit``.
+
+    Parameters
+    ----------
+    x, y : array-like
+    fitlogs : str, optional.
+        Defines which data should be log-transformed. Valid values are
+        'x', 'y', or 'both'.
+
+    Returns
+    -------
+    yhat : array-like
+        Estimates of y at the ``xhat`` values based on the linear fit
+    results : dict
+        Dictionary of the fit coefficients
+
+    See also
+    --------
+    numpy.polyfit
+
+    """
+
+    # do the best-fit
+    coeffs = numpy.polyfit(x, y, 1)
+
+    results = {
+        'slope': coeffs[0],
+        'intercept': coeffs[1]
+    }
+
+    # estimate y values
+    yhat = _estimate_from_fit(xhat, coeffs[0], coeffs[1],
+                              xlog=fitlogs in ['x', 'both'],
+                              ylog=fitlogs in ['y', 'both'])
+
+    return yhat, results
+
+
+def _fit_ci(x, y, xhat, fitlogs=None, niter=10000, alpha=0.05):
+    """
+    Percentile method bootstrapping of linear fit of x and y data using
+    ``numpy.polyfit``.
+
+    Parameters
+    ----------
+    x, y : array-like
+    fitlogs : str, optional.
+        Defines which data should be log-transformed. Valid values are
+        'x', 'y', or 'both'.
+    niter : int, optional (default is 10000)
+        Number of bootstrap iterations to use
+    alpha : float, optional (default is 0.05)
+        Significance level; limits are the alpha/2, 1 - alpha/2 quantiles.
+
+    Returns
+    -------
+    yhat_lo, yhat_hi : array-like
+        Lower and upper percentile-bootstrap limits of the estimated
+        y-values at ``xhat``, taken across the ``niter`` resampled
+        linear fits.
+
+    See also
+    --------
+    numpy.polyfit
+
+    """
+
+    index = _make_boot_index(len(x), niter)
+    yhat_array = numpy.array([
+        _fit_simple(x[ii], y[ii], xhat, fitlogs=fitlogs)[0]
+        for ii in index
+    ])
 
-    return xhat, yhat, coeffs
+    percentiles = 100 * numpy.array([alpha*0.5, 1 - alpha*0.5])
+    yhat_lo, yhat_hi = numpy.percentile(yhat_array, percentiles, axis=0)
+    return yhat_lo, yhat_hi
 
 
 def _estimate_from_fit(xdata, slope, intercept, xlog=False, ylog=False):