Skip to content

Commit

Permalink
Fixing get_pmcodes, making multi-level indexing optional, and adding …
Browse files Browse the repository at this point in the history
…missing data to get_qwdata and get_gwlevels (#53)

* New get_pmcodes function

This upload fixes issue #17.

* Update nwis.py

* Update nwis.py

* Update waterdata_pmcodes.txt

* This PR fixes issue #18.

* Update waterdata_pmcodes.txt

* fix the metadata test for variable_info

* update tests to mock multiple parameter_cd

* update test_get_info parameter_cd to a list

* This upload fixes issue #20

* Fix issue #21

* This upload fixes issue #22

* Revert "This upload fixes issue #20"

This reverts commit 938c3d5.

* Revert 66b8b7c, acd1a54, 938c3d5

* Making multi-level indexing optional

This modification addresses issue #25 by adding an optional multi_index parameter to some functions. When multi_index=False, the output is a dataframe with a single-level index (datetime), regardless of the number of sites being queried.

* Update nwis.py

* Returning all raw data from get_qwdata and get_gwlevels

This addresses issue #27 by modifying the function that parses the date and time for get_qwdata and get_gwlevels so that the original date, time, and time zone columns are kept in the output, avoiding data loss.

Co-authored-by: Scott Black <[email protected]>
Co-authored-by: Nouri1992 <[email protected]>
Co-authored-by: Nouri1992 <[email protected]>
  • Loading branch information
4 people authored Nov 14, 2022
1 parent f10dc03 commit 7164878
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 79 deletions.
Binary file added .DS_Store
Binary file not shown.
112 changes: 70 additions & 42 deletions dataretrieval/nwis.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,18 @@
WATERDATA_BASE_URL = 'https://nwis.waterdata.usgs.gov/'
WATERDATA_URL = WATERDATA_BASE_URL + 'nwis/'
WATERSERVICE_URL = 'https://waterservices.usgs.gov/nwis/'
PARAMCODES_URL = 'https://help.waterdata.usgs.gov/code/parameter_cd_nm_query?'
ALLPARAMCODES_URL = 'https://help.waterdata.usgs.gov/code/parameter_cd_query?'

WATERSERVICES_SERVICES = ['dv', 'iv', 'site', 'stat', 'gwlevels']
WATERDATA_SERVICES = ['qwdata', 'measurements', 'peaks', 'pmcodes', 'water_use', 'ratings']


def format_response(df, service=None):
def format_response(df, service=None, **kwargs):
"""Setup index for response from query.
"""
mi = kwargs.pop('multi_index', True)

if service == 'peaks':
df = preformat_peaks_response(df)

Expand All @@ -33,7 +37,7 @@ def format_response(df, service=None):
# XXX: consider making site_no index
return df

elif len(df['site_no'].unique()) > 1:
elif len(df['site_no'].unique()) > 1 and mi:
# setup multi-index
df.set_index(['site_no', 'datetime'], inplace=True)
if hasattr(df.index.levels[1], 'tzinfo') and df.index.levels[1].tzinfo is None:
Expand All @@ -53,7 +57,8 @@ def preformat_peaks_response(df):
return df


def get_qwdata(datetime_index=True, wide_format=True, sites=None, start=None, end=None, **kwargs):
def get_qwdata(datetime_index=True, wide_format=True, sites=None,
start=None, end=None, multi_index=True,**kwargs):
"""
Get water sample data from qwdata service.
Expand All @@ -69,6 +74,8 @@ def get_qwdata(datetime_index=True, wide_format=True, sites=None, start=None, en
If the qwdata parameter begin_date is supplied, it will overwrite the start parameter
end: string
If the qwdata parameter end_date is supplied, it will overwrite the end parameter
multi_index: boolean
If False, a dataframe with a single-level index (datetime) is returned
Returns:
DataFrame containing times series data from the NWIS json and Metadata as tuple
Expand All @@ -79,7 +86,7 @@ def get_qwdata(datetime_index=True, wide_format=True, sites=None, start=None, en
end = kwargs.pop('end_date', end)
sites = kwargs.pop('site_no', sites)
return _qwdata(site_no=sites, begin_date=start, end_date=end, datetime_index=datetime_index,
** kwargs)
multi_index=multi_index, ** kwargs)

def _qwdata(datetime_index=True, **kwargs):
# check number of sites, may need to create multiindex
Expand Down Expand Up @@ -122,8 +129,7 @@ def _qwdata(datetime_index=True, **kwargs):
df = format_datetime(df, 'sample_dt', 'sample_tm',
'sample_start_time_datum_cd')

df = format_response(df)
return df, _set_metadata(response, **kwargs)
return format_response(df, **kwargs), _set_metadata(response, **kwargs)


def get_discharge_measurements(sites=None, start=None, end=None, **kwargs):
Expand Down Expand Up @@ -153,7 +159,7 @@ def _discharge_measurements(**kwargs):
return _read_rdb(response.text), _set_metadata(response, **kwargs)


def get_discharge_peaks(sites=None, start=None, end=None, **kwargs):
def get_discharge_peaks(sites=None, start=None, end=None, multi_index=True, **kwargs):
"""
Get discharge peaks from the waterdata service.
Expand All @@ -172,18 +178,18 @@ def get_discharge_peaks(sites=None, start=None, end=None, **kwargs):
start = kwargs.pop('begin_date', start)
end = kwargs.pop('end_date', end)
sites = kwargs.pop('site_no', sites)
return _discharge_peaks(site_no=sites, begin_date=start, end_date=end, **kwargs)
return _discharge_peaks(site_no=sites, begin_date=start, end_date=end, multi_index=multi_index, **kwargs)


def _discharge_peaks(**kwargs):
response = query_waterdata('peaks', format='rdb', **kwargs)

df = _read_rdb(response.text)

return format_response(df, service='peaks'), _set_metadata(response, **kwargs)
return format_response(df, service='peaks', **kwargs), _set_metadata(response, **kwargs)


def get_gwlevels(start='1851-01-01', end=None, **kwargs):
def get_gwlevels(start='1851-01-01', end=None, multi_index=True, **kwargs):
"""
Queries the groundwater level service from waterservices
Expand All @@ -200,16 +206,17 @@ def get_gwlevels(start='1851-01-01', end=None, **kwargs):
"""
start = kwargs.pop('startDT', start)
end = kwargs.pop('endDT', end)
return _gwlevels(startDT=start, endDT=end, **kwargs)
return _gwlevels(startDT=start, endDT=end, multi_index=multi_index, **kwargs)


def _gwlevels(**kwargs):

response = query_waterservices('gwlevels', **kwargs)

df = _read_rdb(response.text)
df = format_datetime(df, 'lev_dt', 'lev_tm', 'lev_tz_cd')

return format_response(df), _set_metadata(response, **kwargs)
return format_response(df, **kwargs), _set_metadata(response, **kwargs)


def get_stats(sites, **kwargs):
Expand Down Expand Up @@ -294,7 +301,7 @@ def query_waterservices(service, **kwargs):
return query(url, payload=kwargs)


def get_dv(start=None, end=None, **kwargs):
def get_dv(start=None, end=None, multi_index=True, **kwargs):
"""
Get daily values data from NWIS and return it as a DataFrame.
Expand All @@ -312,15 +319,14 @@ def get_dv(start=None, end=None, **kwargs):
"""
start = kwargs.pop('startDT', start)
end = kwargs.pop('endDT', end)
return _dv(startDT=start, endDT=end, **kwargs)
return _dv(startDT=start, endDT=end, multi_index=multi_index, **kwargs)


def _dv(**kwargs):
response = query_waterservices('dv', format='json', **kwargs)
df = _read_json(response.json())

df = format_response(df)
return df, _set_metadata(response, **kwargs)
return format_response(df, **kwargs), _set_metadata(response, **kwargs)


def get_info(**kwargs):
Expand Down Expand Up @@ -411,7 +417,7 @@ def get_info(**kwargs):
return _read_rdb(response.text), _set_metadata(response, **kwargs)


def get_iv(start=None, end=None, **kwargs):
def get_iv(start=None, end=None, multi_index=True, **kwargs):
"""Get instantaneous values data from NWIS and return it as a DataFrame.
Note: If no start or end date are provided, only the most recent record is returned.
Expand All @@ -428,40 +434,63 @@ def get_iv(start=None, end=None, **kwargs):
"""
start = kwargs.pop('startDT', start)
end = kwargs.pop('endDT', end)
return _iv(startDT=start, endDT=end, **kwargs)
return _iv(startDT=start, endDT=end, multi_index=multi_index, **kwargs)


def _iv(**kwargs):
response = query_waterservices('iv', format='json', **kwargs)
return _read_json(response.json()), _set_metadata(response, **kwargs)
df = _read_json(response.json())
return format_response(df, **kwargs), _set_metadata(response, **kwargs)


def get_pmcodes(parameterCd='All', **kwargs):
def get_pmcodes(parameterCd = 'All', partial = True):
"""
Return a DataFrame containing all NWIS parameter codes.
Note: NWIS may return incorrect column names. Rename them with
Returns a DataFrame containing all NWIS parameter code information.
>>> df.rename(columns={key:value})
Parameters (Additional parameters, if supplied, will be used as query parameters).
Parameters
----------
parameterCd: string or listlike
parameterCd: string or list
Accepts parameter codes or names
partial: boolean
Default is True (partial querying). If False, the function will query only exact matches.
Returns:
DataFrame containing the USGS parameter codes and Metadata as tuple
"""
payload = {'radio_pm_search' : 'pm_search',
'pm_group' : 'All+--+include+all+parameter+groups',
'pm_search' : parameterCd,
'casrn_search' : None,
'srsname_search' : None,
'show' : ['parameter_group_nm', 'casrn', 'srsname','parameter_units', 'parameter_nm'],
'format' : 'rdb'}
if parameterCd is None:
raise TypeError('The query must include a parameter name or code')

payload.update(kwargs)
url = WATERDATA_URL + 'pmcodes/pmcodes'
response = query(url, payload)
return _read_rdb(response.text), _set_metadata(response, **kwargs)
payload = {'fmt':'rdb'}
url = PARAMCODES_URL

if isinstance(parameterCd, str): # when a single code or name is given
if parameterCd.lower() == "all":
payload.update({'group_cd': '%'})
url = ALLPARAMCODES_URL
response = query(url, payload)
return _read_rdb(response.text), _set_metadata(response)

else:
parameterCd = [parameterCd]

if not isinstance(parameterCd, list):
raise TypeError('Parameter information (code or name) must be type string or list')

# Querying with a list of parameters names, codes, or mixed
l = []
for param in parameterCd:
if isinstance(param, str):
if partial:
param ='%{0}%'.format(param)
payload.update({'parm_nm_cd':param})
response = query(url, payload)
if len(response.text.splitlines()) < 10: # empty query
raise TypeError('One of the parameter codes or names entered does not return any information,'\
' please try a different value')
l.append(_read_rdb(response.text))
else:
raise TypeError('Parameter information (code or name) must be type string')
return pd.concat(l), _set_metadata(response)


def get_water_use(years="ALL", state=None, counties="ALL", categories="ALL"):
Expand Down Expand Up @@ -625,7 +654,7 @@ def get_record(sites=None, start=None, end=None, state=None,
raise TypeError('{} service not yet implemented'.format(service))


def _read_json(json, multi_index=False):
def _read_json(json):
"""
Reads a NWIS Water Services formatted JSON into a DataFrame.
Expand Down Expand Up @@ -690,7 +719,6 @@ def _read_json(json, multi_index=False):
merged_df = update_merge(merged_df, record_df, na_only=True,
on=['site_no', 'datetime'])

merged_df = format_response(merged_df)
return merged_df


Expand All @@ -714,7 +742,7 @@ def _read_rdb(rdb):

fields = re.split("[\t]", rdb.splitlines()[count])
fields = [field.replace(",", "") for field in fields]
dtypes = {'site_no': str, 'dec_long_va': float, 'dec_lat_va': float}
dtypes = {'site_no': str, 'dec_long_va': float, 'dec_lat_va': float, 'parm_cd': str, 'parameter_cd':str}

df = pd.read_csv(StringIO(rdb), delimiter='\t', skiprows=count + 2,
names=fields, na_values='NaN', dtype=dtypes)
Expand All @@ -741,7 +769,7 @@ def _set_metadata(response, **parameters):

if 'parameterCd' in parameters:
md.variable_info = lambda: get_pmcodes(parameterCd=parameters['parameterCd'])

comments = ""
for line in response.text.splitlines():
if line.startswith("#"):
Expand Down
6 changes: 3 additions & 3 deletions dataretrieval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ def format_datetime(df, date_field, time_field, tz_field):
#create a datetime index from the columns in qwdata response
df[tz_field] = df[tz_field].map(tz)

df['datetime'] = pd.to_datetime(df.pop(date_field) + ' ' +
df.pop(time_field) + ' ' +
df.pop(tz_field),
df['datetime'] = pd.to_datetime(df[date_field] + ' ' +
df[time_field] + ' ' +
df[tz_field],
format = '%Y-%m-%d %H:%M',
utc=True)

Expand Down
28 changes: 10 additions & 18 deletions tests/data/waterdata_pmcodes.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,10 @@
#
#
# USGS Parameter Code Definitions File
#
# Column Definitions
# parameter_cd - Parameter Code
# parameter_group_nm - Group Name
# casrn - CASRN
# srsname - SRSName
# parameter_units - Parameter Unit
# parameter_nm - Parameter Name/Description
#
#
# Date Retrieved: 2020-04-16 18:48:43 EDT
#
parameter_cd parameter_group_nm casrn srsname parameter_units parameter_nm
5s 30s 170s 170s 16s 170s
00618 Nutrient Nitrate, water, filtered, milligrams per liter as nitrogen 14797-55-8 Nitrate mg/l as N
#
# National Water Information System
# 2022/06/08
#
#
# Date Retrieved: USGS Water Data for the Nation Help System
#
parameter_cd group parm_nm epa_equivalence result_statistical_basis result_time_basis result_weight_basis result_particle_size_basis result_sample_fraction result_temperature_basis CASRN SRSName parm_unit
5s 8s 58s 5s 0s 0s 0s 0s 9s 0s 10s 7s 9s
00618 Nutrient Nitrate, water, filtered, milligrams per liter as nitrogen Agree Dissolved 14797-55-8 Nitrate mg/l as N
Loading

0 comments on commit 7164878

Please sign in to comment.