Skip to content

Commit b005fc9

Browse files
authored
Merge pull request #3173 from keflavich/cdms_cats
CDMS: add whole-molecule query tool & refactor metadata acquisition
2 parents ca584f6 + 742289f commit b005fc9

File tree

11 files changed

+2904
-1227
lines changed

11 files changed

+2904
-1227
lines changed

CHANGES.rst

+11
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ New Tools and Services
88
Service fixes and enhancements
99
------------------------------
1010

11+
linelists.cdms
12+
^^^^^^^^^^^^^^
13+
14+
- Add whole catalog retrieval, improve error messaging for unparseable lines,
15+
improve metadata catalog, and improve lookuptable behavior [#3173,#2901]
16+
17+
jplspec
18+
^^^^^^^
19+
20+
- Minor improvement to lookuptable behavior [#3173,#2901]
21+
1122

1223
Infrastructure, Utility and Other Changes and Additions
1324
-------------------------------------------------------

astroquery/jplspec/lookup_table.py

+10-7
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44

55
class Lookuptable(dict):
66

7-
def find(self, s, flags):
7+
def find(self, st, flags):
88
"""
99
Search dictionary keys for a regex match to string st
1010
1111
Parameters
1212
----------
13-
s : str
13+
st : str
1414
String to compile as a regular expression
1515
Can be entered non-specific for broader results
1616
('H2O' yields 'H2O' but will also yield 'HCCCH2OD')
@@ -22,17 +22,20 @@ def find(self, s, flags):
2222
2323
Returns
2424
-------
25-
The list of values corresponding to the matches
25+
A dictionary of the key/value pairs whose keys match the regex (an exact key match is returned on its own, without a regex search)
2626
2727
"""
2828

29-
R = re.compile(s, flags)
29+
if st in self:
30+
return {st: self[st]}
31+
32+
R = re.compile(st, flags)
3033

3134
out = {}
3235

33-
for k, v in self.items():
34-
match = R.search(str(k))
36+
for key, val in self.items():
37+
match = R.search(str(key))
3538
if match:
36-
out[k] = v
39+
out[key] = val
3740

3841
return out

astroquery/linelists/cdms/__init__.py

+12
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,25 @@ class Conf(_config.ConfigNamespace):
1414
Configuration parameters for `astroquery.linelists.cdms`.
1515
"""
1616
server = _config.ConfigItem(
17+
'https://cdms.astro.uni-koeln.de/',
18+
'CDMS Search and Conversion Form URL.')
19+
20+
search = _config.ConfigItem(
1721
'https://cdms.astro.uni-koeln.de/cgi-bin/cdmssearch',
1822
'CDMS Search and Conversion Form URL.')
1923

2024
catfile_url = _config.ConfigItem(
2125
'https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html',
2226
'CDMS partition function table listing all available molecules.')
2327

28+
catfile_url2 = _config.ConfigItem(
29+
'https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html',
30+
'CDMS catalog table listing all available molecules (with different names from partition function).')
31+
32+
classic_server = _config.ConfigItem(
33+
'https://cdms.astro.uni-koeln.de/classic',
34+
'CDMS Classic Molecule List server.')
35+
2436
timeout = _config.ConfigItem(
2537
60,
2638
'Time limit for connecting to the CDMS server.')

astroquery/linelists/cdms/core.py

+208-14
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from bs4 import BeautifulSoup
77
import astropy.units as u
8+
from astropy import table
89
from astropy.io import ascii
910
from astroquery.query import BaseQuery
1011
from astroquery.utils import async_to_sync
@@ -26,8 +27,11 @@ def data_path(filename):
2627
@async_to_sync
2728
class CDMSClass(BaseQuery):
2829
# use the Configuration Items imported from __init__.py
29-
URL = conf.server
30+
URL = conf.search
31+
SERVER = conf.server
32+
CLASSIC_URL = conf.classic_server
3033
TIMEOUT = conf.timeout
34+
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']
3135

3236
def query_lines_async(self, min_frequency, max_frequency, *,
3337
min_strength=-500, molecule='All',
@@ -143,8 +147,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
143147
else:
144148
payload['Molecules'] = molecule
145149

146-
payload = list(payload.items())
147-
148150
if get_query_payload:
149151
return payload
150152
# BaseQuery classes come with a _request method that includes a
@@ -170,6 +172,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
170172
response2 = self._request(method='GET', url=fullurl,
171173
timeout=self.TIMEOUT, cache=cache)
172174

175+
# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
176+
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
177+
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
178+
if payload['Molecules'] in badlist:
179+
raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "
180+
f"Try get_molecule({payload['Molecules']}) instead.")
181+
173182
return response2
174183

175184
def _parse_result(self, response, *, verbose=False):
@@ -278,8 +287,9 @@ def _parse_result(self, response, *, verbose=False):
278287

279288
return result
280289

281-
def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
282-
catfile_url=conf.catfile_url):
290+
def get_species_table(self, *, catfile='partfunc.cat', use_cached=True,
291+
catfile_url=conf.catfile_url,
292+
catfile2='catdir.cat', catfile_url2=conf.catfile_url2):
283293
"""
284294
A directory of the catalog is found in a file called 'catdir.cat.'
285295
@@ -302,9 +312,35 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
302312
"""
303313

304314
if use_cached:
305-
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
315+
try:
316+
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
317+
result2 = ascii.read(data_path(catfile2), format='fixed_width', delimiter='|')
318+
except UnicodeDecodeError:
319+
with open(data_path(catfile), 'rb') as fh:
320+
content = fh.read()
321+
text = content.decode('ascii', errors='replace')
322+
result = ascii.read(text, format='basic', delimiter='|')
323+
with open(data_path(catfile2), 'rb') as fh:
324+
content = fh.read()
325+
text = content.decode('ascii', errors='replace')
326+
result2 = ascii.read(text, format='basic', delimiter='|')
306327
else:
307328
result = retrieve_catfile(catfile_url)
329+
result2 = retrieve_catfile2(catfile_url2)
330+
result.write(data_path(catfile), format='ascii.fixed_width', delimiter='|', overwrite=True)
331+
result2.write(data_path(catfile2), format='ascii.fixed_width', delimiter='|', overwrite=True)
332+
333+
merged = table.join(result, result2, keys=['tag'])
334+
if not all(merged['#lines'] == merged['# lines']):
335+
raise ValueError("Inconsistent table of molecules from CDMS.")
336+
del merged['# lines']
337+
338+
# reorder columns
339+
result = merged[['tag', 'molecule', 'Name', '#lines', 'lg(Q(1000))',
340+
'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))', 'lg(Q(150))', 'lg(Q(75))',
341+
'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
342+
'lg(Q(2.725))',
343+
'Ver.', 'Documentation', 'Date of entry', 'Entry']]
308344

309345
meta = {'lg(Q(1000))': 1000.0,
310346
'lg(Q(500))': 500.0,
@@ -331,6 +367,96 @@ def tryfloat(x):
331367
result.meta = {'Temperature (K)': [1000., 500., 300., 225., 150., 75.,
332368
37.5, 18.75, 9.375, 5., 2.725]}
333369

370+
result.add_index('tag')
371+
372+
return result
373+
374+
def get_molecule(self, molecule_id, *, cache=True):
    """
    Retrieve the whole molecule table for a given molecule id

    Parameters
    ----------
    molecule_id : str
        The six-digit molecule tag as a string (leading zeros included),
        e.g. ``'028582'``.  It is inserted into the catalog file URL
        ``{CLASSIC_URL}/entries/c{molecule_id}.cat``.
    cache : bool
        Passed through to ``self._request``.

    Returns
    -------
    result
        The table produced by ``_parse_cat`` for the downloaded catalog
        file.  Its ``meta`` is replaced by the row of the species table
        whose ``tag`` equals ``int(molecule_id)``.

    Raises
    ------
    ValueError
        If ``molecule_id`` is not a string of exactly six ASCII digits.
    """
    # Reject anything that is not exactly six ASCII digits *before* any
    # network traffic: the id is later converted with int() and pasted into
    # a URL, so letters or whitespace would either fail late or silently
    # request the wrong file.
    if (not isinstance(molecule_id, str)
            or len(molecule_id) != 6
            or not (molecule_id.isascii() and molecule_id.isdigit())):
        raise ValueError("molecule_id should be a length-6 string of numbers")

    url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
    response = self._request(method='GET', url=url,
                             timeout=self.TIMEOUT, cache=cache)
    result = self._parse_cat(response)

    # attach the species-table metadata for this tag to the line table
    species_table = self.get_species_table()
    result.meta = dict(species_table.loc[int(molecule_id)])

    return result
389+
390+
def _parse_cat(self, response, *, verbose=False):
    """
    Parse a catalog response into an `~astropy.table.Table`

    See details in `_parse_result`; this is a very similar function,
    but the catalog responses have a slightly different format.

    Parameters
    ----------
    response : object
        Response whose ``text`` attribute holds the catalog file contents.
    verbose : bool
        Accepted for signature parity; not used by this function.

    Returns
    -------
    result
        Table with the columns FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT
        and Q1..Q14 read at fixed character offsets, plus the derived
        columns MOLWT and Lab.

    Raises
    ------
    EmptyResponseError
        If the response text contains 'Zero lines were found'.
    """

    text = response.text

    if 'Zero lines were found' in text:
        raise EmptyResponseError(f"Response was empty; message was '{text}'.")

    # notes about the format
    # [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
    # 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
    starts = {'FREQ': 0,
              'ERR': 14,
              'LGINT': 22,
              'DR': 30,
              'ELO': 32,
              'GUP': 42,
              'TAG': 45,
              'QNFMT': 52,
              'Q1': 56,
              'Q2': 58,
              'Q3': 60,
              'Q4': 62,
              'Q5': 64,
              'Q6': 66,
              'Q7': 68,
              'Q8': 70,
              'Q9': 72,
              'Q10': 74,
              'Q11': 76,
              'Q12': 78,
              'Q13': 80,
              'Q14': 82,
              }

    # lines matching the ``comment`` regex are skipped by the reader
    result = ascii.read(text, header_start=None, data_start=0,
                        comment=r'THIS|^\s{12,14}\d{4,6}.*',
                        names=list(starts.keys()),
                        col_starts=list(starts.values()),
                        format='fixed_width', fast_reader=False)

    # int truncates - which is what we want
    result['MOLWT'] = [int(x/1e4) for x in result['TAG']]

    result['FREQ'].unit = u.MHz
    result['ERR'].unit = u.MHz

    # a negative TAG (hence negative MOLWT) sets the 'Lab' flag; the sign is
    # then dropped from MOLWT itself
    result['Lab'] = result['MOLWT'] < 0
    result['MOLWT'] = np.abs(result['MOLWT'])
    result['MOLWT'].unit = u.Da

    # Only GUP is converted with parse_letternumber.  The quantum-number
    # columns Q1..Q14 are left as the table reader produced them: a loop that
    # was meant to add them here iterated over an empty string and therefore
    # never ran.  Behavior is kept as-is.
    # TODO(review): confirm whether Q1..Q14 should be converted as well.
    fix_keys = ['GUP']
    for key in fix_keys:
        if not np.issubdtype(result[key].dtype, np.integer):
            intcol = np.array(list(map(parse_letternumber, result[key])),
                              dtype=int)
            result[key] = intcol

    result['LGINT'].unit = u.nm**2 * u.MHz
    result['ELO'].unit = u.cm**(-1)

    return result
335461

336462

@@ -375,10 +501,13 @@ def find(self, st, flags):
375501
376502
Returns
377503
-------
378-
The list of values corresponding to the matches
504+
A dictionary of the key/value pairs whose keys match the regex (an exact key match is returned on its own, without a regex search)
379505
380506
"""
381507

508+
if st in self:
509+
return {st: self[st]}
510+
382511
out = {}
383512

384513
for kk, vv in self.items():
@@ -394,24 +523,89 @@ def find(self, st, flags):
394523
def build_lookup():
    """
    Build a `Lookuptable` mapping species names to their CDMS tags.

    Both the 'molecule' and the 'Name' columns of the species table are used
    as keys; where the same key occurs in both, the 'Name' entry wins.
    """
    species = CDMS.get_species_table()

    # keys from the 'molecule' column first ...
    name_to_tag = dict(zip(list(species['molecule'][:]),
                           list(species['tag'][:])))

    # ... then the 'Name' column on top, overriding duplicate keys
    name_to_tag.update(zip(list(species['Name'][:]),
                           list(species['tag'][:])))

    return Lookuptable(name_to_tag)
403541

404542

405-
def retrieve_catfile(url=f'{conf.classic_server}/entries/partition_function.html'):
    """
    Download the CDMS partition function listing and parse it into a table.

    Parameters
    ----------
    url : str
        Address of the partition function page.

    Returns
    -------
    tbl : `~astropy.table.Table`
        One row per listed species with the columns ``tag``, ``molecule``,
        ``#lines`` and the ``lg(Q(T))`` partition function columns.  Values
        that cannot be converted to float (e.g. '---') become NaN.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    partfunc_names = ['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
                      'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
                      'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))']

    # used to convert '---' to nan
    def tryfloat(x):
        try:
            return float(x)
        except ValueError:
            return np.nan

    # without a timeout, requests waits forever on an unresponsive server
    response = requests.get(url, timeout=conf.timeout)
    response.raise_for_status()
    lines = response.text.split("\n")

    # the 'fixed width' table reader fails because there are rows that violate
    # fixed width, so each row is split by hand instead
    tbl_rows = []
    # the first 15 and the last 5 lines of the page are not table rows
    for row in lines[15:-5]:
        if not row.strip():
            # a stray blank line carries no data; skip it rather than crash
            continue
        tag = int(row.split()[0])
        # characters 7-40 hold the species name followed by the line count;
        # the name itself may contain spaces, so the last token is the count
        name_and_count = row[7:41].split()
        molecule = " ".join(name_and_count[:-1])
        nlines = int(name_and_count[-1])
        partfunc = dict(zip(partfunc_names, map(tryfloat, row[41:].split())))
        tbl_rows.append({'tag': tag,
                         'molecule': molecule,
                         '#lines': nlines,
                         **partfunc})

    return table.Table(tbl_rows)
583+
584+
585+
def retrieve_catfile2(url=f'{conf.classic_server}/predictions/catalog/catdir.html'):
    """
    Download the CDMS catalog directory (HTML table) and parse it.

    Parameters
    ----------
    url : str
        Address of the catalog directory page.

    Returns
    -------
    tbl : `~astropy.table.Table`
        The HTML table with the 'Catalog' column removed, 'Tag' renamed to
        'tag' and the 'Entry in cm-1' column (whatever dash character the
        page uses) renamed to 'Entry'.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # without a timeout, requests waits forever on an unresponsive server
    response = requests.get(url, timeout=conf.timeout)
    response.raise_for_status()
    try:
        tbl = ascii.read(response.text, format='html')
    except UnicodeDecodeError:
        # based on https://github.com/astropy/astropy/issues/3826#issuecomment-256113937
        # which suggests to start with the bytecode content and decode with 'replace errors'
        text = response.content.decode('ascii', errors='replace')
        tbl = ascii.read(text, format='html')

    # delete a junk column (wastes space)
    del tbl['Catalog']

    # for joining - want same capitalization
    tbl.rename_column("Tag", "tag")

    # The header is 'Entry in cm–1' (unicode dash) or 'Entry in cm-1' (plain
    # dash); after the 'replace' fallback above the dash may also have become
    # a replacement character.  Match on the common prefix so all variants
    # end up as 'Entry'.
    for colname in list(tbl.colnames):
        if colname.startswith('Entry in cm'):
            tbl.rename_column(colname, 'Entry')
            break

    return tbl

0 commit comments

Comments
 (0)