5
5
6
6
from bs4 import BeautifulSoup
7
7
import astropy .units as u
8
+ from astropy import table
8
9
from astropy .io import ascii
9
10
from astroquery .query import BaseQuery
10
11
from astroquery .utils import async_to_sync
@@ -26,8 +27,11 @@ def data_path(filename):
26
27
@async_to_sync
27
28
class CDMSClass (BaseQuery ):
28
29
# use the Configuration Items imported from __init__.py
29
- URL = conf .server
30
+ URL = conf .search
31
+ SERVER = conf .server
32
+ CLASSIC_URL = conf .classic_server
30
33
TIMEOUT = conf .timeout
34
+ MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS' , '028582 H2NC' , '058501 H2C2S' , '064527 HC3HCN' ]
31
35
32
36
def query_lines_async (self , min_frequency , max_frequency , * ,
33
37
min_strength = - 500 , molecule = 'All' ,
@@ -143,8 +147,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
143
147
else :
144
148
payload ['Molecules' ] = molecule
145
149
146
- payload = list (payload .items ())
147
-
148
150
if get_query_payload :
149
151
return payload
150
152
# BaseQuery classes come with a _request method that includes a
@@ -170,6 +172,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
170
172
response2 = self ._request (method = 'GET' , url = fullurl ,
171
173
timeout = self .TIMEOUT , cache = cache )
172
174
175
+ # accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
176
+ badlist = (self .MALFORMATTED_MOLECULE_LIST + # noqa
177
+ [y for x in self .MALFORMATTED_MOLECULE_LIST for y in x .split ()])
178
+ if payload ['Molecules' ] in badlist :
179
+ raise ValueError (f"Molecule { payload ['Molecules' ]} is known not to comply with standard CDMS format. "
180
+ f"Try get_molecule({ payload ['Molecules' ]} ) instead." )
181
+
173
182
return response2
174
183
175
184
def _parse_result (self , response , * , verbose = False ):
@@ -278,8 +287,9 @@ def _parse_result(self, response, *, verbose=False):
278
287
279
288
return result
280
289
281
- def get_species_table (self , * , catfile = 'catdir.cat' , use_cached = True ,
282
- catfile_url = conf .catfile_url ):
290
+ def get_species_table (self , * , catfile = 'partfunc.cat' , use_cached = True ,
291
+ catfile_url = conf .catfile_url ,
292
+ catfile2 = 'catdir.cat' , catfile_url2 = conf .catfile_url2 ):
283
293
"""
284
294
A directory of the catalog is found in a file called 'catdir.cat.'
285
295
@@ -302,9 +312,35 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
302
312
"""
303
313
304
314
if use_cached :
305
- result = ascii .read (data_path (catfile ), format = 'fixed_width' , delimiter = '|' )
315
+ try :
316
+ result = ascii .read (data_path (catfile ), format = 'fixed_width' , delimiter = '|' )
317
+ result2 = ascii .read (data_path (catfile2 ), format = 'fixed_width' , delimiter = '|' )
318
+ except UnicodeDecodeError :
319
+ with open (data_path (catfile ), 'rb' ) as fh :
320
+ content = fh .read ()
321
+ text = content .decode ('ascii' , errors = 'replace' )
322
+ result = ascii .read (text , format = 'basic' , delimiter = '|' )
323
+ with open (data_path (catfile2 ), 'rb' ) as fh :
324
+ content = fh .read ()
325
+ text = content .decode ('ascii' , errors = 'replace' )
326
+ result2 = ascii .read (text , format = 'basic' , delimiter = '|' )
306
327
else :
307
328
result = retrieve_catfile (catfile_url )
329
+ result2 = retrieve_catfile2 (catfile_url2 )
330
+ result .write (data_path (catfile ), format = 'ascii.fixed_width' , delimiter = '|' , overwrite = True )
331
+ result2 .write (data_path (catfile2 ), format = 'ascii.fixed_width' , delimiter = '|' , overwrite = True )
332
+
333
+ merged = table .join (result , result2 , keys = ['tag' ])
334
+ if not all (merged ['#lines' ] == merged ['# lines' ]):
335
+ raise ValueError ("Inconsistent table of molecules from CDMS." )
336
+ del merged ['# lines' ]
337
+
338
+ # reorder columns
339
+ result = merged [['tag' , 'molecule' , 'Name' , '#lines' , 'lg(Q(1000))' ,
340
+ 'lg(Q(500))' , 'lg(Q(300))' , 'lg(Q(225))' , 'lg(Q(150))' , 'lg(Q(75))' ,
341
+ 'lg(Q(37.5))' , 'lg(Q(18.75))' , 'lg(Q(9.375))' , 'lg(Q(5.000))' ,
342
+ 'lg(Q(2.725))' ,
343
+ 'Ver.' , 'Documentation' , 'Date of entry' , 'Entry' ]]
308
344
309
345
meta = {'lg(Q(1000))' : 1000.0 ,
310
346
'lg(Q(500))' : 500.0 ,
@@ -331,6 +367,96 @@ def tryfloat(x):
331
367
result .meta = {'Temperature (K)' : [1000. , 500. , 300. , 225. , 150. , 75. ,
332
368
37.5 , 18.75 , 9.375 , 5. , 2.725 ]}
333
369
370
+ result .add_index ('tag' )
371
+
372
+ return result
373
+
374
def get_molecule(self, molecule_id, *, cache=True):
    """
    Retrieve the whole molecule table for a given molecule id.

    Parameters
    ----------
    molecule_id : str
        The molecule tag as a string of exactly six digits, e.g. ``'028503'``.
        It is inserted verbatim into the catalog URL and converted with
        ``int`` to look up the matching row of the species table.
    cache : bool
        Forwarded to ``self._request``.

    Returns
    -------
    result
        The table produced by ``self._parse_cat`` for the downloaded ``.cat``
        file, with ``meta`` replaced by the species-table row for this tag
        (as a dict).

    Raises
    ------
    ValueError
        If ``molecule_id`` is not a string of exactly six ASCII digits.
    """
    # Validate fully before any network access: a non-numeric id would
    # otherwise trigger a request and only fail later at ``int(molecule_id)``.
    is_valid = (isinstance(molecule_id, str)
                and len(molecule_id) == 6
                and molecule_id.isascii()
                and molecule_id.isdigit())
    if not is_valid:
        raise ValueError("molecule_id should be a length-6 string of numbers")

    url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
    response = self._request(method='GET', url=url,
                             timeout=self.TIMEOUT, cache=cache)
    result = self._parse_cat(response)

    # The species table is indexed by integer tag (see ``add_index('tag')``
    # in get_species_table), hence the int conversion.
    species_table = self.get_species_table()
    result.meta = dict(species_table.loc[int(molecule_id)])

    return result
389
+
390
def _parse_cat(self, response, *, verbose=False):
    """
    Parse a catalog response into an `~astropy.table.Table`

    See details in _parse_result; this is a very similar function,
    but the catalog responses have a slightly different format.

    Parameters
    ----------
    response
        Object whose ``text`` attribute holds the fixed-width catalog file.
    verbose : bool
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    result
        Table with the columns FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT and
        Q1..Q14 read from the text, plus two derived columns: ``MOLWT``
        (absolute value of ``int(TAG / 1e4)``) and ``Lab`` (True where that
        truncated value was negative).  Units are attached to FREQ, ERR,
        MOLWT, LGINT and ELO.

    Raises
    ------
    EmptyResponseError
        If the response text contains ``'Zero lines were found'``.
    """
    text = response.text

    if 'Zero lines were found' in text:
        raise EmptyResponseError(f"Response was empty; message was '{text}'.")

    # notes about the format
    # [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN  noqa
    #      13  21 29 31     41 44  51  55 57 59 61 63 65 67 69 71 73 75 77 79                      noqa
    #
    # NOTE(review): every start below except FREQ is one greater than the
    # cumulative width listed above, and Q13/Q14 lie past the twelve I2
    # fields of the stated format.  Left unchanged here; confirm against a
    # real .cat file before adjusting.
    starts = {'FREQ': 0,
              'ERR': 14,
              'LGINT': 22,
              'DR': 30,
              'ELO': 32,
              'GUP': 42,
              'TAG': 45,
              'QNFMT': 52,
              'Q1': 56,
              'Q2': 58,
              'Q3': 60,
              'Q4': 62,
              'Q5': 64,
              'Q6': 66,
              'Q7': 68,
              'Q8': 70,
              'Q9': 72,
              'Q10': 74,
              'Q11': 76,
              'Q12': 78,
              'Q13': 80,
              'Q14': 82,
              }

    result = ascii.read(text, header_start=None, data_start=0,
                        comment=r'THIS|^\s{12,14}\d{4,6}.*',
                        names=list(starts.keys()),
                        col_starts=list(starts.values()),
                        format='fixed_width', fast_reader=False)

    # int truncates - which is what we want
    result['MOLWT'] = [int(x / 1e4) for x in result['TAG']]

    result['FREQ'].unit = u.MHz
    result['ERR'].unit = u.MHz

    # The sign is recorded in 'Lab' before MOLWT is made non-negative.
    result['Lab'] = result['MOLWT'] < 0
    result['MOLWT'] = np.abs(result['MOLWT'])
    result['MOLWT'].unit = u.Da

    # NOTE(review): this list used to be extended inside ``for suf in ''``,
    # a loop over an empty string that never executed, so only GUP has ever
    # been converted here.  The loop body suggests Q1..Q14 were meant to be
    # included too; verify that parse_letternumber handles those columns
    # (including blank entries) before adding them.
    fix_keys = ['GUP']
    for key in fix_keys:
        if not np.issubdtype(result[key].dtype, np.integer):
            intcol = np.array(list(map(parse_letternumber, result[key])),
                              dtype=int)
            result[key] = intcol

    result['LGINT'].unit = u.nm**2 * u.MHz
    result['ELO'].unit = u.cm**(-1)

    return result
335
461
336
462
@@ -375,10 +501,13 @@ def find(self, st, flags):
375
501
376
502
Returns
377
503
-------
378
- The list of values corresponding to the matches
504
+ The dictionary containing only values whose keys match the regex
379
505
380
506
"""
381
507
508
+ if st in self :
509
+ return {st : self [st ]}
510
+
382
511
out = {}
383
512
384
513
for kk , vv in self .items ():
@@ -394,24 +523,89 @@ def find(self, st, flags):
394
523
def build_lookup():
    """
    Build a `Lookuptable` mapping species names to their tag.

    Both the 'molecule' and the 'Name' columns of the species table are used
    as keys; where the same key occurs in both, the 'Name' entry wins because
    it is applied last.
    """
    species = CDMS.get_species_table()
    tags = list(species['tag'][:])

    # start with the 'molecule' column ...
    name_to_tag = dict(zip(list(species['molecule'][:]), tags))
    # ... then layer the 'Name' column on top of it
    name_to_tag.update(zip(list(species['Name'][:]), tags))

    return Lookuptable(name_to_tag)
403
541
404
542
405
def retrieve_catfile(url=f'{conf.classic_server}/entries/partition_function.html'):
    """
    Download the partition-function listing and parse it into a table.

    The page is split into lines; the first 15 and the last 5 lines are
    discarded and every remaining non-blank line becomes one row.  Rows are
    parsed by hand because some of them violate the fixed-width layout.

    Parameters
    ----------
    url : str
        Address of the partition-function page.

    Returns
    -------
    tbl
        ``table.Table`` with the columns 'tag', 'molecule', '#lines' and one
        'lg(Q(T))' column per value found on the row.  Entries that cannot
        be parsed as a float (e.g. '---') are stored as NaN.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    partfunc_names = ['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
                      'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
                      'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))']

    # used to convert '---' to nan
    def tryfloat(x):
        try:
            return float(x)
        except ValueError:
            return np.nan

    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=conf.timeout)
    response.raise_for_status()
    lines = response.text.split("\n")

    # the 'fixed width' table reader fails because there are rows that violate fixed width
    tbl_rows = []
    for row in lines[15:-5]:
        if not row.strip():
            # a blank line has no tag to parse; skip it instead of crashing
            continue
        tag = int(row.split()[0])
        # columns 7-40 hold the molecule name followed by the line count;
        # the name itself may contain spaces, so the count is the last token
        molecule_and_lines = row[7:41].split()
        molecule = " ".join(molecule_and_lines[:-1])
        nlines = int(molecule_and_lines[-1])
        entry = {'tag': tag,
                 'molecule': molecule,
                 '#lines': nlines,
                 }
        entry.update(zip(partfunc_names, map(tryfloat, row[41:].split())))
        tbl_rows.append(entry)

    return table.Table(tbl_rows)
583
+
584
+
585
def retrieve_catfile2(url=f'{conf.classic_server}/predictions/catalog/catdir.html'):
    """
    Download the catalog directory page and parse its HTML table.

    Parameters
    ----------
    url : str
        Address of the catalog directory page.

    Returns
    -------
    tbl
        The table read from the page with the 'Catalog' column removed,
        'Tag' renamed to 'tag' and the 'Entry in cm...' column renamed to
        'Entry'.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=conf.timeout)
    response.raise_for_status()
    try:
        tbl = ascii.read(response.text, format='html')
    except UnicodeDecodeError:
        # based on https://github.com/astropy/astropy/issues/3826#issuecomment-256113937
        # which suggests to start with the bytecode content and decode with 'replace errors'
        text = response.content.decode('ascii', errors='replace')
        tbl = ascii.read(text, format='html')

    # delete a junk column (wastes space)
    del tbl['Catalog']

    # for joining - want same capitalization
    tbl.rename_column("Tag", "tag")

    # The header is 'Entry in cm–1' (unicode dash) or 'Entry in cm-1' (plain
    # dash); after the ASCII fallback above the dash is replaced by
    # substitution characters.  Match on the stable prefix so every variant
    # is renamed, and stop after the first hit so a second match cannot
    # collide with the new 'Entry' column.
    for colname in list(tbl.colnames):
        if colname.startswith('Entry in cm'):
            tbl.rename_column(colname, 'Entry')
            break

    return tbl
0 commit comments