Source code for seppy.loader.soho

# Licensed under a 3-clause BSD style license - see LICENSE.rst

import datetime as dt
import glob
import os
# import warnings

import cdflib
import numpy as np
import pandas as pd
import pooch
import requests
import sunpy
from sunpy.net import Fido
from sunpy.net import attrs as a
from sunpy.timeseries import TimeSeries

from seppy.util import custom_notification, custom_warning, resample_df


logger = pooch.get_logger()
logger.setLevel("WARNING")


def _get_metadata(dataset, path_to_cdf):
    """
    Get meta data from single cdf file
    So far only manually for 'SOHO_ERNE-HED_L2-1MIN' and 'SOHO_ERNE-LED_L2-1MIN'
    """
    metadata = []
    cdf = cdflib.CDF(path_to_cdf)
    if dataset in ['SOHO_COSTEP-EPHIN_L3I-1MIN', 'SOHO_ERNE-HED_L2-1MIN', 'SOHO_ERNE-LED_L2-1MIN']:
        if dataset=='SOHO_COSTEP-EPHIN_L3I-1MIN':
            alpha = 'He'
            m = '_int'
        if dataset=='SOHO_ERNE-HED_L2-1MIN':
            alpha = 'A'
            m = 'H'
        if dataset=='SOHO_ERNE-LED_L2-1MIN':
            alpha = 'A'
            m = 'L'
        metadata = {'He_E_label': cdf.varget('He_E_label').flatten(),
                    'He_energy': cdf.varget('He_energy'),
                    'He_energy_delta': cdf.varget('He_energy_delta'),
                    f'{alpha}{m}_LABL': cdf.varattsget(f'{alpha}{m}')['LABLAXIS'],
                    f'{alpha}{m}_UNITS': cdf.varattsget(f'{alpha}{m}')['UNITS'],
                    f'{alpha}{m}_FILLVAL': cdf.varattsget(f'{alpha}{m}')['FILLVAL'],
                    'P_E_label': cdf.varget('P_E_label').flatten(),
                    'P_energy': cdf.varget('P_energy'),
                    'P_energy_delta': cdf.varget('P_energy_delta'),
                    f'P{m}_LABL': cdf.varattsget(f'P{m}')['LABLAXIS'],
                    f'P{m}_UNITS': cdf.varattsget(f'P{m}')['UNITS'],
                    f'P{m}_FILLVAL': cdf.varattsget(f'P{m}')['FILLVAL'],
                    }

        channels_dict_df_He = pd.DataFrame(cdf.varget('He_E_label').flatten(), columns=['ch_strings'])
        channels_dict_df_He['lower_E'] = cdf.varget("He_energy")-cdf.varget("He_energy_delta")
        channels_dict_df_He['upper_E'] = cdf.varget("He_energy")+cdf.varget("He_energy_delta")
        channels_dict_df_He['He_energy_delta'] = cdf.varget("He_energy_delta")
        channels_dict_df_He['DE'] = cdf.varget("He_energy_delta")*2  # obtain FULL width of energy channel!
        # channels_dict_df_He['mean_E'] = np.sqrt(channels_dict_df_He['upper_E'] * channels_dict_df_He['lower_E'])
        channels_dict_df_He['mean_E'] = cdf.varget("He_energy")

        channels_dict_df_p = pd.DataFrame(cdf.varget('P_E_label').flatten(), columns=['ch_strings'])
        channels_dict_df_p['lower_E'] = cdf.varget("P_energy")-cdf.varget("P_energy_delta")
        channels_dict_df_p['upper_E'] = cdf.varget("P_energy")+cdf.varget("P_energy_delta")
        channels_dict_df_p['P_energy_delta'] = cdf.varget("P_energy_delta")
        channels_dict_df_p['DE'] = cdf.varget("P_energy_delta")*2  # obtain FULL width of energy channel!
        # channels_dict_df_p['mean_E'] = np.sqrt(channels_dict_df_p['upper_E'] * channels_dict_df_p['lower_E'])
        channels_dict_df_p['mean_E'] = cdf.varget("P_energy")

        metadata.update({'channels_dict_df_He': channels_dict_df_He})
        metadata.update({'channels_dict_df_p': channels_dict_df_p})
    return metadata


[docs] def soho_load(dataset, startdate, enddate, path=None, resample=None, pos_timestamp='center', max_conn=5): """ Download CDF files via SunPy/Fido from CDAWeb for CELIAS, EPHIN, ERNE onboard SOHO Parameters ---------- dataset : str Name of SOHO dataset: \n - 'SOHO_COSTEP-EPHIN_L2-1MIN': SOHO COSTEP-EPHIN Level2 1 minute data \n https://www.ieap.uni-kiel.de/et/ag-heber/costep/data.php \n - 'SOHO_COSTEP-EPHIN_L3I-1MIN': SOHO COSTEP-EPHIN Level3 intensity 1 minute data \n https://cdaweb.gsfc.nasa.gov/misc/NotesS.html#SOHO_COSTEP-EPHIN_L3I-1MIN \n - 'SOHO_ERNE-LED_L2-1MIN': SOHO ERNE-LED Level2 1 minute data - VERY OFTEN NO DATA! \n https://cdaweb.gsfc.nasa.gov/misc/NotesS.html#SOHO_ERNE-LED_L2-1MIN \n - 'SOHO_ERNE-HED_L2-1MIN': SOHO ERNE-HED Level2 1 minute data \n https://cdaweb.gsfc.nasa.gov/misc/NotesS.html#SOHO_ERNE-HED_L2-1MIN startdate, enddate : datetime or str Datetime object (e.g., dt.date(2021,12,31) or dt.datetime(2021,4,15)) or "standard" datetime string (e.g., "2021/04/15") (enddate must always be later than startdate) path : str, optional Local path for storing downloaded data, by default None resample : str, optional Resample frequency in format understandable by Pandas, e.g. '1min', by default None. Note that this is just a simple wrapper around thepandas resample function that is calculating the mean of the data in the new time bins. This is not necessarily the correct way to resample data, depending on the data type (for example for errors)! pos_timestamp : str, optional change the position of the timestamp: 'center' or 'start' of the accumulation interval, or 'original' to do nothing, by default 'center'. max_conn : int, optional The number of parallel download slots used by Fido.fetch, by default 5 Returns ------- df : Pandas dataframe See links above for the different datasets for a description of the dataframe columns metadata : dict Dictionary containing different metadata, e.g., energy channels """ # Catch old default value for pos_timestamp if pos_timestamp is None: pos_timestamp = 'center' if not (pos_timestamp=='center' or pos_timestamp=='start' or pos_timestamp=='original'): raise ValueError('"pos_timestamp" must be either "original", "center", or "start"!') if dataset == 'SOHO_COSTEP-EPHIN_L2-1MIN': df, metadata = soho_ephin_loader(startdate, enddate, resample=resample, path=path, all_columns=False, pos_timestamp=pos_timestamp) else: trange = a.Time(startdate, enddate) cda_dataset = a.cdaweb.Dataset(dataset) try: result = Fido.search(trange, cda_dataset) filelist = [i[0].split('/')[-1] for i in result.show('URL')[0]] filelist.sort() if path is None: filelist = [sunpy.config.get('downloads', 'download_dir') + os.sep + file for file in filelist] elif type(path) is str: filelist = [path + os.sep + f for f in filelist] downloaded_files = filelist for i, f in enumerate(filelist): if os.path.exists(f) and os.path.getsize(f) == 0: os.remove(f) if not os.path.exists(f): _downloaded_file = Fido.fetch(result[0][i], path=path, max_conn=max_conn) # downloaded_files = Fido.fetch(result, path=path, max_conn=max_conn) # use Fido.fetch(result, path='/ThisIs/MyPath/to/Data/{file}') to use a specific local folder for saving data files # downloaded_files.sort() data = TimeSeries(downloaded_files, concatenate=True) df = data.to_dataframe() metadata = _get_metadata(dataset, downloaded_files[0]) # if dataset.upper() == 'SOHO_ERNE-HED_L2-1MIN' or dataset.upper() == 'SOHO_ERNE-LED_L3I-1MIN': # custom_warning(f'The format of "channels_dict_df_p" in the metadata for {dataset} has been changed providing the full width of energy channels for DE (instead of the half)!') # elif dataset.upper() == 'SOHO_ERNE-LED_L2-1MIN': # custom_warning(f'The format of the metadata for {dataset} has been changed. The previous metadata is now provided in meta["energy_labels"]!') # remove this (i.e. following lines) when sunpy's read_cdf is updated, # and FILLVAL will be replaced directly, see # https://github.com/sunpy/sunpy/issues/5908 # df = df.replace(-1e+31, np.nan) # for all fluxes # df = df.replace(-2147483648, np.nan) # for ERNE count rates # 4 Apr 2023: previous 2 lines removed because they are taken care of with sunpy # 4.1.0: # https://docs.sunpy.org/en/stable/whatsnew/changelog.html#id7 # https://github.com/sunpy/sunpy/pull/5956 # careful! # adjusting the position of the timestamp manually. # requires knowledge of the original time resolution and timestamp position! if pos_timestamp == 'center': if (dataset.upper() == 'SOHO_ERNE-HED_L2-1MIN' or dataset.upper() == 'SOHO_ERNE-LED_L2-1MIN' or dataset.upper() == 'SOHO_COSTEP-EPHIN_L3I-1MIN'): df.index = df.index+pd.Timedelta('30s') if dataset.upper() == 'SOHO_CELIAS-PM_30S': df.index = df.index+pd.Timedelta('15s') if pos_timestamp == 'start': if dataset.upper() == 'SOHO_CELIAS-SEM_15S': df.index = df.index-pd.Timedelta('7.5s') if isinstance(resample, str): if dataset.upper() in ['SOHO_ERNE-HED_L2-1MIN', 'SOHO_ERNE-LED_L2-1MIN']: cols_unc = [] keywords_unc = [] elif dataset.upper() in ['SOHO_COSTEP-EPHIN_L3I-1MIN']: cols_unc = 'auto' keywords_unc = ['_sys_', '_stat_'] df = resample_df(df, resample, pos_timestamp=pos_timestamp, cols_unc=cols_unc, verbose=False, keywords_unc=keywords_unc) except (RuntimeError, IndexError): print(f'Unable to obtain "{dataset}" data!') downloaded_files = [] df = [] metadata = [] return df, metadata
[docs] def calc_av_en_flux_ERNE(df, channels_dict_df, avg_channels, species='p', sensor='HED'): """ avg_channels : list of int, optional averaging channels m to n if [m, n] is provided (both integers), by default None """ # calculation of total delta-E for averaging multiple channels: if len(avg_channels) > 1: DE_total = channels_dict_df.loc[avg_channels[0]:avg_channels[-1]]['DE'].sum() else: DE_total = channels_dict_df.loc[avg_channels[0]]['DE'] # averaging of intensities: t_flux = 0 for bins in range(avg_channels[0], avg_channels[-1]+1): if species.lower() in ['he', 'a', 'alpha']: t_flux = t_flux + df[f'A{sensor.upper()[0]}_{bins}'] * channels_dict_df.loc[bins]['DE'] elif species.lower() in ['p', 'i', 'h']: t_flux = t_flux + df[f'P{sensor.upper()[0]}_{bins}'] * channels_dict_df.loc[bins]['DE'] avg_flux = t_flux/DE_total # string lower energy energy_low = channels_dict_df.lower_E[avg_channels[0]] # string upper energy without .0 decimal but with ' keV' ending energy_up = channels_dict_df.upper_E[avg_channels[-1]] new_ch_string = f'{energy_low} - {energy_up} MeV' return avg_flux, new_ch_string
[docs] def soho_ephin_download(date, path=None): """ Download SOHO/EPHIN level 2 ascii data file from Kiel university to local path Parameters ---------- date : datetime object datetime of data to retrieve path : str local path where the files will be stored Returns ------- downloaded_file : str full local path to downloaded file """ # add a OS-specific '/' to end end of 'path' if path: if not path[-1] == os.sep: path = f'{path}{os.sep}' doy = int(date.strftime('%j')) year = date.year if year<2000: pre="eph" yy=year-1900 else: pre="epi" yy=year-2000 name="%s%02d%03d" %(pre, yy, doy) base = "http://ulysses.physik.uni-kiel.de/costep/level2/rl2/" file = name+".rl2" url = base+str(date.year)+'/'+file try: downloaded_file = pooch.retrieve(url=url, known_hash=None, fname=file, path=path, progressbar=True) except ModuleNotFoundError: downloaded_file = pooch.retrieve(url=url, known_hash=None, fname=file, path=path, progressbar=False) except requests.HTTPError: print(f'No corresponding EPHIN data found at {url}') downloaded_file = [] print('') return downloaded_file
[docs] def soho_ephin_loader(startdate, enddate, resample=None, path=None, all_columns=False, pos_timestamp='center', use_uncorrected_data_on_own_risk=False): """ Load SOHO/EPHIN level 2 ascii data and return it as Pandas dataframe together with a dictionary providing the energy ranges per channel Parameters ---------- startdate : str start date enddate : str end date resample : str, optional resample frequency in format understandable by Pandas, e.g. '1min', by default None. Note that this is just a simple wrapper around thepandas resample function that is calculating the mean of the data in the new time bins. This is not necessarily the correct way to resample data, depending on the data type (for example for errors)! path : str, optional local path where the files are/should be stored, by default None all_columns : boolean, optional if True provide all availalbe columns in returned dataframe, by default False pos_timestamp : {str}, optional change the position of the timestamp: 'center' or 'start' of the accumulation interval, or 'original' to do nothing, by default 'center'. Returns ------- df : Pandas dataframe dataframe with either 15 channels of electron or 30 channels of proton/ion fluxes and their respective uncertainties channels_dict_df : dict Pandas dataframe giving details on the measurement channels """ if not path: path = sunpy.config.get('downloads', 'download_dir') + os.sep # create list of files to load: dates = pd.date_range(start=startdate, end=enddate, freq='D') filelist = [] for i, doy in enumerate(dates.day_of_year): if dates[i].year<2000: pre = "eph" yy = dates[i].year-1900 else: pre = "epi" yy = dates[i].year-2000 name = "%s%02d%03d" %(pre, yy, doy) try: file = glob.glob(f"{path}{os.sep}{name}.rl2")[0] except IndexError: print(f"File {name}.rl2 not found locally at {path}.") file = soho_ephin_download(dates[i], path) if len(file) > 0: filelist.append(file) if len(filelist) > 0: filelist = np.sort(filelist) col_names = ['Year', 'DOY', 'MS', 'S/C Epoch', 'Status Word part 1', 'Status Word part 2', 'E150', 'E300', 'E1300', 'E3000', 'P4', 'P8', 'P25', 'P41', 'H4', 'H8', 'H25', 'H41', 'INT', 'P4 GM', 'P4 GR', 'P4 S', 'P8 GM', 'P8 GR', 'P8 S', 'P25 GM', 'P25 GR', 'P25 S', 'P41 GM', 'P41 GR', 'P41 S', 'H4 GM', 'H4 GR', 'H4 S1', 'H4 S23', 'H8 GM', 'H8 GR', 'H8 S1', 'H8 S23', 'H25 GM', 'H25 GR', 'H25 S1', 'H25 S23', 'H41 GM', 'H41 GR', 'H41 S1', 'H41 S23', 'Status Flag', 'Spare 1', 'Spare 2', 'Spare 3'] # read files into Pandas dataframes: df = pd.read_csv(filelist[0], header=None, sep=r'\s+', names=col_names) if len(filelist) > 1: for file in filelist[1:]: t_df = pd.read_csv(file, header=None, sep=r'\s+', names=col_names) df = pd.concat([df, t_df]) # # generate datetime index from year, day of year, and milisec of day: df.index = doy2dt(df.Year.values, df.DOY.values + df.MS.values/1000./86400.) df.index.name = 'time' # drop some unused columns: if not all_columns: df = df.drop(columns=['Year', 'DOY', 'MS', 'S/C Epoch', 'Status Word part 1', 'Status Word part 2', 'P4 GM', 'P4 GR', 'P4 S', 'P8 GM', 'P8 GR', 'P8 S', 'P25 GM', 'P25 GR', 'P25 S', 'P41 GM', 'P41 GR', 'P41 S', 'H4 GM', 'H4 GR', 'H4 S1', 'H4 S23', 'H8 GM', 'H8 GR', 'H8 S1', 'H8 S23', 'H25 GM', 'H25 GR', 'H25 S1', 'H25 S23', 'H41 GM', 'H41 GR', 'H41 S1', 'H41 S23', 'Spare 1', 'Spare 2', 'Spare 3']) # TODO: Proton and helium measurements need to be corrected for effects determined post-launch, # cf. chapter 2.3 of https://www.ieap.uni-kiel.de/et/ag-heber/costep/materials/L2_spec_ephin.pdf # Until this correction has been implemented here, these data products are set to -9e9. # Setting use_uncorrected_data_on_own_risk=True skips this replacement, so that the uncorrected # data can be obtained at own risk! if use_uncorrected_data_on_own_risk: # # warnings.warn("Proton and helium data is still uncorrected! Know what you're doing and use at own risk!") custom_warning("SOHO/EPHIN proton and helium data is still uncorrected! Know what you're doing and use at own risk!") else: custom_notification("SOHO/EPHIN proton and helium data are not supported at the moment and set to negative values of -9e9!") df.P4 = -9e9 df.P8 = -9e9 df.P25 = -9e9 df.P41 = -9e9 df.H4 = -9e9 df.H8 = -9e9 df.H25 = -9e9 df.H41 = -9e9 # replace bad data with np.nan: # there shouldn't be bad data in rl2 files! # df = df.replace(-9999.900, np.nan) # derive instrument status and dependencies status = df['Status Flag'].values fmodes = np.zeros(len(status)) for q in range(len(status)): binaries = '{0:08b}'.format(int(status[q])) if int(binaries[-1]) == 1: if int(binaries[-3]) == 1: fmodes[q] = 1 else: fmodes[q] = 2 ringoff = np.zeros(len(status)) for q in range(len(status)): binaries = '{0:08b}'.format(int(status[q])) if int(binaries[-2]): ringoff[q] = 1 cs_e300 = '0.67 - 3.0 MeV' cs_e1300 = '2.64 - 6.18 MeV' cs_p25 = '25 - 41 MeV' cs_he25 = '25 - 41 MeV/N' if max(fmodes)==1: cs_e1300 = "2.64 - 10.4 MeV" cs_p25 = '25 - 53 MeV' cs_he25 = '25 - 53 MeV/n' if max(fmodes)==2: # # warnings.warn('Careful: EPHIN ring off!') custom_warning('SOHO/EPHIN ring is off. This means high risk of contaminated measurements!') # failure mode D since 4 Oct 2017: # dates[-1].date() is enddate, used to catch cases when enddate is a string if dates[-1].date() >= dt.date(2017, 10, 4): cs_e300 = 'deactivated bc. of failure mode D' cs_e1300 = "0.67 - 10.4 MeV" # dates[0].date() is startdate, used to catch cases when startdate is a string if dates[0].date() <= dt.date(2017, 10, 4): # # warnings.warn('EPHIN instrument status (i.e., electron energy channels) changed during selected period (on Oct 4, 2017)!') custom_warning('SOHO/EPHIN instrument status (i.e., electron energy channels) changed during selected period (on Oct 4, 2017)!') # careful! # adjusting the position of the timestamp manually. # requires knowledge of the original time resolution and timestamp position! if pos_timestamp == 'center': df.index = df.index+pd.Timedelta('30s') # optional resampling: if isinstance(resample, str): df = resample_df(df, resample, pos_timestamp=pos_timestamp, cols_unc=[], verbose=False) else: df = [] energies = {'E150': '0.25 - 0.7 MeV', 'E300': cs_e300, 'E1300': cs_e1300, 'E3000': '4.80 - 10.4 MeV', 'P4': '4.3 - 7.8 MeV', 'P8': '7.8 - 25 MeV', 'P25': cs_p25, 'P41': '41 - 53 MeV', 'H4': '4.3 - 7.8 MeV/n', 'H8': '7.8 - 25.0 MeV/n', 'H25': cs_he25, 'H41': '40.9 - 53.0 MeV/n', 'INT': '>25 MeV integral'} meta = {'energy_labels': energies} return df, meta
[docs] def doy2dt(year, doy): """ convert decimal day of year to datetime """ if isinstance(year, (int, np.uint)) or isinstance(year, (float)): year = [year] if isinstance(doy, (int, np.uint)) or isinstance(doy, (float)): doy = [doy] if len(doy) > len(year): year = np.zeros(len(doy))+year datearray = [] for i in range(len(doy)): if np.isnan(doy[i]): datearray.append(np.nan) else: datearray.append((dt.datetime(int(year[i]), 1, 1, 0) + dt.timedelta(float(doy[i])-1))) return np.array(datearray)
# Followig function replaced by soho_ephin_loader() # def ephin_rl2_downloader(startdate, enddate, path=None): # """ # download EPHIN level2 rl2 files with sunpy.fido from VSO (http://virtualsolar.org) # """ # # add a OS-specific '/' to end end of 'path' # if path: # if not path[-1] == os.sep: # path = f'{path}{os.sep}' # trange = a.Time(startdate, enddate) # result = Fido.search(trange, a.Instrument.costep) # if result.file_num == 0: # # warnings.warn('WARNING: No corresponding data files found at VSO!') # downloaded_files = [] # else: # # get list of file types of results: # ftypes = np.array([result.show('fileid')[0][i][0].split('.')[-1] for i in range(result.file_num)]) # # download only .rl2 files: # downloaded_files = Fido.fetch(result[0][np.where(ftypes=='rl2')], path=path) # return downloaded_files