Source code for seppy.loader.bepi

import glob
import os
import pooch
import requests
import sunpy
import warnings
import numpy as np
import pandas as pd

from astropy.utils.data import get_pkg_data_filename
from seppy.util import resample_df

# omit Pandas' PerformanceWarning
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)


logger = pooch.get_logger()
logger.setLevel("WARNING")



[docs]
def bepi_sixsp_download(date, path=None):
    """Download BepiColombo/SIXS-P level 3 data file from SERPENTINE data server to local path

    Parameters
    ----------
    date : datetime object
        datetime of data to retrieve
    path : str
        local path where the files will be stored

    Returns
    -------
    downloaded_file : str
        full local path to downloaded file
    """

    # add a OS-specific '/' to end end of 'path'
    if path:
        if not path[-1] == os.sep:
            path = f'{path}{os.sep}'

    base = "https://data.serpentine-h2020.eu/l3data/bepi/l3/six_der"

    fname = f"six_der_sc_{date.year}{date.strftime('%m')}_l3_data.csv"

    url = f"{base}/{date.year}/{fname}"

    try:
        downloaded_file = pooch.retrieve(url=url, known_hash=None, fname=fname, path=path, progressbar=True)
    except ModuleNotFoundError:
        downloaded_file = pooch.retrieve(url=url, known_hash=None, fname=fname, path=path, progressbar=False)
    except requests.HTTPError:
        print(f'No corresponding BepiColombo/SIXS-P data found at {url}')
        downloaded_file = []

    return downloaded_file




[docs]
def bepi_sixsp_l3_loader(startdate, enddate=None, resample=None, path=None, pos_timestamp='center'):
    """Loads BepiColombo/SIXS-P level 3 data and returns it as Pandas dataframe together with a dictionary providing the energy ranges per channel

    Parameters
    ----------
    startdate : str or datetime-like
        start date
    enddate : str or datetime-like, optional
        end date
    resample : str, optional
        resample frequency in format understandable by Pandas, e.g. '1min', by
        default None. Note that this is just a simple wrapper around thepandas
        resample function that is calculating the mean of the data in the new
        time bins. This is not necessarily the correct way to resample data,
        depending on the data type (for example for errors)!
    path : str, optional
        local path where the files are/should be stored, by default None, in which case the sunpy download folder will be used.
    pos_timestamp : str, optional
        change the position of the timestamp: 'center' or 'start' of the accumulation interval, by default 'center'.

    Returns
    -------
    df : Pandas dataframe
        Pandas dataframe of measured fluxes and uncertaintites
    channels_dict_df : dict
        Dictionary giving details on the measurement channels
    """

    channels_dict = {}
    for side in range(0, 4):  # omit Side4 info because it's not part of the L3 data product (22 Aug 2025)
        for species in ['e', 'p', 'pe']:
            filepath = get_pkg_data_filename(f'data/bepi_sixsp_instrumental_constants/sixsp_side{side}_{species}_gf_en.csv', package='seppy')
            tdf = pd.read_csv(filepath, index_col=0).T
            if species == 'e':
                species_str = 'Electron'
                channels_dict[f'Side{side}_{species_str}_Bins_str'] = ((tdf['E']*1000).round(0).astype(int).astype('str')+' keV').to_dict()
            if species == 'p':
                species_str = 'Proton'
                channels_dict[f'Side{side}_{species_str}_Bins_str'] = (tdf['E'].round(2).astype('str')+' MeV').to_dict()
            if species == 'pe':
                species_str = 'Proton_As_Electron'
                for i in ['PE4', 'PE5', 'PE6']:
                    try:
                        tdf.drop(index=i, inplace=True)  # drop PE4, PE5, PE6 info because it's not part of the L3 data product (22 Aug 2025)
                    except KeyError:
                        pass
                channels_dict[f'Side{side}_{species_str}_Bins_str'] = (tdf['E'].round(2).astype('str')+' MeV').to_dict()
            channels_dict[f'Side{side}_{species_str}_Bins_Effective_Energy'] = tdf['E'].to_dict()
            channels_dict[f'Side{side}_{species_str}_Bins_Low_Energy'] = tdf['E_low'].to_dict()
            channels_dict[f'Side{side}_{species_str}_Bins_High_Energy'] = tdf['E_high'].to_dict()

    if not path:
        path = sunpy.config.get('downloads', 'download_dir') + os.sep
    #
    if not enddate:
        enddate = startdate
    startdate = sunpy.time.parse_time(startdate).to_datetime()
    enddate = sunpy.time.parse_time(enddate).to_datetime()
    if startdate.date() == enddate.date():
        enddate = enddate + pd.Timedelta('1D')
    # create list of files to load:
    dates = pd.date_range(start=startdate.replace(day=1), end=enddate, freq='MS')
    filelist = []
    for i, doy in enumerate(dates.month):
        try:
            f = glob.glob(f"{path}{os.sep}six_der_sc_{dates[i].year}{dates[i].strftime('%m')}_l3_data.csv")[0]  # sept_{dates[i].year}_{doy}_*.dat")[0]
        except IndexError:
            # print(f"File not found locally from {path}, downloading...")
            f = bepi_sixsp_download(dates[i], path)
        if len(f) > 0:
            filelist.append(f)
    if len(filelist) > 0:
        filelist = np.sort(filelist)

        # read files into Pandas dataframes:
        df = pd.read_csv(filelist[0])
        if len(filelist) > 1:
            for f in filelist[1:]:
                t_df = pd.read_csv(f)
                df = pd.concat([df, t_df])

        # generate datetime index:
        df.index = pd.to_datetime(df['TimeUTC'])
        df.index.name = 'TimeUTC'
        df.drop(['TimeUTC'], inplace=True, axis=1)

        # shrink dataframe to requested time interval
        df = df[(df.index >= pd.to_datetime(startdate, utc=True)) & (df.index <=  pd.to_datetime(enddate, utc=True))]

        # replace bad data with np.nan:
        # df = df.replace(-9999.900, np.nan)

        # TODO: (as it's not really nicely done so far)
        # careful!
        # adjusting the position of the timestamp manually.
        # requires knowledge of the original time resolution and timestamp position!
        if pos_timestamp == 'start':
            df.index = df.index-pd.Timedelta('60s')

        # optional resampling:
        if isinstance(resample, str):
            if len(df) > 0:
                df = resample_df(df, resample, pos_timestamp=pos_timestamp, cols_unc='auto', verbose=False)
    else:
        df = []

    return df, channels_dict