#!/usr/bin/env python
# coding: utf-8
"""
This script trains energy, DISP regressors and event classifiers with
DL1-stereo events. The input events are separated by the telescope
combination types at first, and then telescope-wise RFs are trained for
every combination type. When training event classifiers, gamma or proton
MC events are randomly extracted so that the RFs are trained with the
same number of events by both types of primary particles.
Please specify the RF type that will be trained by using
`--train-energy`, `--train-disp` and `--train-classifier` arguments.
If the `--use-unsigned` argument is given, the RFs will be trained with
unsigned features.
Before running the script, it would be better to merge input MC files
per telescope pointing direction with the following script:
`magic-cta-pipe/magicctapipe/scripts/lst1_magic/merge_hdf_files.py`
Usage:
$ python lst1_magic_train_rfs.py
--input-dir-gamma dl1_stereo/gamma
(--input-dir-proton dl1_stereo/proton)
(--output-dir rfs)
(--config-file config.yaml)
(--train-energy)
(--train-disp)
(--train-classifier)
(--use-unsigned)
"""
import argparse
import logging
import random
import time
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
from magicctapipe.io import format_object, load_train_data_files
from magicctapipe.io.io import GROUP_INDEX_TRAIN
from magicctapipe.reco import DispRegressor, EnergyRegressor, EventClassifier
__all__ = [
"get_events_at_random",
"train_energy_regressor",
"train_disp_regressor",
"train_event_classifier",
]
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)
TEL_NAMES = {
1: "LST-1",
2: "MAGIC-I",
3: "MAGIC-II",
} # TODO: REMOVE WHEN SWITCHING TO THE NEW RFs IMPLEMENTTATION (1 RF PER TELESCOPE)
# True event class of gamma and proton MCs
EVENT_CLASS_GAMMA = 0
EVENT_CLASS_PROTON = 1
# Set the random seed
random.seed(1000)
def get_events_at_random(event_data, n_events_random):
"""
Extracts a given number of shower events randomly.
Parameters
----------
event_data : pandas.core.frame.DataFrame
Data frame of shower events
n_events_random : int
Number of events to be extracted randomly
Returns
-------
pandas.core.frame.DataFrame
Data frame of the shower events extracted randomly
"""
# Get the unique multi indices
multi_indices_unique = np.unique(event_data.index).tolist()
# Extract a given number of indices randomly
multi_indices_random = pd.MultiIndex.from_tuples(
tuples=random.sample(multi_indices_unique, n_events_random),
names=event_data.index.names,
)
# Extract the events of the random indices
event_data_selected = event_data.loc[multi_indices_random]
event_data_selected.sort_index(inplace=True)
return event_data_selected
[docs]
def train_energy_regressor(input_dir, output_dir, config, use_unsigned_features=False):
"""
Trains energy regressors with gamma MC DL1-stereo events.
Parameters
----------
input_dir : str
Path to a directory where input gamma MC data files are stored
output_dir : str
Path to a directory where to save trained RFs
config : dict
Configuration for the LST-1 + MAGIC analysis
use_unsigned_features : bool
If `True`, it uses unsigned features for training RFs
"""
config_rf = config["energy_regressor"]
gamma_offaxis = config_rf["gamma_offaxis"]
logger.info("\nGamma off-axis angles allowed:")
logger.info(format_object(gamma_offaxis))
# Load the input files
logger.info(f"\nInput directory: {input_dir}")
event_data_train = load_train_data_files(
input_dir, gamma_offaxis["min"], gamma_offaxis["max"]
)
# Configure the energy regressor
logger.info("\nRF regressors:")
logger.info(format_object(config_rf["settings"]))
logger.info("\nFeatures:")
logger.info(format_object(config_rf["features"]))
logger.info(f"\nUse unsigned features: {use_unsigned_features}")
energy_regressor = EnergyRegressor(
config_rf["settings"], config_rf["features"], use_unsigned_features
)
# Create the output directory
Path(output_dir).mkdir(exist_ok=True, parents=True)
# Loop over every telescope combination type
for tel_combo, df_train in event_data_train.items():
logger.info(f"\nEnergy regressors for the '{tel_combo}' type:")
# Train the RFs
energy_regressor.fit(df_train)
# Check the feature importance
for tel_id, telescope_rf in energy_regressor.telescope_rfs.items():
importances = telescope_rf.feature_importances_.round(5)
importances = dict(zip(energy_regressor.features, importances))
logger.info(f"\n{TEL_NAMES[tel_id]} feature importance:")
logger.info(format_object(importances))
# Save the trained RFs
if use_unsigned_features:
output_file = f"{output_dir}/energy_regressors_{tel_combo}_unsigned.joblib"
else:
output_file = f"{output_dir}/energy_regressors_{tel_combo}.joblib"
energy_regressor.save(output_file)
logger.info(f"\nOutput file: {output_file}")
[docs]
def train_disp_regressor(input_dir, output_dir, config, use_unsigned_features=False):
"""
Trains DISP regressors with gamma MC DL1-stereo events.
Parameters
----------
input_dir : str
Path to a directory where input gamma MC data files are stored
output_dir : str
Path to a directory where to save trained RFs
config : dict
Configuration for the LST-1 + MAGIC analysis
use_unsigned_features : bool
If `True`, it uses unsigned features for training RFs
"""
config_rf = config["disp_regressor"]
gamma_offaxis = config_rf["gamma_offaxis"]
logger.info("\nGamma off-axis angles allowed:")
logger.info(format_object(gamma_offaxis))
# Load the input files
logger.info(f"\nInput directory: {input_dir}")
event_data_train = load_train_data_files(
input_dir, gamma_offaxis["min"], gamma_offaxis["max"]
)
# Configure the DISP regressor
logger.info("\nRF regressors:")
logger.info(format_object(config_rf["settings"]))
logger.info("\nFeatures:")
logger.info(format_object(config_rf["features"]))
logger.info(f"\nUse unsigned features: {use_unsigned_features}")
disp_regressor = DispRegressor(
config_rf["settings"], config_rf["features"], use_unsigned_features
)
# Create the output directory
Path(output_dir).mkdir(exist_ok=True, parents=True)
# Loop over every telescope combination type
for tel_combo, df_train in event_data_train.items():
logger.info(f"\nDISP regressors for the '{tel_combo}' type:")
# Train the RFs
disp_regressor.fit(df_train)
# Check the feature importance
for tel_id, telescope_rf in disp_regressor.telescope_rfs.items():
importances = telescope_rf.feature_importances_.round(5)
importances = dict(zip(disp_regressor.features, importances))
logger.info(f"\n{TEL_NAMES[tel_id]} feature importance:")
logger.info(format_object(importances))
# Save the trained RFs to an output file
if use_unsigned_features:
output_file = f"{output_dir}/disp_regressors_{tel_combo}_unsigned.joblib"
else:
output_file = f"{output_dir}/disp_regressors_{tel_combo}.joblib"
disp_regressor.save(output_file)
logger.info(f"\nOutput file: {output_file}")
[docs]
def train_event_classifier(
input_dir_gamma, input_dir_proton, output_dir, config, use_unsigned_features=False
):
"""
Trains event classifiers with gamma and proton MC DL1-stereo events.
Parameters
----------
input_dir_gamma : str
Path to a directory where input gamma MC data files are stored
input_dir_proton : str
Path to a directory where input proton MC data files are stored
output_dir : str
Path to a directory where to save trained RFs
config : dict
Configuration for the LST-1 + MAGIC analysis
use_unsigned_features : bool
If `True`, it uses unsigned features for training RFs
"""
config_rf = config["event_classifier"]
gamma_offaxis = config_rf["gamma_offaxis"]
logger.info("\nGamma off-axis angles allowed:")
logger.info(format_object(gamma_offaxis))
# Load the input gamma MC data files
logger.info(f"\nInput gamma MC directory: {input_dir_gamma}")
event_data_gamma = load_train_data_files(
input_dir_gamma, gamma_offaxis["min"], gamma_offaxis["max"], EVENT_CLASS_GAMMA
)
# Load the input proton MC data files
logger.info(f"\nInput proton MC directory: {input_dir_proton}")
event_data_proton = load_train_data_files(
input_dir_proton, true_event_class=EVENT_CLASS_PROTON
)
# Configure the event classifier
logger.info("\nRF classifiers:")
logger.info(format_object(config_rf["settings"]))
logger.info("\nFeatures:")
logger.info(format_object(config_rf["features"]))
logger.info(f"\nUse unsigned features: {use_unsigned_features}")
event_classifier = EventClassifier(
config_rf["settings"], config_rf["features"], use_unsigned_features
)
# Create the output directory
Path(output_dir).mkdir(exist_ok=True, parents=True)
# Loop over every telescope combination type
common_combinations = set(event_data_gamma.keys()) & set(event_data_proton.keys())
for tel_combo in sorted(common_combinations):
logger.info(f"\nEvent classifiers for the '{tel_combo}' type:")
df_gamma = event_data_gamma[tel_combo]
df_proton = event_data_proton[tel_combo]
# Adjust the number of training samples
n_events_gamma = len(df_gamma.groupby(GROUP_INDEX_TRAIN).size())
n_events_proton = len(df_proton.groupby(GROUP_INDEX_TRAIN).size())
if n_events_gamma > n_events_proton:
logger.info(f"Extracting {n_events_proton} gamma MC events...")
df_gamma = get_events_at_random(df_gamma, n_events_proton)
elif n_events_proton > n_events_gamma:
logger.info(f"Extracting {n_events_gamma} proton MC events...")
df_proton = get_events_at_random(df_proton, n_events_gamma)
df_train = pd.concat([df_gamma, df_proton])
# Train the RFs
event_classifier.fit(df_train)
# Check the feature importance
for tel_id, telescope_rf in event_classifier.telescope_rfs.items():
importances = telescope_rf.feature_importances_.round(5)
importances = dict(zip(event_classifier.features, importances))
logger.info(f"\n{TEL_NAMES[tel_id]} feature importance:")
logger.info(format_object(importances))
# Save the trained RFs to an output file
if use_unsigned_features:
output_file = f"{output_dir}/event_classifiers_{tel_combo}_unsigned.joblib"
else:
output_file = f"{output_dir}/event_classifiers_{tel_combo}.joblib"
event_classifier.save(output_file)
logger.info(f"\nOutput file: {output_file}")
def main():
"""Main function."""
start_time = time.time()
parser = argparse.ArgumentParser()
parser.add_argument(
"--input-dir-gamma",
"-g",
dest="input_dir_gamma",
type=str,
required=True,
help="Path to a directory where input gamma MC data files are stored",
)
parser.add_argument(
"--input-dir-proton",
"-p",
dest="input_dir_proton",
type=str,
help="Path to a directory where input proton MC data files are stored",
)
parser.add_argument(
"--output-dir",
"-o",
dest="output_dir",
type=str,
default="./data",
help="Path to a directory where to save trained RFs",
)
parser.add_argument(
"--config-file",
"-c",
dest="config_file",
type=str,
default="./config.yaml",
help="Path to a configuration file",
)
parser.add_argument(
"--train-energy",
dest="train_energy",
action="store_true",
help="Train energy regressors",
)
parser.add_argument(
"--train-disp",
dest="train_disp",
action="store_true",
help="Train DISP regressors",
)
parser.add_argument(
"--train-classifier",
dest="train_classifier",
action="store_true",
help="Train event classifiers",
)
parser.add_argument(
"--use-unsigned",
dest="use_unsigned",
action="store_true",
help="Use unsigned features for training RFs",
)
args = parser.parse_args()
with open(args.config_file, "rb") as f:
config = yaml.safe_load(f)
# Train RFs
if args.train_energy:
train_energy_regressor(
args.input_dir_gamma, args.output_dir, config, args.use_unsigned
)
if args.train_disp:
train_disp_regressor(
args.input_dir_gamma, args.output_dir, config, args.use_unsigned
)
if args.train_classifier:
train_event_classifier(
input_dir_gamma=args.input_dir_gamma,
input_dir_proton=args.input_dir_proton,
output_dir=args.output_dir,
config=config,
use_unsigned_features=args.use_unsigned,
)
if not any([args.train_energy, args.train_disp, args.train_classifier]):
raise ValueError(
"The RF type is not specified. Please see the usage with `--help`."
)
logger.info("\nDone.")
process_time = time.time() - start_time
logger.info(f"\nProcess time: {process_time:.0f} [sec]\n")
if __name__ == "__main__":
main()