# Dataset Definition

In [1]:
from skyllh.core.config import (
 Config,
)
from skyllh.core.dataset import (
 Dataset,
 DatasetCollection,
)

In [2]:
# Create the configuration instance. It is handed to the dataset below through
# its ``cfg`` argument.
cfg = Config()

In [3]:
# Define a single dataset.
#
# The version information is substituted into ``default_sub_path_fmt``: with
# ``version=1`` and the ``patch`` qualifier set to 0 the sub directory becomes
# ``my_dataset_v001_p00`` below ``base_path``, which is where the experimental
# and Monte-Carlo files are looked up (see the printed summary further down).
my_dataset = Dataset(
    cfg=cfg,
    name='My Dataset',
    # Location of the data files.
    base_path='/data/ana/analyses/',
    default_sub_path_fmt='my_dataset_v{version:03d}_p{patch:02d}',
    exp_pathfilenames='exp.npy',
    mc_pathfilenames='mc.npy',
    # Versioning of the dataset.
    version=1,
    verqualifiers={'patch': 0},
    # The printed summary reports this value in days.
    livetime=365,
)

# Group datasets in a named collection; here it holds only the dataset
# defined above.
dsc = DatasetCollection(
    name='My Dataset Collection',
    description=(
        'This is my dataset collection containing all my individual '
        'datasets.'
    ),
)
dsc.add_datasets((my_dataset,))



In [4]:
# Print the collection summary: its description followed by every contained
# dataset with its data files.
print(dsc)

DatasetCollection "My Dataset Collection"
--------------------------------------------------------------------------------
Description:
This is my dataset collection containing all my individual datasets.
Available datasets:

 Dataset "My Dataset": v001patch00
 { livetime = 365.000 days }
 Experimental data:
 [[92mFOUND[0m] /data/ana/analyses/my_dataset_v001_p00/exp.npy
 MC data:
 [[92mFOUND[0m] /data/ana/analyses/my_dataset_v001_p00/mc.npy
 


In [5]:
# Retrieve the dataset from the collection by its name and print its summary.
my_dataset = dsc.get_dataset('My Dataset')
print(my_dataset)

Dataset "My Dataset": v001patch00
 { livetime = 365.000 days }
 Experimental data:
 [[92mFOUND[0m] /data/ana/analyses/my_dataset_v001_p00/exp.npy
 MC data:
 [[92mFOUND[0m] /data/ana/analyses/my_dataset_v001_p00/mc.npy
 


## Auxiliary data files

In [6]:
# Register an auxiliary data file under the key 'aux_file_key_1'. The path is
# given relative to the dataset's directory, as the resolved path in the
# dataset summary printed in the next cell shows.
my_dataset.add_aux_data_definition('aux_file_key_1', 'aux_data/aux_file1.dat')

In [7]:
# Print the dataset summary again; it now also lists the auxiliary data file.
print(my_dataset)

Dataset "My Dataset": v001patch00
 { livetime = 365.000 days }
 Experimental data:
 [[92mFOUND[0m] /data/ana/analyses/my_dataset_v001_p00/exp.npy
 MC data:
 [[92mFOUND[0m] /data/ana/analyses/my_dataset_v001_p00/mc.npy
 Auxiliary data:
 aux_file_key_1: 
 [[92mFOUND[0m] /data/ana/analyses/my_dataset_v001_p00/aux_data/aux_file1.dat


In [8]:
# Attach auxiliary data under the name 'aux_data_1' by passing the values
# themselves (here a plain list) rather than a path to a file.
my_dataset.add_aux_data('aux_data_1', [1, 2, 3])

## Dataset Origin

In [9]:
from skyllh.core.dataset import (
 DatasetOrigin,
 WGETDatasetTransfer,
)

In [10]:
# Describe where the dataset can be obtained from and attach that description
# to the dataset.
# NOTE(review): presumably host, base_path and sub_path together form the
# remote location of the dataset directory -- confirm against the
# DatasetOrigin documentation.
origin = DatasetOrigin(
    host='data.mydomain.com',
    base_path='/downloads/data',
    sub_path='my_dataset',
    # Bound ``transfer`` method of a wget-based transfer object configured for
    # the https protocol.
    transfer_func=WGETDatasetTransfer(protocol='https').transfer,
)
my_dataset.origin = origin

### Origin as archive file

In [11]:
# Origin variant where the dataset is provided as a single archive file:
# ``sub_path`` is left empty and ``filename`` names the archive.
# NOTE(review): unlike the directory-based origin defined earlier, this origin
# is not assigned to ``my_dataset.origin`` -- confirm whether that is intended.
origin = DatasetOrigin(
    host='data.mydomain.com',
    base_path='/downloads/data',
    sub_path='',
    filename='my_dataset.zip',
    transfer_func=WGETDatasetTransfer(protocol='https').transfer,
    # Referenced on the class, not on an instance; presumably unpacks the
    # archive once the transfer has finished -- verify.
    post_transfer_func=WGETDatasetTransfer.post_transfer_unzip,
)

## Downloading the dataset