Description
The efficient implementation of complex data analysis workflows (DAWs) in various scientific disciplines requires deep knowledge of a large stack – consisting of an abstract DAW description, compilation of a logical plan, mapping onto the currently available infrastructure, and appropriate configuration of execution engines. Components and configurations developed for one computational infrastructure are often unsuitable for another, leading either to an undesirable platform lock-in or to a considerable loss of efficiency.
The goal of subproject B1 is therefore to improve portability. To this end, we
- compare DAW requirements with declarative descriptions of the available infrastructure,
- profile both DAWs and infrastructure as needed, and
- then map the DAWs onto the infrastructure using novel scheduling and load balancing (SLB) techniques to automatically optimize efficiency.
Ultimately, we aim to allow scientists to focus on the domain-specific challenges in their DAWs, while our new components provide an efficient selection and use of the available computing infrastructure automatically.

PIs
Publications
15 entries « ‹ 1 of 2
› » 2022
Jonathan Will; Lauritz Thamsen; Jonathan Bader; Dominik Scheinert; Odej Kao
Ruya: Memory-Aware Iterative Optimization of Cluster Configurations for Big Data Processing Inproceedings
In: 2022 IEEE International Conference on Big Data (IEEE BigData 2022), IEEE, 2022.
@inproceedings{will2022ruya,
  author     = {Jonathan Will and Lauritz Thamsen and Jonathan Bader and Dominik Scheinert and Odej Kao},
  title      = {Ruya: Memory-Aware Iterative Optimization of Cluster Configurations for Big Data Processing},
  booktitle  = {2022 IEEE International Conference on Big Data (IEEE BigData 2022)},
  publisher  = {IEEE},
  year       = {2022},
  date       = {2022-12-20},
  url        = {https://arxiv.org/pdf/2211.04240.pdf},
  urldate    = {2022-12-20},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Dominik Scheinert; Soeren Becker; Jonathan Bader; Lauritz Thamsen; Jonathan Will; Odej Kao
Perona: Robust Infrastructure Fingerprinting for Resource-Efficient Big Data Analytics Inproceedings
In: 2022 IEEE International Conference on Big Data (IEEE BigData 2022), IEEE, 2022.
@inproceedings{scheinert2022PeronaRI,
  author     = {Dominik Scheinert and Soeren Becker and Jonathan Bader and Lauritz Thamsen and Jonathan Will and Odej Kao},
  title      = {Perona: Robust Infrastructure Fingerprinting for Resource-Efficient Big Data Analytics},
  booktitle  = {2022 IEEE International Conference on Big Data (IEEE BigData 2022)},
  publisher  = {IEEE},
  year       = {2022},
  date       = {2022-12-20},
  url        = {https://arxiv.org/pdf/2211.08227.pdf},
  urldate    = {2022-12-20},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Jonathan Bader; Nicolas Zunker; Soeren Becker; Odej Kao
Leveraging Reinforcement Learning for Task Resource Allocation in Scientific Workflows Inproceedings
In: 2022 IEEE International Conference on Big Data (IEEE BigData 2022), IEEE, 2022.
@inproceedings{bader2022RL,
  author     = {Jonathan Bader and Nicolas Zunker and Soeren Becker and Odej Kao},
  title      = {Leveraging Reinforcement Learning for Task Resource Allocation in Scientific Workflows},
  booktitle  = {2022 IEEE International Conference on Big Data (IEEE BigData 2022)},
  publisher  = {IEEE},
  year       = {2022},
  date       = {2022-12-20},
  url        = {https://arxiv.org/pdf/2211.12076.pdf},
  urldate    = {2022-12-20},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Jonathan Bader; Joel Witzke; Soeren Becker; Ansgar Lößer; Fabian Lehmann; Leon Doehler; Anh Duc Vu; Odej Kao
Towards Advanced Monitoring for Scientific Workflows Inproceedings
In: 2022 IEEE International Conference on Big Data (IEEE BigData 2022), IEEE, 2022.
@inproceedings{bader2022towards,
  author     = {Jonathan Bader and Joel Witzke and Soeren Becker and Ansgar Lößer and Fabian Lehmann and Leon Doehler and Anh Duc Vu and Odej Kao},
  title      = {Towards Advanced Monitoring for Scientific Workflows},
  booktitle  = {2022 IEEE International Conference on Big Data (IEEE BigData 2022)},
  publisher  = {IEEE},
  year       = {2022},
  date       = {2022-12-20},
  url        = {https://arxiv.org/pdf/2211.12744.pdf},
  urldate    = {2022-12-20},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Jonathan Will; Lauritz Thamsen; Jonathan Bader; Dominik Scheinert; Odej Kao
Get Your Memory Right: The Crispy Resource Allocation Assistant for Large-Scale Data Processing Inproceedings
In: 2022 IEEE International Conference on Cloud Engineering (IC2E), IEEE, 2022.
@inproceedings{will2022memory,
  author     = {Jonathan Will and Lauritz Thamsen and Jonathan Bader and Dominik Scheinert and Odej Kao},
  title      = {Get Your Memory Right: The {Crispy} Resource Allocation Assistant for Large-Scale Data Processing},
  booktitle  = {2022 IEEE International Conference on Cloud Engineering (IC2E)},
  publisher  = {IEEE},
  year       = {2022},
  date       = {2022-12-15},
  url        = {https://arxiv.org/pdf/2206.13852.pdf},
  urldate    = {2022-12-15},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Jonathan Bader; Kevin Styp-Rekowski; Leon Doehler; Soeren Becker; Odej Kao
Macaw: The Machine Learning Magnetometer Calibration Workflow Inproceedings
In: 2022 International Conference on Data Mining Workshops (ICDMW), IEEE, 2022.
@inproceedings{baderStypRekowski2022ICDMW,
  author     = {Jonathan Bader and Kevin Styp-Rekowski and Leon Doehler and Soeren Becker and Odej Kao},
  title      = {Macaw: The Machine Learning Magnetometer Calibration Workflow},
  booktitle  = {2022 International Conference on Data Mining Workshops (ICDMW)},
  publisher  = {IEEE},
  year       = {2022},
  date       = {2022-12-01},
  url        = {https://arxiv.org/abs/2210.08897},
  urldate    = {2022-01-01},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Jonathan Bader; Fabian Lehmann; Alexander Groth; Lauritz Thamsen; Dominik Scheinert; Jonathan Will; Ulf Leser; Odej Kao
Reshi: Recommending Resources for Scientific Workflow Tasks on Heterogeneous Infrastructures Inproceedings
In: 41st International Performance Computing and Communications Conference 2022, IEEE, 2022.
@inproceedings{baderReshi2022IPCCC,
  author     = {Jonathan Bader and Fabian Lehmann and Alexander Groth and Lauritz Thamsen and Dominik Scheinert and Jonathan Will and Ulf Leser and Odej Kao},
  title      = {Reshi: Recommending Resources for Scientific Workflow Tasks on Heterogeneous Infrastructures},
  booktitle  = {41st International Performance Computing and Communications Conference 2022},
  publisher  = {IEEE},
  year       = {2022},
  date       = {2022-10-12},
  doi        = {10.1109/IPCCC55026.2022.9894299},
  url        = {https://ieeexplore.ieee.org/document/9894299},
  urldate    = {2022-01-01},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Jonathan Bader; Fabian Lehmann; Lauritz Thamsen; Jonathan Will; Ulf Leser; Odej Kao
Lotaru: Locally Estimating Runtimes of Scientific Workflow Tasks in Heterogeneous Clusters Inproceedings
In: 34th International Conference on Scientific and Statistical Database Management (SSDBM 2022), pp. 1–12, ACM, 2022.
@inproceedings{baderLotaruLocallyEstimating2022,
  author     = {Jonathan Bader and Fabian Lehmann and Lauritz Thamsen and Jonathan Will and Ulf Leser and Odej Kao},
  title      = {Lotaru: Locally Estimating Runtimes of Scientific Workflow Tasks in Heterogeneous Clusters},
  booktitle  = {34th International Conference on Scientific and Statistical Database Management (SSDBM 2022)},
  pages      = {1--12},
  publisher  = {ACM},
  year       = {2022},
  date       = {2022-08-23},
  doi        = {10.1145/3538712.3538739},
  url        = {https://dl.acm.org/doi/abs/10.1145/3538712.3538739},
  urldate    = {2022-08-23},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
Lauritz Thamsen; Dominik Scheinert; Jonathan Will; Jonathan Bader; Odej Kao
Collaborative Cluster Configuration for Distributed Data-Parallel Processing: A Research Overview Journal Article
In: Datenbank-Spektrum, vol. 22, pp. 143–151, 2022.
@article{Thamsen2022-zu,
  author     = {Lauritz Thamsen and Dominik Scheinert and Jonathan Will and Jonathan Bader and Odej Kao},
  title      = {Collaborative Cluster Configuration for Distributed Data-Parallel Processing: A Research Overview},
  journal    = {Datenbank-Spektrum},
  volume     = {22},
  pages      = {143--151},
  year       = {2022},
  date       = {2022-05-31},
  doi        = {10.1007/s13222-022-00416-z},
  url        = {https://link.springer.com/article/10.1007/s13222-022-00416-z},
  urldate    = {2022-05-31},
  abstract   = {Many organizations routinely analyze large datasets using systems
for distributed data-parallel processing and clusters of
commodity resources. Yet, users need to configure adequate
resources for their data processing jobs. This requires
significant insights into expected job runtimes and scaling
behavior, resource characteristics, input data distributions, and
other factors. Unable to estimate performance accurately, users
frequently overprovision resources for their jobs, leading to low
resource utilization and high costs.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {article},
}
Many organizations routinely analyze large datasets using systems
for distributed data-parallel processing and clusters of
commodity resources. Yet, users need to configure adequate
resources for their data processing jobs. This requires
significant insights into expected job runtimes and scaling
behavior, resource characteristics, input data distributions, and
other factors. Unable to estimate performance accurately, users
frequently overprovision resources for their jobs, leading to low
resource utilization and high costs.
Dominik Scheinert; Alireza Alamgiralem; Jonathan Bader; Jonathan Will; Thorsten Wittkopp; Lauritz Thamsen
On the Potential of Execution Traces for Batch Processing Workload Optimization in Public Clouds Inproceedings
In: 2021 IEEE International Conference on Big Data (Big Data), pp. 3113–3118, 2022.
@inproceedings{9671275,
  author     = {Dominik Scheinert and Alireza Alamgiralem and Jonathan Bader and Jonathan Will and Thorsten Wittkopp and Lauritz Thamsen},
  title      = {On the Potential of Execution Traces for Batch Processing Workload Optimization in Public Clouds},
  booktitle  = {2021 IEEE International Conference on Big Data (Big Data)},
  pages      = {3113--3118},
  year       = {2022},
  date       = {2022-01-13},
  doi        = {10.1109/BigData52589.2021.9671275},
  urldate    = {2021-01-01},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {inproceedings},
}
15 entries « ‹ 1 of 2
› »