Continued from A6: Data Analysis Workflows for Interactive Scientific Exploration
Description
The process of specifying DAWs for scientific discoveries is often exploratory, involving the repeated adaptation of a DAW based on the results of previous executions. So far, we have studied the design of DAWs for scientific exploration from the perspective of a single user. Now, our focus shifts to the perspective of an entire scientific community, in which analysis pipelines are shared, best practices for data handling emerge, and benchmarks are published. Specifically, we target support for the neuroimaging community, where DAWs handle sequences of brain imaging data.

Scientists
- Mahdi Esmailoghli
- Omar Sherif
Publications
2024
Elfaramawy, Nourhan; Deniz, Fatma; Grunske, Lars; Hilbrich, Marcus; Kehrer, Timo; Lamprecht, Anna-Lena; Mendling, Jan; Weidlich, Matthias
On Managing Large Collections of Scientific Workflows Miscellaneous
Modellierung 2024 Satellite Events, 2024.
@misc{Elfaramawy24,
  author       = {Nourhan Elfaramawy and Fatma Deniz and Lars Grunske and Marcus Hilbrich and Timo Kehrer and Anna-Lena Lamprecht and Jan Mendling and Matthias Weidlich},
  title        = {On Managing Large Collections of Scientific Workflows},
  howpublished = {Modellierung 2024 Satellite Events},
  publisher    = {Gesellschaft für Informatik e.V.},
  year         = {2024},
  date         = {2024-01-01},
  urldate      = {2024-01-01},
  doi          = {10.18420/modellierung2024-ws-012},
  keywords     = {},
  pubstate     = {published},
  tppubtype    = {misc}
}
Pohl, Sebastian; Elfaramawy, Nourhan; Miling, Artur; Cao, Kedi; Kehr, Birte; Weidlich, Matthias
How Do Users Design Scientific Workflows? The Case of Snakemake and Nextflow Proceedings Article
In: Proceedings of the 36th International Conference on Scientific and Statistical Database Management, Association for Computing Machinery, Rennes, France, 2024, ISBN: 9798400710209.
@inproceedings{10.1145/3676288.3676290,
  author    = {Sebastian Pohl and Nourhan Elfaramawy and Artur Miling and Kedi Cao and Birte Kehr and Matthias Weidlich},
  title     = {How Do Users Design Scientific Workflows? The Case of Snakemake and Nextflow},
  booktitle = {Proceedings of the 36th International Conference on Scientific and Statistical Database Management},
  series    = {SSDBM '24},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  venue     = {Rennes, France},
  year      = {2024},
  date      = {2024-01-01},
  urldate   = {2024-01-01},
  isbn      = {9798400710209},
  doi       = {10.1145/3676288.3676290},
  url       = {https://doi.org/10.1145/3676288.3676290},
  abstract  = {Scientific workflows automate the analysis of large-scale scientific data, fostering the reuse of data processing operators as well as the reproducibility and traceability of analysis results. In exploratory research, however, workflows are continuously adapted, utilizing a wide range of tools and software libraries, to test scientific hypotheses. Script-based workflow engines cater to the required flexibility through direct integration of programming primitives, but lack abstractions for interactive exploration of the workflow design by a user during workflow execution. In this paper, we study the requirements for the design of interactive workflows through the lens of existing workflow collections. Specifically, we focus on two widely used script-based workflow engines: Snakemake and Nextflow. For Snakemake, we collected workflows from 1602 GitHub repositories that are listed in the Snakemake workflow catalog. For Nextflow, we adopted the nf-core collection with workflows from 94 GitHub repositories as the basis for our study. Using these collections, we present insights on common structures in the design of the workflows. Moreover, we report on results on the language features typically adopted in workflow specification, with a focus on the concepts that are of particular importance for interactive workflows.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2023
Pohl, Sebastian; Elfaramawy, Nourhan; Cao, Kedi; Kehr, Birte; Weidlich, Matthias
How do users design scientific workflows? The Case of Snakemake Working paper
arXiv, 2023.
@workingpaper{nokey,
  author       = {Sebastian Pohl and Nourhan Elfaramawy and Kedi Cao and Birte Kehr and Matthias Weidlich},
  title        = {How do users design scientific workflows? The Case of Snakemake},
  howpublished = {arXiv},
  eprint       = {2309.14097},
  eprinttype   = {arXiv},
  year         = {2023},
  date         = {2023-09-25},
  urldate      = {2023-09-25},
  doi          = {10.48550/arXiv.2309.14097},
  url          = {https://doi.org/10.48550/arXiv.2309.14097},
  abstract     = {Scientific workflows automate the analysis of large-scale scientific data, fostering the reuse of data processing operators as well as the reproducibility and traceability of analysis results. In exploratory research, however, workflows are continuously adapted, utilizing a wide range of tools and software libraries, to test scientific hypotheses. Script-based workflow engines cater to the required flexibility through direct integration of programming primitives but lack abstractions for interactive exploration of the workflow design by a user during workflow execution. To derive requirements for such interactive workflows, we conduct an empirical study on the use of Snakemake, a popular Python-based workflow engine. Based on workflows collected from 1602 GitHub repositories, we present insights on common structures of Snakemake workflows, as well as the language features typically adopted in their specification.},
  keywords     = {},
  pubstate     = {published},
  tppubtype    = {workingpaper}
}
Cao, Kedi; Elfaramawy, Nourhan; Weidlich, Matthias; Kehr, Birte
From Program Chains to Exploratory Workflows: PopinSnake for Genomic Insertion Detection Proceedings Article
In: 2023 IEEE 19th International Conference on e-Science (e-Science), pp. 1-7, 2023.
@inproceedings{10254924,
  author    = {Kedi Cao and Nourhan Elfaramawy and Matthias Weidlich and Birte Kehr},
  title     = {From Program Chains to Exploratory Workflows: PopinSnake for Genomic Insertion Detection},
  booktitle = {2023 IEEE 19th International Conference on e-Science (e-Science)},
  pages     = {1--7},
  year      = {2023},
  date      = {2023-01-01},
  urldate   = {2023-01-01},
  doi       = {10.1109/e-Science58273.2023.10254924},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2022
Elfaramawy, Nourhan
Interactive Workflows for Exploratory Data Analysis Proceedings Article
In: Bao, Zhifeng; Sellis, Timos (Ed.): Proceedings of the VLDB 2022 PhD Workshop co-located with the 48th International Conference on Very Large Databases (VLDB 2022), Sydney, Australia, September 5, 2022, CEUR-WS.org, 2022.
@inproceedings{elfaramawy2022Interactive,
  author    = {Nourhan Elfaramawy},
  title     = {Interactive Workflows for Exploratory Data Analysis},
  editor    = {Zhifeng Bao and Timos Sellis},
  booktitle = {Proceedings of the VLDB 2022 PhD Workshop co-located with the 48th International Conference on Very Large Databases (VLDB 2022), Sydney, Australia, September 5, 2022},
  series    = {CEUR Workshop Proceedings},
  volume    = {3186},
  publisher = {CEUR-WS.org},
  year      = {2022},
  date      = {2022-01-01},
  urldate   = {2022-01-01},
  url       = {http://ceur-ws.org/Vol-3186/paper_2.pdf},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2021
Krannich, Thomas; White, W Timothy J; Niehus, Sebastian; Holley, Guillaume; Halldórsson, Bjarni V; Kehr, Birte
Population-scale detection of non-reference sequence variants using colored de Bruijn graphs Journal Article
In: Bioinformatics, vol. 38, no. 3, pp. 604-611, 2021, ISSN: 1367-4803.
@article{10.1093/bioinformatics/btab749,
  author    = {Thomas Krannich and W Timothy J White and Sebastian Niehus and Guillaume Holley and Bjarni V Halldórsson and Birte Kehr},
  title     = {Population-scale detection of non-reference sequence variants using colored de Bruijn graphs},
  journal   = {Bioinformatics},
  volume    = {38},
  number    = {3},
  pages     = {604--611},
  year      = {2021},
  date      = {2021-01-01},
  urldate   = {2021-01-01},
  issn      = {1367-4803},
  doi       = {10.1093/bioinformatics/btab749},
  url       = {https://doi.org/10.1093/bioinformatics/btab749},
  abstract  = {With the increasing throughput of sequencing technologies, structural variant (SV) detection has become possible across tens of thousands of genomes. Non-reference sequence (NRS) variants have drawn less attention compared with other types of SVs due to the computational complexity of detecting them. When using short-read data, the detection of NRS variants inevitably involves a de novo assembly which requires high-quality sequence data at high coverage. Previous studies have demonstrated how sequence data of multiple genomes can be combined for the reliable detection of NRS variants. However, the algorithms proposed in these studies have limited scalability to larger sets of genomes. We introduce PopIns2, a tool to discover and characterize NRS variants in many genomes, which scales to considerably larger numbers of genomes than its predecessor PopIns. In this article, we briefly outline the PopIns2 workflow and highlight our novel algorithmic contributions. We developed an entirely new approach for merging contig assemblies of unaligned reads from many genomes into a single set of NRS using a colored de Bruijn graph. Our tests on simulated data indicate that the new merging algorithm ranks among the best approaches in terms of quality and reliability and that PopIns2 shows the best precision for a growing number of genomes processed. Results on the Polaris Diversity Cohort and a set of 1000 Icelandic human genomes demonstrate unmatched scalability for the application on population-scale datasets. The source code of PopIns2 is available from https://github.com/kehrlab/PopIns2. Supplementary data are available at Bioinformatics online.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
