% Preprint / working paper introducing the POLAR polarization dataset.
% NOTE(review): techreport entries require an `institution` field, which this
% export omits -- add it (or switch the entry type) once the hosting archive is confirmed.
@techreport{32789f3e22c64d67884bdeaa102d1e66,
title = "{POLAR}: A Benchmark for Multilingual, Multicultural, and Multi-Event Online Polarization",
abstract = "Online polarization poses a growing challenge for democratic discourse, yet most computational social science research remains monolingual, culturally narrow, or event-specific. We introduce POLAR, a multilingual, multicultural, and multievent dataset with over 23k instances in seven languages from diverse online platforms and real-world events. Polarization is annotated along three axes: presence, type, and manifestation, using a variety of annotation platforms adapted to each cultural context. We conduct two main experiments: (1) we fine-tune six multilingual pretrained language models in both monolingual and cross-lingual setups; and (2) we evaluate a range of open and closed large language models (LLMs) in few-shot and zero-shot scenarios. Results show that while most models perform well on binary polarization detection, they achieve substantially lower scores when predicting polarization types and manifestations. These findings highlight the complex, highly contextual nature of polarization and the need for robust, adaptable approaches in NLP and computational social science. All resources will be released to support further research and effective mitigation of digital polarization globally.",
keywords = "cs.CL",
author = "Usman Naseem and Juan Ren and Saba Anwar and Sarah Kohail and Veliz, Rudy Alexandro Garrido and Robert Geislinger and Aisha Jabr and Idris Abdulmumin and Laiba Qureshi and Borkar, Aarushi Ajay and Mukhtar, Maryam Ibrahim and Ayele, Abinew Ali and Ahmad, Ibrahim Said and Adem Ali and Martin Semmann and Muhammad, Shamsuddeen Hassan and Yimam, Seid Muhie",
note = "Preprint",
year = "2025",
month = may,
day = "27",
language = "English",
type = "Working Paper",
}
% Working paper on fine-tuning the Whisper ASR model for Amharic.
% NOTE(review): techreport entries require an `institution` field, which this
% export omits -- add it (or switch the entry type) once the hosting archive is confirmed.
@techreport{ecec0c4f2980493eab4efe19ffc9465b,
title = "Whispering in {Amharic}: Fine-tuning {Whisper} for Low-resource Language",
abstract = "This work explores fine-tuning OpenAI's Whisper automatic speech recognition (ASR) model for Amharic, a low-resource language, to improve transcription accuracy. While the foundational Whisper model struggles with Amharic due to limited representation in its training data, we fine-tune it using datasets like Mozilla Common Voice, FLEURS, and the BDU-speech dataset. The best-performing model, Whispersmall-am, significantly improves when finetuned on a mix of existing FLEURS data and new, unseen Amharic datasets. Training solely on new data leads to poor performance, but combining it with FLEURS data reinforces the model, enabling better specialization in Amharic. We also demonstrate that normalizing Amharic homophones significantly enhances Word Error Rate (WER) and Bilingual Evaluation Understudy (BLEU) scores. This study underscores the importance of fine-tuning strategies and dataset composition for improving ASR in low-resource languages, providing insights for future Amharic speech recognition research.",
keywords = "cs.CL, cs.LG",
author = "Gete, Dawit Ketema and Ahmed, Bedru Yimam and Belay, Tadesse Destaw and Ejigu, Yohannes Ayana and Imam, Sukairaj Hafiz and Tessema, Alemu Belay and Adem, Mohammed Oumer and Belay, Tadesse Amare and Robert Geislinger and Musa, Umma Aliyu and Martin Semmann and Muhammad, Shamsuddeen Hassan and Henning Schreiber and Yimam, Seid Muhie",
year = "2025",
month = mar,
day = "24",
language = "English",
type = "Working Paper",
}
% Working paper presenting the BRIGHTER multilingual emotion-recognition datasets.
% NOTE(review): techreport entries require an `institution` field, which this
% export omits -- add it (or switch the entry type) once the hosting archive is confirmed.
@techreport{5e7c43d59fd046368f1d25ddc11b7843,
title = "{BRIGHTER}: {BRIdging} the Gap in Human-Annotated Textual Emotion Recognition Datasets for 28 Languages",
abstract = "People worldwide use language in subtle and complex ways to express emotions. While emotion recognition -- an umbrella term for several NLP tasks -- significantly impacts different applications in NLP and other fields, most work in the area is focused on high-resource languages. Therefore, this has led to major disparities in research and proposed solutions, especially for low-resource languages that suffer from the lack of high-quality datasets. In this paper, we present BRIGHTER -- a collection of multilabeled emotion-annotated datasets in 28 different languages. BRIGHTER covers predominantly low-resource languages from Africa, Asia, Eastern Europe, and Latin America, with instances from various domains annotated by fluent speakers. We describe the data collection and annotation processes and the challenges of building these datasets. Then, we report different experimental results for monolingual and crosslingual multi-label emotion identification, as well as intensity-level emotion recognition. We investigate results with and without using LLMs and analyse the large variability in performance across languages and text domains. We show that BRIGHTER datasets are a step towards bridging the gap in text-based emotion recognition and discuss their impact and utility.",
keywords = "cs.CL",
author = "Muhammad, Shamsuddeen Hassan and Nedjma Ousidhoum and Idris Abdulmumin and Wahle, Jan Philip and Terry Ruas and Meriem Beloucif and de Kock, Christine and Nirmal Surange and Daniela Teodorescu and Ahmad, Ibrahim Said and Adelani, David Ifeoluwa and Aji, Alham Fikri and Ali, Felermino D. M. A. and Ilseyar Alimova and Vladimir Araujo and Nikolay Babakov and Naomi Baes and Ana-Maria Bucur and Andiswa Bukula and Guanqun Cao and Cardenas, Rodrigo Tufino and Rendi Chevi and Chukwuneke, Chiamaka Ijeoma and Alexandra Ciobotaru and Daryna Dementieva and Gadanya, Murja Sani and Robert Geislinger and Bela Gipp and Oumaima Hourrane and Oana Ignat and Lawan, Falalu Ibrahim and Rooweither Mabuya and Rahmad Mahendra and Vukosi Marivate and Andrew Piper and Alexander Panchenko and Ferreira, Charles Henrique Porto and Vitaly Protasov and Samuel Rutunda and Manish Shrivastava and Udrea, Aura Cristina and Wanzare, Lilian Diana Awuor and Sophie Wu and Wunderlich, Florian Valentin and Zhafran, Hanif Muhammad and Tianhui Zhang and Yi Zhou and Mohammad, Saif M.",
note = "20 pages, under review",
year = "2025",
month = feb,
day = "17",
language = "English",
type = "Working Paper",
}
% Conference abstract (German) on AI-based counting of zebrafish in aquarium photos.
% Typed as a proceedings contribution so that `booktitle` is actually printed;
% German nouns in the title are brace-protected against sentence-casing styles.
@inproceedings{56d17e49d30f4db2b1f16c89c7b3f717,
title = "Automatisiertes {Erkennen} und {Z{\"a}hlen} von {Zebrab{\"a}rblingen} mittels k{\"u}nstlicher {Intelligenz}",
abstract = "Einleitung: Die Zucht und Haltung von Fischen erfolgt h{\"a}ufig in standardisierten H{\"a}lterungsanlagen. Insbesondere zu Beginn des Lebenszyklus der Tiere ist die t{\"a}glich Bestimmung der Tieranzahl aufgrund der hohen Besatzdichte kaum durchf{\"u}hrbar. Im Rahmen eines Bachelorseminars wurde ein System entwickelt, in dem die Tiere in regelm{\"a}{\ss}igen Abst{\"a}nden fotografiert werden, um mittels k{\"u}nstlicher Intelligenz einzelne Fische zu identifizieren und deren Anzahl zu ermitteln. Dabei wurden bereits vorhandene Modelle analysiert und anschlie{\ss}end f{\"u}r den Anwendungsfall angepasst. Beschreibung: Adulte Zebrab{\"a}rblinge (Danio rerio) werden in standardisierten Aquarien aus Glas mit Kies und einer Versteckm{\"o}glichkeit in Gruppen von bis zu 50 Tieren gehalten. Im ersten Schritt wurden regelm{\"a}{\ss}ig Fotos der Tiere erstellt und die KI damit {\"u}ber eine Web-basierte Schnittstelle gef{\"u}ttert und trainiert, sodass die KI Fische als Berechnungsobjekt erkennt. Im zweiten Schritt wurden die Aufnahmen standardisiert, in dem z.B. Abstand, Winkel, Gr{\"o}{\ss}e, Reflexionen, Lichtverh{\"a}ltnisse und Hintergrund angepasst wurden, ohne auf das Wohlbefinden der Tiere einzugreifen. Zus{\"a}tzlich wurde ber{\"u}cksichtigt, inwiefern die standardm{\"a}{\ss}ig verwendeten Versteckm{\"o}glichkeiten zur Ortung der Tiere genutzt werden kann. Schlussfolgerung: Das Ergebnis der Arbeit ist die Entwicklung einer {\"o}ffentlichen Webseite, auf die Aquarienbilder mit Zebrab{\"a}rblingen hochgeladen werden kann und die Anzahl der Tiere bestimmt wird. Diese Webseite steht frei unter https://zebrai.demo.hcds.uni-hamburg.de/ zur Verf{\"u}gung. Bitte beachten Sie, dass nur Bilder hochgeladen werden d{\"u}rfen, f{\"u}r die Sie auch die Bildrechte haben. Sie k{\"o}nnen auch andere Bilder von Fischen, Kaulkappen oder jungen Fr{\"o}schen hochladen. Jedes Bild verbessert den Lernerfolg der KI, sodass durch Ihre Unterst{\"u}tzung die Weiterentwicklung des Tools voranschreitet. Viel Spa{\ss} beim Ausprobieren. W{\"u}rdigung (Acknowledgement): Wir danken den Tierpflegenden der Fischhaltung im Institut f{\"u}r Zell- und Systembiologie der Tiere der Universit{\"a}t Hamburg f{\"u}r die tolle Unterst{\"u}tzung des Projektes. Referenzen: [1] Gie{\ss}mann, J., Albers, N., Subiza, R., Geislinger, R., Remus, S., Yimam, S. M. and Semmann M. 2024 ZebrAI - Zebrafish Counting using AI https://zebrai.demo.hcds.uni-hamburg.de/",
keywords = "KI, Zebrab{\"a}rbling, Danio, automatisiertes Z{\"a}hlen",
author = "Matthias Braun and Martin Semmann and Steffen Kubitz and Julia Giessmann and Nikolai Albers and Robert Geislinger",
year = "2024",
month = sep,
language = "German",
volume = "61",
booktitle = "Wissenschaftliche Tagung der Gesellschaft f{\"u}r Versuchstierkunde GV-SOLAS",
}
% Degree thesis on sentiment analysis for German-language data.
% NOTE(review): exported as phdthesis with the generic label "Final degree thesis";
% confirm the actual degree level before switching to a more specific entry type.
@phdthesis{f83869c4f82041cb942142c22d8b1b2c,
title = "Enhancing Sentiment Analysis: Model Comparison, Domain Adaptation, and Lexicon Evolution in {German} Data",
author = "Robert Geislinger",
year = "2024",
month = jul,
day = "25",
language = "English",
type = "Final degree thesis",
school = "University of Hamburg",
}
% System demonstration paper in the NAACL 2024 proceedings (Volume 3: System Demonstrations).
% Typed as a proceedings contribution so that `booktitle` and `editor` are rendered as the venue.
@inproceedings{70aa79ece4c747d090a4cf13411238d1,
title = "{Concept Over Time Analysis}: Unveiling Temporal Patterns for Qualitative Data Analysis",
abstract = "In this system demonstration paper, we present the Concept Over Time Analysis extension for the Discourse Analysis Tool Suite. The proposed tool empowers users to define, refine, and visualize their concepts of interest within an interactive interface. Adhering to the Human-in-the-loop paradigm, users can give feedback through sentence annotations. Utilizing few-shot sentence classification, the system employs Sentence Transformers to compute representations of sentences and concepts. Through an iterative process involving semantic similarity searches, sentence annotation, and fine-tuning with contrastive data, the model continuously refines, providing users with enhanced analysis outcomes. The final output is a timeline visualization of sentences classified to concepts. Especially suited for the Digital Humanities, Concept Over Time Analysis serves as a valuable tool for qualitative data analysis within extensive datasets. The chronological overview of concepts enables researchers to uncover patterns, trends, and shifts in discourse over time.",
author = "Tim Fischer and Florian Schneider and Robert Geislinger and Florian Helfer and Gertraud Koch and Chris Biemann",
year = "2024",
month = jun,
day = "1",
doi = "10.18653/v1/2024.naacl-demo.15",
language = "English",
pages = "148--157",
editor = "Kai-Wei Chang and Annie Lee and Nazneen Rajani",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 3: System Demonstrations)",
publisher = "Association for Computational Linguistics",
}
% Workshop paper on a language-learning application combining NLP techniques and eye-tracking.
% Typed as a proceedings contribution so that `booktitle` and `editor` are rendered as the venue.
@inproceedings{c90a9a37b65d48bbb5e10e1d54a9336c,
title = "Multi-Modal Learning Application -- Support Language Learners with {NLP} Techniques and Eye-Tracking",
author = "Robert Geislinger and Pourasad, Ali Ebrahimi and Deniz G{\"u}l and Daniel Djahangir and Yimam, Seid Muhie and Steffen Remus and Chris Biemann",
year = "2023",
month = sep,
day = "1",
language = "English",
pages = "6--11",
editor = "Piush Aggarwal and {\"O}zge Ala{\c c}am and Carina Silberer and Sina Zarrie{\ss} and Torsten Zesch",
booktitle = "Proceedings of the 1st Workshop on Linguistic Insights from and for Multimodal Language Processing",
publisher = "Association for Computational Linguistics",
}
% Paper in the KONVENS 2022 proceedings on open-source automatic subtitling of lecture videos.
% Typed as a proceedings contribution so that `booktitle` and `editor` are rendered as the venue.
@inproceedings{73239599a927471d81ba631181b253f0,
title = "Improved Open Source Automatic Subtitling for Lecture Videos",
author = "Robert Geislinger and Benjamin Milde and Chris Biemann",
year = "2022",
month = dec,
day = "1",
language = "English",
pages = "98--103",
editor = "Robin Schaefer and Xiaoyu Bai and Manfred Stede and Torsten Zesch",
booktitle = "Proceedings of the 18th Conference on Natural Language Processing (KONVENS 2022)",
publisher = "KONVENS 2022 Organizers",
}
% Degree thesis (German) on multi-channel speech recognition for BigBlueButton.
% German nouns and the product name are brace-protected against sentence-casing styles.
% NOTE(review): exported as phdthesis with the generic label "Final degree thesis";
% confirm the actual degree level before switching to a more specific entry type.
@phdthesis{391b1606a68f4955b08eda828d76bed3,
title = "Implementation und {Evaluation} automatischer {Mehrkanal-Spracherkennung} f{\"u}r das {Konferenzsystem} {BigBlueButton}",
author = "Robert Geislinger",
year = "2021",
month = sep,
day = "28",
language = "German",
type = "Final degree thesis",
school = "Universit{\"a}t Hamburg",
}
% Interspeech 2021 contribution on live subtitling for BigBlueButton.
% Typed as a proceedings contribution so that `booktitle` is rendered as the venue.
@inproceedings{9cbac6e1989a4b578e879e2d4111e911,
title = "Live Subtitling for {BigBlueButton} with Open-Source Software",
keywords = "Automatic speech recognition, VoIP, Automatic subtitles, Computer-supported collaborative work, Meeting transcription, Multi-party dialogue, Videoconferencing",
author = "Robert Geislinger and Benjamin Milde and Timo Baumann and Chris Biemann",
year = "2021",
month = sep,
language = "English",
pages = "3319--3320",
booktitle = "Interspeech 2021",
}