{"$schema": "https://c3voc.de/schedule/schema.json", "generator": {"name": "pretalx", "version": "2025.2.0.dev0"}, "schedule": {"url": "https://program.berlinbuzzwords.de/bbuzz25/schedule/", "version": "0.19", "base_url": "https://program.berlinbuzzwords.de", "conference": {"acronym": "bbuzz25", "title": "Berlin Buzzwords 2025", "start": "2025-06-15", "end": "2025-06-17", "daysCount": 3, "timeslot_duration": "00:05", "time_zone_name": "Europe/Berlin", "colors": {"primary": "#3d3182"}, "rooms": [{"name": "Kesselhaus", "slug": "4137-kesselhaus", "guid": "dba34920-af79-5710-85ed-8679584f4662", "description": null, "capacity": null}, {"name": "Maschinenhaus", "slug": "4138-maschinenhaus", "guid": "c8f83cdf-00e7-5bea-9469-3375e3a43a87", "description": null, "capacity": null}, {"name": "Palais Atelier", "slug": "4139-palais-atelier", "guid": "04e8173d-489b-5250-a10c-cf102399fb85", "description": null, "capacity": null}, {"name": "Frannz Salon", "slug": "4140-frannz-salon", "guid": "94a9f0bb-9031-51f0-a575-47b27f8e94bc", "description": null, "capacity": null}], "tracks": [], "days": [{"index": 1, "date": "2025-06-15", "day_start": "2025-06-15T04:00:00+02:00", "day_end": "2025-06-16T03:59:00+02:00", "rooms": {"Palais Atelier": [{"guid": "f1e86c48-05da-533d-a7bc-17dbf5a1140d", "code": "E9FJMZ", "id": 68453, "logo": null, "date": "2025-06-15T14:30:00+02:00", "start": "14:30", "duration": "03:00", "room": "Palais Atelier", "slug": "bbuzz25-68453-barcamp", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/E9FJMZ/", "title": "Barcamp", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Barcamps are informal sessions, a kind of \"un-conference\", with a schedule decided on the day. It is all driven by the interests and expertise of those who attend so each one is different, but ours are always great!", "description": "Although the barcamp doesn't have a strict schedule, it won't be completely devoid of structure! #bbuzz barcamps are dynamic events, focused on the overall Berlin Buzzwords topics, tackling the same challenges but in a different format. At the barcamp each session runs for 30 minutes giving enough time to get into the meat of a topic, but without a chance of anyone getting bored. These are participatory sessions and more inclusive than regular conference talks, with everyone taking part. You can help by leading the session, by giving some insights, by asking some great questions, or maybe just with your enthusiasm.\r\n\r\nThe barcamp will be coordinated and moderated by Nick Burch.\r\n\r\nRegistration starts from 2:30pm", "recording_license": "", "do_not_record": false, "persons": [{"code": "97HYST", "name": "Nick Burch", "avatar": "https://program.berlinbuzzwords.de/media/avatars/97HYST_aBOGSjj.jpg", "biography": "Nick is has been heavily involved in a number of Apache projects, such as Tika and POI, while having the fortune to know many of the people involved in the Apache Big Data and Search space! When not helping out with Apache things, Nick works as the Director of Engineering at Saible, where he leads a team making heavy use of Open Source technologies. When not helping ensure everyone gets paid, he is often to be found attending or organising BarCamps, Geek Nights, or other such fun events dedicated to sharing what's great and new!", "public_name": "Nick Burch", "guid": "02fe34a8-176c-520f-a723-b897478d00b2", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/97HYST/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/E9FJMZ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/E9FJMZ/", "attachments": []}]}}, {"index": 2, "date": "2025-06-16", "day_start": "2025-06-16T04:00:00+02:00", "day_end": "2025-06-17T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "842531aa-c50c-5a94-8c7c-f22b840edbe1", "code": "3NRGVL", "id": 68466, "logo": null, "date": "2025-06-16T09:30:00+02:00", "start": "09:30", "duration": "00:05", "room": "Kesselhaus", "slug": "bbuzz25-68466-opening-session", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3NRGVL/", "title": "Opening Session", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Join us as we kick off Berlin Buzzwords 2025!", "description": "", "recording_license": "", "do_not_record": false, "persons": [{"code": "LWMKUK", "name": "Berlin Buzzwords Team", "avatar": null, "biography": null, "public_name": "Berlin Buzzwords Team", "guid": "be54fbab-5192-5dce-8c8d-f50df81e263c", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/LWMKUK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3NRGVL/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3NRGVL/", "attachments": []}, {"guid": "ffa15d2a-39a4-5dec-a4b0-82db4e5cf0ed", "code": "JA9LZL", "id": 70692, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/JA9LZL/Entwurf_2_3_RoM5BZG.png", "date": "2025-06-16T09:35:00+02:00", "start": "09:35", "duration": "00:45", "room": "Kesselhaus", "slug": "bbuzz25-70692-unpacking-digital-sovereignty-how-to-avoid-fueling-the-nationalist-rise", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JA9LZL/", "title": "Unpacking Digital Sovereignty: How to avoid fueling the nationalist rise", "subtitle": "", "track": null, "type": "Keynote", "language": "en", "abstract": "This talk shows that digital sovereignty is prone to open the door to a nationalist agenda which favours the power concentration that led to Big Tech, and it easily slips into alt-right narratives that put colonising space over the needs of most our planet's population", "description": "With the new ruling coalition of Trump, Musk and Big Tech, European digital sovereignty seems to be the widely accepted solution to break free from US companies that are used to threaten European governments. The EU is reviving industrial policy and is willing to invest a lot of money into digital sovereignty - but which problems exactly should it address, and what should be part of the solution? This talk shows that digital sovereignty is prone to open the door to a nationalist agenda which favours the power concentration that led to Big Tech, and it easily slips into alt-right narratives that put colonising space over the needs of most our planet's population. After an overview of current approaches to digital sovereignty and the role of open-source within those, I will discuss what is needed to reclaim digital sovereignty to defend and strengthen democratic practice both in and through technologies.", "recording_license": "", "do_not_record": false, "persons": [{"code": "CDNUSX", "name": "Aline Blankertz", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CDNUSX_1e0VDVa.jpg", "biography": "Aline is an applied economist with a special interest in the data economy, competition policy and platform regulation. She currently works as the Tech Economy Lead at the anti-monopoly organisation Rebalance Now and has co-founded the digital policy collective Structural Integrity. She has been involved in digital and data policy for various years, at Wikimedia Germany, the think tank interface and an economic consultancy, among others.", "public_name": "Aline Blankertz", "guid": "86df46b1-0b1e-5dde-8ebb-b91a5848ca6e", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/CDNUSX/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JA9LZL/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JA9LZL/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/JA9LZL/resources/Aline_Blankertz_Bu_00WoKmY.pdf", "type": "related"}]}, {"guid": "820c8c9a-7286-5e6c-b33e-13e9aa65bb2d", "code": "AWJYJH", "id": 61401, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/AWJYJH/Entwurf_1_52_2yyDZne.png", "date": "2025-06-16T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz25-61401-which-gpu-for-local-llms", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/AWJYJH/", "title": "Which GPU for Local LLMs?", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "You\u2019re using local LLMs. For example, to power RAG. You want to deploy them in production, but you don\u2019t know where: which type of GPU? How large should it be? Should you use a larger model but quantize more aggressively?\r\n\r\nOur benchmark results and their interpretation will give you some answers.", "description": "It\u2019s easy to offload the LLM - in solutions such as RAG - to external services like OpenAI. This is great for PoCs, but if you have a lot of requests, a local LLM makes more sense from both a cost and a latency point of view. Especially in the context of RAG, where the query itself adds latency and the context to be shifted can be significant.\r\n\r\nFor this session, we\u2019ll use llama.cpp -  which supports inference on many models for many platforms - and benchmark some LLMs on various GPUs. We\u2019ll focus on cost, throughput (tokens/s), and memory usage when presenting results. Memory usage is the same for the same model, but we\u2019ll explore quantization and how it influences throughput, especially since we can fit a larger context. A larger context means we can process more queries in parallel.\r\n\r\nParticipants will get a better sense of how to deploy their RAG/LLM in production from a hardware, model, and quantization perspective.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3CMEKA", "name": "Radu Gheorghe", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3CMEKA_jCJOh5W.jpg", "biography": "Radu has been in the search space for many years, mainly on Elasticsearch, Solr, OpenSearch, and, more recently, Vespa.ai. Helps users with both the relevance and the operations side of retrieval. Enjoys education in all its forms (training, blog posts, books, conferences...) and got the chance to be involved in all of them.", "public_name": "Radu Gheorghe", "guid": "e0bb8b22-5b87-5930-bd1b-c992f726ce16", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/3CMEKA/"}, {"code": "ADKESR", "name": "Rafa\u0142 Ku\u0107", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ADKESR_YlS8Z16.png", "biography": "Software engineer, trainer, consultant and author from time to time - some would say that he is an all in one battle weapon concentrated on information retrieval, performance and user search experience. However he also likes all the other cool stuff that is happening in the IT world. Likes to share his knowledge by giving talks at various meet ups and conferences.", "public_name": "Rafa\u0142 Ku\u0107", "guid": "1eb7cc2c-b6ba-5277-9561-a98d6395be51", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/ADKESR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/AWJYJH/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/AWJYJH/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/AWJYJH/resources/Radu_Gheorge__Ra_om7qJvg.pdf", "type": "related"}]}, {"guid": "3c221b08-cc8a-533d-baf2-8f6e34707cec", "code": "TDNGRG", "id": 65576, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/TDNGRG/Entwurf_1_9_l2rd4LS.png", "date": "2025-06-16T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-65576-shipping-lucene-10-0-25-years-in-the-making", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/TDNGRG/", "title": "Shipping Lucene 10.0, 25 years in the making", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "The fascinating journey towards releasing version 10.0 of the popular java search engine Apache Lucene. An inspiring and challenging venture seen through the eyes of its release manager, made possible by the vibrant Lucene community, culminated in deploying the new major to production in record times.", "description": "Preparation and a real team effort: that\u2019s what it takes. Releasing a major is an involved process, especially when it comes to a 25 years old project, with such a wide and diverse user base as Lucene.\r\nThis talk will cover the purpose of shipping a new major version, the implications and benefits that derive from it for Lucene users, as well as specifics of the 10.0 release process.\r\nWe will go through the ups and downs of the release manager as well as the team effort that it took to pull it off: bugs and performance regressions were uncovered in the process. Four release candidates were built along the way.\r\nWe will expand on how the team performed thorough testing and benchmarking, which contributed to the success of the release, culminating in the deployment of Lucene 10.0 to production in record times.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EQTJJB", "name": "Luca Cavanna", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EQTJJB_6tnKONE.png", "biography": "Luca Cavanna is an Apache Lucene committer / PMC member, and principal engineer at Elastic. At Elastic he operates as technical lead of the Elasticsearch Search Foundations team. In Lucene, his main focus is on search concurrency, as well as fixing all the things and shipping releases.", "public_name": "Luca Cavanna", "guid": "5d98d9b4-1595-5e47-81f2-ce666962f10e", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/EQTJJB/"}, {"code": "GJHNF3", "name": "Adrien Grand", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GJHNF3_f5BYUUx.jpg", "biography": "Adrien has been a committer on the Apache Lucene project since 2012, with a focus on ease of use, search efficiency and storage efficiency.", "public_name": "Adrien Grand", "guid": "df19ff59-2390-5312-a85e-571cf9b079ca", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/GJHNF3/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/TDNGRG/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/TDNGRG/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/TDNGRG/resources/Luca_Cavanna__Adr_KdpTrFV.pdf", "type": "related"}]}, {"guid": "73c8427a-bc13-53ee-9e98-bd059fddd040", "code": "YJHRK8", "id": 65389, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/YJHRK8/Entwurf_1_7_jeHQSwb.png", "date": "2025-06-16T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-65389-building-a-knowledge-graph-for-climate-policy", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YJHRK8/", "title": "Building a knowledge graph for climate policy", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "At Climate Policy Radar, we're building an open-source knowledge graph for climate policy. In this talk, we'll share how we combine in-house expertise with scalable data infrastructure to identify key concepts in thousands of global climate policy documents. We'll also touch on ontology design, equitable evaluation, and the climate impacts of AI.", "description": "We'll take you on a technical deep-dive into how we've built and scaled a knowledge graph which maps the relationships between thousands of climate policy concepts, and identifies where those concepts appear in our corpus of climate policy and other climate-relevant documents.\r\n\r\nWe'll share the high-level methodology, infrastructure decisions, and evaluation framework which have allowed our small team to process millions of passages of text while maintaining high standards for fairness and accuracy.\r\n\r\nAfter covering the basics of what a knowledge graph is, and why you might want to build one, we'll cover:\r\n\r\n1. **Knowledge Graph Architecture & Methodology**\r\n   - An ontology which can handle the complexity of the climate policy domain\r\n   - Interoperability considerations with existing sub-domain taxonomies\r\n   - Why we're building in the open with Wikibase\r\n   - The value of real human expertise\r\n\r\n2. **Classifier Development & Evaluation**\r\n   - A common model for classifiers, which can encompass a range of architectures from straightforward regexes, to fine-tuned BERT-based models, to optimised calls to third-party LLMs\r\n   - Sampling strategies for building representative evaluation datasets\r\n   - Quantitative metrics vs qualitative vibe-checks for classifier selection\r\n\r\n3. **Production Infrastructure & Scaling**\r\n   - A modular pipeline design separating model management, inference, and indexing\r\n   - Prefect-based orchestration for distributed inference\r\n   - Infrastructure as code with Pulumi\r\n   - Planned integration with our existing search and RAG systems\r\n\r\nThe audience should leave the talk with a clear understanding of:\r\n\r\n- Practical considerations when building domain-specific, high-impact knowledge graphs\r\n- Methods for evaluating NLP classifier performance in technical domains\r\n- Approaches to scaling inference pipelines, from local experimentation to routine cloud-based deployments\r\n- How we plan to use our knowledge graph to power a climate policy research platform, including integrations with RAG and other LLM-driven systems\r\n\r\nThis talk should be particularly stimulating for data scientists and engineers working on information retrieval systems, knowledge graphs, or other high-impact natural language processing systems.", "recording_license": "", "do_not_record": false, "persons": [{"code": "VSLDWP", "name": "Harrison Pim", "avatar": "https://program.berlinbuzzwords.de/media/avatars/VSLDWP_SOA9FCf.jpg", "biography": "I'm a data scientist / machine learning engineer with a background in computational / quantum physics. I write loads of python and typescript, and a little bit of everything else.\r\n\r\nI like working on hard R&D problems involving computer vision, natural language processing, graph theory, representation learning, recommendation systems, and information retrieval.\r\n\r\nI love turning those research projects into end-to-end pipelines and services which help people in the real world.", "public_name": "Harrison Pim", "guid": "a37f81d5-0c72-5347-a0c1-6fe45d7ac832", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/VSLDWP/"}, {"code": "PRTTM8", "name": "Fred O'Loughlin", "avatar": "https://program.berlinbuzzwords.de/media/avatars/PRTTM8_HtRhXnP.jpg", "biography": "Senior MLOps Engineer & Tech Lead for the Platform team at Climate Policy Radar", "public_name": "Fred O'Loughlin", "guid": "4fe55b33-8acc-5976-a8b0-63dabeedd713", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/PRTTM8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YJHRK8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YJHRK8/", "attachments": []}, {"guid": "d40cd877-de06-582b-826b-13f1a992ed61", "code": "8ZGVVT", "id": 64984, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/8ZGVVT/Entwurf_1_50_ESg6IOv.png", "date": "2025-06-16T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-64984-accelerating-questdb-lessons-from-a-6x-performance-boost", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/8ZGVVT/", "title": "Accelerating QuestDB: Lessons from a 6x Performance Boost", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this talk, I share our journey in making QuestDB, an Apache 2.0-licensed open-source time-series database, a significantly faster analytical database. In this session, I'll walk through how we identified opportunities for improvement, the key changes we implemented, and how those changes delivered dramatic performance improvements.", "description": "In this talk, I share our journey in making QuestDB, an Apache 2.0-licensed open-source time-series database, a significantly faster analytical database. Over the course of just one year, we achieved query performance gains of up to 6x by implementing specialised data structures, SIMD-based optimisations, scalable aggregation algorithms, and parallel execution pipelines.\r\n\r\nQuestDB is designed for high-performance ingestion\u2014processing millions of rows per second\u2014and efficient queries over billions of rows. While it excelled in time-based queries, we found that certain generic analytical queries were slower than expected. In this session, I'll walk through how we identified opportunities for improvement, the key changes we implemented, and how those changes delivered dramatic performance improvements in a relatively short timeframe.\r\n\r\nI\u2019ll demonstrate before-and-after queries to showcase the impact of these optimisations. All the code is freely available in QuestDB's GitHub repository for anyone to explore or contribute to.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KZHVVS", "name": "Javier Ramirez", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KZHVVS_7JwZiyR.jpeg", "biography": "Developer Advocate at QuestDB and all around happy person. Fan of Open Source, Tech Communities, Data, and ML. He/him", "public_name": "Javier Ramirez", "guid": "12d90169-ae2e-5f44-8a99-b8c48dc6323b", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/KZHVVS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/8ZGVVT/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/8ZGVVT/", "attachments": []}, {"guid": "b68f7ff8-760c-5efb-ba7e-34bb4736b5a2", "code": "CU8ZPP", "id": 65564, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/CU8ZPP/Entwurf_1_41_i8wpbU7.png", "date": "2025-06-16T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz25-65564-self-hosting-ai-llms-a-beginners-guide", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CU8ZPP/", "title": "Self-hosting AI LLMs - a beginners guide", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Want to avoid cloud-hosted AIs, and run your LLMs on your own systems, but not sure where to start? Or even what everything means? Join us to see how easy it can be, and what a beginner needs to know!", "description": "There are some great cloud-hosted AI systems out there, but they aren't right for everyone. Maybe it's cost, or environmental impact. Data sovereignty, privacy or control. Whatever your reason, it can be daunting to figure out how to get started. Luckily, it doesn't have to be!\r\n\r\nWe'll guide you through the key terms to know, where to find good AI / LLM models (it isn't github!), and how to run them. We'll see what influences memory and processing needs, and how fine-tuning can help. And even what that is! We'll help kick-start your journey to running models you control on your own hardware.\r\n\r\nWe'll even have some live demos of models running on a laptop!", "recording_license": "", "do_not_record": false, "persons": [{"code": "97HYST", "name": "Nick Burch", "avatar": "https://program.berlinbuzzwords.de/media/avatars/97HYST_aBOGSjj.jpg", "biography": "Nick is has been heavily involved in a number of Apache projects, such as Tika and POI, while having the fortune to know many of the people involved in the Apache Big Data and Search space! When not helping out with Apache things, Nick works as the Director of Engineering at Saible, where he leads a team making heavy use of Open Source technologies. When not helping ensure everyone gets paid, he is often to be found attending or organising BarCamps, Geek Nights, or other such fun events dedicated to sharing what's great and new!", "public_name": "Nick Burch", "guid": "02fe34a8-176c-520f-a723-b897478d00b2", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/97HYST/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CU8ZPP/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CU8ZPP/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/CU8ZPP/resources/Nick_Burch__Self-h_qTdp05O.pdf", "type": "related"}]}, {"guid": "c9d3ba88-b3f7-548b-b1b5-88a1624553cd", "code": "XFHXYP", "id": 65583, "logo": null, "date": "2025-06-16T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-65583-harnessing-ai-to-strengthen-trustworthy-information", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/XFHXYP/", "title": "Harnessing AI to strengthen trustworthy information", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "AI can (also) enhance fact-checking and news classification. We developed a platform integrating search, an intelligent assistant, and a RAG system to support reliable journalism. By leveraging diverse data and analytics, we empower everyone with insights for accuracy and transparency, fostering collaboration for trustworthy information.", "description": "Misinformation spreads rapidly in the digital age, making it increasingly difficult to ensure information integrity. Brandolini\u2019s Law highlights this challenge, but AI presents new opportunities to support fact-checking and news classification. In this session, we will share insights from our work on developing an AI-driven platform that integrates search capabilities, an intelligent assistant, and a Retrieval-Augmented Generation (RAG) system. By leveraging diverse data sources and advanced analytics, our approach empowers journalists and editors with tools to enhance accuracy and transparency in reporting. Attendees will gain an understanding of how AI can provide actionable insights, streamline fact-checking, and promote responsible journalism. While this technology continues to evolve, we aim to foster discussion on AI\u2019s role in strengthening critical thinking and building a more trustworthy information ecosystem.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DXAHHB", "name": "Lucian Precup", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DXAHHB_6GmOiCe.jpg", "biography": "Lucian Precup is the CTO of [all.site](https://all.site/) - the collaborative search engine developed at [Station F](http://stationf.co) in Paris. With his colleagues at [Adelean](http://adelean.com), Lucian develops solutions for indexing, searching and analyzing data. Lucian regularly shares his knowledge in specialized conferences and organizes the [Search, Data & AI Meetup](https://www.meetup.com/fr-FR/search-and-data/).", "public_name": "Lucian Precup", "guid": "15cf6ff2-c885-5e06-af41-4e027691f577", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/DXAHHB/"}, {"code": "TPVZLQ", "name": "Giovanna Monti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/TPVZLQ_rMVyPVF.jpeg", "biography": "I am a software developer with a passion for front-end. I love sharing thoughts and experiences with the tech community, and that's why I started my speaker journey in 2023.\r\nMy motto? Understand things, before you do them. And, in case of doubt, don't be afraid to ask for the millionth time!", "public_name": "Giovanna Monti", "guid": "49718403-11a1-5666-9977-6a34ebd79d95", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/TPVZLQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/XFHXYP/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/XFHXYP/", "attachments": []}, {"guid": "4fe04dab-ea3a-5159-9ebe-0f409a6cf6f6", "code": "AYHCBK", "id": 61408, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/AYHCBK/Entwurf_1_53_e2OLdPe.png", "date": "2025-06-16T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-61408-precision-farming-powered-by-k3s-and-tensorrt", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/AYHCBK/", "title": "Precision farming powered by K3s and TensorRT", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Aurea Imaging is an AgTech scaleup focusing on precision farming in apple orchards. We've build the Treescout, and edge device on top of a tractor which unlocks the potential of each tree. We used a innovative technology stack to meet the requirements of an outdoor rural setup.  Our journey was full of failures, learnings and ongoing challenges", "description": "Aurea Imaging is an AgTech scaleup focusing on precision farming in orchards. By retrofitting our TreeScout sensor package to their tractors, farmers are able to collect data about their orchard down to the tree level as they perform other tasks. Using onboard stereo cameras, the tractor\u2019s high-precision GPS, a machine-vision pipeline running in real time on the device, and cloud-based analytics, this data is turned into maps used by other agricultural machines that enable the grower to utilize less labour and chemical products to produce more food.\r\n\r\nTo run the edge part of this process, we opted against a traditional embedded/edge architecture based on Robot Operating System (ROS) and C/C++, and instead chose to build Python microservices orchestrated with K3s. This has brought the usual benefits of cloud-native tooling, but building cloud-native software for a far-edge, occasionally-airgapped application in an ecosystem based on decades-old standards comes with a myriad of challenges not faced in traditional cloud environments. Overall, Kubernetes on edge has brought us a very high development velocity and a reliable, maintainable codebase, and we hope to both explore the challenges this brings as well as inspire others to try this approach for their next edge project. On top of this we are running object detection models in tensorRT which are able to detect tree specifics at a driving speed of 8km/hr.", "recording_license": "", "do_not_record": false, "persons": [{"code": "G9YNB9", "name": "Wieneke Keller", "avatar": "https://program.berlinbuzzwords.de/media/avatars/G9YNB9_yuP56t8.png", "biography": "tba", "public_name": "Wieneke Keller", "guid": "37aedc6f-3561-5493-901f-8e2bce75c751", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/G9YNB9/"}, {"code": "GY3GKU", "name": "Sebastian Lenartowicz", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GY3GKU_dVKapuR.jpg", "biography": "Right often enough that it's probably not coincidence.", "public_name": "Sebastian Lenartowicz", "guid": "038a1d10-a10c-5264-83e0-d699cf71b453", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/GY3GKU/"}], "links": [{"title": "link to the the details of the Treescout", "url": "https://aureaimaging.com/treescout/", "type": "related"}], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/AYHCBK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/AYHCBK/", "attachments": []}, {"guid": "926a0d21-bc81-54bb-894e-00ef75dc9bb4", "code": "7NBQPB", "id": 65295, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/7NBQPB/Entwurf_1_57_ucliWR2.png", "date": "2025-06-16T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-65295-end-to-end-semantic-search-with-apache-solr-9-8-llm-module", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/7NBQPB/", "title": "End-to-End Semantic Search with Apache Solr 9.8 LLM Module", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Apache Solr 9.8 introduces the LLM module opening the doors of end-to-end natural language query support through vector-backed semantic search (K Nearest Neighbors).\r\n\u2028This talk explores the open source contribution from both the indexing and query angles and what\u2019s coming next for Solr in terms of integrations with Large Language Models.", "description": "Dense vector search was introduced in Apache Solr 9.0 in 2022 and since then it has received substantial adoption from the community.\r\nText vectorisation had to happen outside Solr, as there was no support to encode text to vector within the search engine transparently.\r\nApache Solr 9.8 changes this, introducing a module that allows interaction with well-known large language model providers such as OpenAI, Cohere, HuggingFace and Mistral AI via the open-source library \u2028LangChain4j.\r\nExpect to learn how to configure Solr to access external text vectorisation services and use them to encode and run your queries through the 'knn_text_to_vector' query parser and vectorise your documents\u2019 textual fields through the 'Text To Vector Update Request Processor'.\u2028\r\nThis is a foundational enabler that speeds up the design and development of end-to-end semantic search solutions.\u2028\r\nThe talk wraps up with future directions and how the introduction of the LLM module opens the doors for exciting new integrations.\u2028\r\nJoin us as we dive into the AI future of Apache Solr!", "recording_license": "", "do_not_record": false, "persons": [{"code": "GJ3PTP", "name": "Alessandro Benedetti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GJ3PTP_rgB6tjJ.jpg", "biography": "Alessandro Benedetti is an Apache Lucene/Solr committer and Solr PMC member, Director at Sease Ltd.\r\nHe believes in Open Source as\u00a0a way to build a bridge between Academia and Industry\u00a0and facilitate the progress of applied research.\u2028\r\nAlessandro is a passionate R&D software engineer, continuously applying the latest trends in Information Retrieval and AI to solve search problems.\u2028He\u2019s been working on Learning To Rank for years and more recently he\u2019s been exploring Generative AI techs like Large Language Models and Retrieval Augmented Generation.\u2028\r\nWhen he isn't on clients' projects, he contributes to the open-source community and presents at meet-ups and conferences such as ECIR, Search Solutions, Community Over Code, Haystack and Berlin Buzzwords.", "public_name": "Alessandro Benedetti", "guid": "bd8c60c2-a21e-5832-978a-2ca73e1cddd0", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/GJ3PTP/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/7NBQPB/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/7NBQPB/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/7NBQPB/resources/Alessandro_Benedet_4bIPj5A.pdf", "type": "related"}]}, {"guid": "725691af-525e-56a9-9fbd-25bd777748ca", "code": "RAWGVW", "id": 60009, "logo": null, "date": "2025-06-16T18:00:00+02:00", "start": "18:00", "duration": "03:00", "room": "Kesselhaus", "slug": "bbuzz25-60009-get-together", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/RAWGVW/", "title": "Get-Together", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Join us for food and drinks at Palais Kulturbrauerei!", "description": "Berlin Buzzwords is a good place to meet great people, and while there is ample time to chat and discuss during the conference, we also want to provide an opportunity for everyone to meet up outside of the regular conference program with our Get-Together on Monday! \r\n\r\nOur Get-Together is a casual meet up in an relaxed atmosphere and provides a perfect opportunity to meet old and new friends, business contacts or get to know other participants of Berlin Buzzwords. Thanks to our partner Search Guard there will be food and drinks available: We will offer a range of vegetarian and vegan food options as well as alcoholic and non-alcoholic drinks.", "recording_license": "", "do_not_record": false, "persons": [{"code": "LWMKUK", "name": "Berlin Buzzwords Team", "avatar": null, "biography": null, "public_name": "Berlin Buzzwords Team", "guid": "be54fbab-5192-5dce-8c8d-f50df81e263c", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/LWMKUK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/RAWGVW/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/RAWGVW/", "attachments": []}], "Maschinenhaus": [{"guid": "fd105d31-1918-5d5b-9f61-56641c71844b", "code": "YR3T98", "id": 65332, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/YR3T98/Entwurf_1_10_F5lsUHa.png", "date": "2025-06-16T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz25-65332-zero-to-scale-telemetry-pipeline-with-apache-cassandra", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YR3T98/", "title": "Zero to Scale: Telemetry pipeline with Apache Cassandra", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Picture billions of messages pouring in daily from thousands of data providers around the globe, which are then processed and published to customers. How can one design a telemetry system to capture, publish, and then index essential information about the data flowing through the system to give internal teams visibility to aid in troubleshooting?", "description": "A core part of our business is to receive and then process humongous amounts of financial data from all over the globe. This pipeline scales to tens of billions of pricing messages every day, in which each message carries highly valuable information. Getting visibility into what was sent to us versus what was published to our customers is of utmost importance to enable internal teams to quickly troubleshoot issues reported by any of the data providers.\r\n\r\nBut how do we capture essential information about the data flowing through such a massive and high throughput system scaling to more than tens of thousands of processes running on close to a hundred machines, in which traffic peaks at more than a million messages per minute? In this talk, we will talk about how we built a high throughput telemetry system for streaming, storing, and searching such a high volume of data, starting from scratch using open source technologies like ZeroMQ, Apache Kafka, Kubernetes, and Apache Cassandra. You will gain valuable insights into the system\u2019s design and performance, as well as the lessons we learnt along the way. We will cover everything from schema design and load testing to incremental deployment in order to manage such high data throughput.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TA7ZNP", "name": "Shikhar Srivastava", "avatar": "https://program.berlinbuzzwords.de/media/avatars/TA7ZNP_ZB56KYZ.jpg", "biography": "Shikhar Srivastava is a Senior Software Engineer on the Real-time Contributions Engineering team at Bloomberg in London, where he designs and builds high-performance financial data systems. Shikhar is passionate about exploring innovative technologies to enhance real-time data processing. His career journey spans from developing machine learning models for ETA prediction at startups in India to architecting low-latency market data solutions at Bloomberg. Lately, he has been diving deep into Apache Cassandra, making use of its distributed database capabilities to tackle scalability challenges.", "public_name": "Shikhar Srivastava", "guid": "c6096174-3a9f-5130-ae22-a247d7089fc2", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/TA7ZNP/"}, {"code": "WSDCRK", "name": "Nomin-Erdene Oyun", "avatar": "https://program.berlinbuzzwords.de/media/avatars/WSDCRK_dZsyHRH.jpg", "biography": "Nomin-Erdene Oyun is a Senior Software Engineer on the Real-time Contributions Feeds Infrastructure Engineering team at Bloomberg in New York. With a strong interest in building impactful software solutions, she focuses on developing real-time data infrastructure and high-performance processing pipelines that drive transparency and enable data-driven decision making in the financial space. She enjoys the creative and technical journey from concept to deployment, and has been involved in bringing multiple projects to life from the ground up over the course of her career.", "public_name": "Nomin-Erdene Oyun", "guid": "11a3a1b9-c9f2-5514-bd47-ecf960714927", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/WSDCRK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YR3T98/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YR3T98/", "attachments": []}, {"guid": "41bf61a3-e6f8-50da-9c59-8feb1f6fc38e", "code": "3PXQZ8", "id": 65361, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/3PXQZ8/Entwurf_1_55_ELypWJC.png", "date": "2025-06-16T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-65361-ai-powered-search-results-navigation-with-llms-json-schema", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3PXQZ8/", "title": "AI-Powered Search Results Navigation with LLMs & JSON Schema", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Struggling to identify relevant filters among too many facets and frustrating results navigation? We explore an AI Filter Assistant for statistical data (SDMX) showing how LLMs can be leveraged to suggest the best filters for your natural language query, helping you refine the results in Apache Solr. We share wins, fails, and lessons learned.", "description": "In this talk, we explore an AI-powered Filter Assistant, designed for the Statistical Data and Metadata eXchange (SDMX) to improve User eXperience in navigating search results efficiently and effectively.\r\nWe discuss how LLMs enhance filter suggestions by analyzing both user queries and indexed data.\r\n\r\nOn the architecture side, we break down:\r\n1) Data retrieval \u2013 how we collected and processed the input SDMX data to build taxonomies used by the model to reconcile the concepts in the natural language query\r\n2) API structure \u2013 a deep dive into our endpoints, what they do, and the responses they return.\r\n3) Model choice \u2013 the process of identifying the best LLM for the task, including our motivations and studies\r\n4) Structured output & JSON Schema \u2013 key benefits, limitations, and lessons learned from extensive testing. We showcase different test results and insights on what works best.\r\n5) Solr query optimization \u2013 how to integrate the assistant\u2019s output into a search query, using different boolean strategies to handle the refinement of both too-many and zero-result scenarios.\r\n\r\nExpect real-world insights, practical takeaways, and a discussion on the future of AI-driven filtering!", "recording_license": "", "do_not_record": false, "persons": [{"code": "MJTHGB", "name": "Anna Ruggero", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MJTHGB_JgshkqQ.jpg", "biography": "Hi!\r\nI\u2019m Anna Ruggero, an IT consultant in the information retrieval world.\r\nI support clients in the process of improving their search engines with the implementation of innovative personalized solutions. I specialize in the integration of machine learning techniques with information retrieval systems, from Learning-to-Rank techniques to Neural Searches and Recommender Systems. \r\nI extensively worked on e-commerce websites, improving their performance by developing personalized models and evaluation systems. \r\n\r\nI highly believe in innovation and research, keeping up-to-date with the latest academic studies and contributing to them. I participated in the European Conference of Information Retrieval 2022 with a poster on offline and online evaluation in the industry and published a paper on improving interleaving techniques for the evaluation of information retrieval systems at the ECIR 2023.\r\n\r\nI can't wait to talk about search with you!", "public_name": "Anna Ruggero", "guid": "53fa5d8d-d62e-5ef7-ab92-95248b0ef81d", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/MJTHGB/"}, {"code": "L8WESZ", "name": "Edward Lambe", "avatar": "https://program.berlinbuzzwords.de/media/avatars/L8WESZ_cjcp00k.jpg", "biography": "Edward Lambe is the Head of the MED Data Engineering team and the Deputy Head of MED IT at the Bank for International Settlements (BIS). Since joining the BIS in 2016, Edward has overseen the implementation of several key projects within the IT unit of the Monetary and Economic Department. Notably, he led the delivery of the BIS Data Portal, a core initiative of the BIS 2025 Innovation programme aimed at modernising the dissemination of BIS statistics. Prior to his tenure at the BIS, Edward held various statistical and IT roles at the Central Statistics Office, Ireland, and the Bank of Ireland. He holds a master\u2019s degree from the Cork Institute of Technology and a bachelor\u2019s degree from the National University of Ireland, Cork.", "public_name": "Edward Lambe", "guid": "a4e45b76-0b0d-5489-93a1-f5a630891b95", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/L8WESZ/"}, {"code": "CBFBFW", "name": "Ilaria Petreti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CBFBFW_HwmcWsS.jpg", "biography": "Ilaria is a Data Scientist with a background in Machine Learning and Natural Language Processing for Information Retrieval systems. Since joining the Sease team in 2020, she has worked on various projects, focusing on integrating Learning To Rank and Search Quality Evaluation in e-commerce ecosystems. More recently, she has been exploring the potential of Vector Search and Large Language Models in Search, leveraging these technologies to enhance retrieval strategies and improve result relevance.\r\n\r\nBeyond her work, she is an active information retrieval research community member, regularly sharing her insights through blog posts, contributing to open-source projects, and speaking at international conferences such as Berlin Buzzword and ElasticON.", "public_name": "Ilaria Petreti", "guid": "9495bfbd-3bd9-5666-a0d4-98c3a515b555", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/CBFBFW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3PXQZ8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3PXQZ8/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/3PXQZ8/resources/Anna_Ruggero_et_al_YRSrydz.pdf", "type": "related"}]}, {"guid": "73c56cc2-f258-551d-bb07-5c11f135abaa", "code": "HS8PFX", "id": 64930, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/HS8PFX/Entwurf_1_28_skuftIR.png", "date": "2025-06-16T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-64930-airflow-3-the-new-beginning", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HS8PFX/", "title": "Airflow 3  - the new beginning", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "If you were living under the rock and have not heard that Airlfow 3 is out, and solves most of the pain points that you had with Airflow 2,  this talk is for you. You will learn how you can boost your Data Engineering and AI/ML workflows (without having to rewrite your DAGs) with what Airlfow 3 community worked for last 12 months.", "description": "Airflow 3 is out. Spring last year Airlfow community came to conclusion that in order to respond to a number of users of ours, we have to again reinvent ourselves and release Airflow 3. Berlin Buzzwords is at the right time and the right place to talk about it, as at the time of Buzzwords, Airflow 3 will already be  out for a while and we will know not only what we planned, but how our users already use Airflow 3 and what benefit it brings for them.\r\n\r\nIn this talk you will learn from Jarek, one of the top maintainers of Airflow all about you want to know about Airflow 3:\r\n\r\n* why we decided to change to Airflow 3\r\n* what are the architectural changes and improvements that lay foundation under Airflow 3 being modern and applicable to more workflows\r\n* what are the features you always wanted and you can use now: Versioning, Enterprise level security isolation, better dependency management, execution isolation, dataset as first class citizen, modern react-based UI, schedulable and UI-controlled backfills, ML/AI worfklows including inference workflows, almost streaming experience, and more\r\n* what early users of Airlfow 3 say and how their workflow management improved\r\n* what is coming next (yes! we are not nearly done yet and more things are coming!)\r\n\r\nLast year, I presented \"new orchestrator in town\" - Airflow 2, but that was only a Prelude. If you want to hear the whole Symphony come to see the talk.", "recording_license": "", "do_not_record": false, "persons": [{"code": "RHFLWL", "name": "Jarek Potiuk", "avatar": "https://program.berlinbuzzwords.de/media/avatars/RHFLWL_O5nYzgM.png", "biography": "Independent Open-Source Contributor and Advisor, Committer and PMC member of Apache Airflow, Member of the Apache Software Foundation, Security Committee Member of the Apache Software Foundation. Organizer of community-focused events, speaker.\r\n\r\nJarek is an Engineer with a broad experience in many subjects - Open-Source, Cloud, Mobile, Robotics, AI, Backend, Developer Experience, Security, but he also had a lot of non-engineering experience - building a Software House from scratch, being CTO, organizing big, international community events, technical sales support, pr and marketing advisory but also looking at legal aspects of security, licensing, branding and building open-source communities are all under his belt. \r\n\r\nWith the experience in very small and very big companies and everything in-between, Jarek found his place in Open-Source world, where his internal individual-contributor drive can be used to the uttermost of the potential.", "public_name": "Jarek Potiuk", "guid": "4c9f75cf-0eab-55bf-b736-f71716d599e6", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/RHFLWL/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HS8PFX/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HS8PFX/", "attachments": []}, {"guid": "d1589bb0-08ac-573c-96e8-01ebea0dc732", "code": "PQJRNM", "id": 65409, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/PQJRNM/Entwurf_1_24_EDpDfbK.png", "date": "2025-06-16T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-65409-what-you-see-is-what-you-mean-intent-based-ecommerce-search", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/PQJRNM/", "title": "What you see is what you mean; intent based ecommerce search", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Intent based clustering is our approach to overcome some limitations of modern hybrid search systems. We show how an upfront LLM-supported in-depth query understanding can leverage steps like retrieval, clustering, validation and presentation. We address various aspects from prototype to production in a large-scale high-volume e-commerce search.", "description": "In today's e-commerce landscape, hybrid search systems represent the market standard, yet they often struggle with highly inspirational queries, lack of precision in semantic recall, and presentation of diversity. Ambiguity in search terms can lead to disorganized results or overshadowed relevant products. \r\n\r\nThis session will demonstrate how an upfront LLM-supported in depth query understanding can increase recall meaningfully without sacrificing precision. We show how LLM-driven query understanding can enhance retrieval even for highly inspirational searches and how a final alignment between retrieved products, the query intent and the query context can compensate for indexing shortcomings. Additionally, we'll explore how query intent-based clustering can visually organize results by disambiguating meanings, thus providing users with a clearer path to relevant products. \r\n\r\nOur discussion will take attendees through our journey from prototype to production-ready implementation, sharing insights and challenges encountered in large-scale e-commerce environments with high query volume. \r\n\r\nIntent-based clustering represents a novel approach in e-commerce, reducing the \"paradox of choice\" and aiding customers in navigating product diversity. We'll explore various presentation forms for query intent-based clusters and share findings from UX tests that evaluated these approaches. \r\n\r\nJoin us to gain a comprehensive understanding of how these advanced techniques can transform search experiences and drive better outcomes in e-commerce settings.", "recording_license": "", "do_not_record": false, "persons": [{"code": "XRQLDW", "name": "Dennis Berger", "avatar": "https://program.berlinbuzzwords.de/media/avatars/XRQLDW_blUdxyp.jpg", "biography": "Dennis Berger is a search and software engineer working at Otto. He designs and develops robust search backends using Apache Solr, Rust, and Java. Additionally, he explores ways to integrate emerging technologies with established systems. His work includes crafting microservice-based architectures and finding innovative ways to solve problems. Recognized for his technical precision, Dennis continues to push the boundaries of search with modern technologies.", "public_name": "Dennis Berger", "guid": "4366bd5d-8e86-559e-a0c4-6ae290d907f7", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/XRQLDW/"}, {"code": "MGHVFG", "name": "Marco Petris", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MGHVFG_FJlqJDk.jpg", "biography": "I'm a Senior Software Developer and I'm currently working on AI driven search at OTTO.", "public_name": "Marco Petris", "guid": "f0a596cb-c250-507f-acab-588013d3da0c", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/MGHVFG/"}, {"code": "JTAWXG", "name": "Volker Carlguth", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JTAWXG_mF0Wwfj.png", "biography": "Volker Carlguth has been working in the field of retail search engines for 20 years, taking on various roles as a software developer, consultant, and now as a product manager. He is passionate about understanding user intent and is particularly interested in applying emerging AI methods to this area.", "public_name": "Volker Carlguth", "guid": "f8669011-16e1-5d47-a38f-182fcd24d130", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/JTAWXG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/PQJRNM/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/PQJRNM/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/PQJRNM/resources/Berger_et_al__What_qNv0r97.pdf", "type": "related"}]}, {"guid": "96a89d72-3f0f-5586-a666-1e044b3d86d9", "code": "WLNLKY", "id": 65399, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/WLNLKY/Entwurf_1_1_TEzi9gQ.png", "date": "2025-06-16T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz25-65399-performance-tuning-apache-solr-for-dense-vectors", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WLNLKY/", "title": "Performance Tuning Apache Solr for Dense Vectors", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "While powerful, dense vector search is not a plug-and-play feature that will scale straight out-of-the-box, particularly when it comes to extracting the maximum performance from limited compute resources. Come learn how we tuned dense vector indexes for our 100M+ document dataset, and drastically sped up our queries.", "description": "With the recent boom in AI, many organizations are in the process of building semantic search stacks from scratch powered by Apache Solr and dense vectors. What many quickly learn when dealing with dense vectors is just how heavy the compute requirements are for vector search compared with lexical search. If not well-tuned, vector search query latency can quickly skyrocket, even with an otherwise reasonably sized dataset.\r\n\r\nWe experienced this pain firsthand when we started vectorizing a 100M+ document dataset. While one can certainly approach this problem head-on by throwing hardware resources at it, this is neither a cheap nor fully-effective solution.\r\n\r\nThis talk will cover a brief introduction to how Apache Solr/Lucene builds dense vector indexes, the journey of how we optimized our dense vector setup, as well as highlight the pitfalls/best practices we learned.\r\n\r\nWhether you\u2019re a company building out full RAG pipelines or an enthusiast playing around with a novel alternative to standard lexical search, you\u2019re going to want to squeeze the most performance out of your limited compute resources. Let us help you hit the ground running.", "recording_license": "", "do_not_record": false, "persons": [{"code": "MJPM3B", "name": "Kevin Liang", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MJPM3B_2V8HySa.jpg", "biography": "Kevin Liang is a software engineer on Bloomberg's Search Infrastructure engineering team in New York. As part of a team that offers search-as-a-managed-service, he works closely with Apache Solr day in and day out. This includes everything from the application-level software down to the bare-metal hardware. Recently, his work has focused increasingly on support for dense vector search, but has also covered a variety of subjects, including automation, backups, and major version upgrades.", "public_name": "Kevin Liang", "guid": "12adb108-d841-5ecf-ab46-0f4fd740ac58", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/MJPM3B/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WLNLKY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WLNLKY/", "attachments": []}, {"guid": "36ea3a81-2504-53d2-9d81-4633d75c56bb", "code": "DXNPMT", "id": 64873, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/DXNPMT/Entwurf_1_44_A5n8liv.png", "date": "2025-06-16T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-64873-anatomy-of-table-level-locks-in-postgresql", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DXNPMT/", "title": "Anatomy of Table-Level Locks in PostgreSQL", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talks explains locking mechanisms (MVCC, lock queue) in PostgreSQL, focusing on table-level locks that are acquired by Data Definition Language(DDL) operations. If not managed well, schema changes can result in downtime. Not all operations require the same level of locking, and PostgreSQL offers tools and techniques to minimize locking impact.", "description": "In PostgreSQL, managing schema changes without downtime can be a challenging task. Table-level locks, which control access during Data Definition Language (DDL) operations like ALTER or DROP TABLE, can result in unintended application slowdowns or even service interruptions when not fully understood. This talk will provide a comprehensive dive into table-level locking and lock queueing in PostgreSQL, helping attendees gain the insights they need to perform efficient schema changes.\r\n\r\nWe\u2019ll start by explaining the various types of table-level locks in PostgreSQL such as Access Share, Exclusive, and Access Exclusive and how they are automatically managed during common DDL operations. Then, we\u2019ll break down lock queuing: how PostgreSQL organizes lock requests, what happens when transactions wait for locks, and how deadlocks can arise in complex environments.\r\n\r\nNext, we\u2019ll focus on practical approaches to managing table-level locks for near-zero downtime. Attendees will learn techniques to minimize locking impact, including understanding lock conflicts, using online schema migration patterns, and identifying lock-heavy queries. We\u2019ll introduce open-source tools like pgroll, which utilizes the expand/contract pattern to make schema changes in small, lock-free steps.\r\n\r\nBy the end of this session, attendees will be equipped with practical strategies and knowledge to control lock behavior during schema changes, ensuring data integrity and reducing operational disruptions. This talk will provide the tools needed to manage PostgreSQL schema changes with confidence and minimal impact on production environments.", "recording_license": "", "do_not_record": false, "persons": [{"code": "B8AKHE", "name": "G\u00fcl\u00e7in Y\u0131ld\u0131r\u0131m Jelinek", "avatar": "https://program.berlinbuzzwords.de/media/avatars/B8AKHE_6MQo9xm.JPG", "biography": "G\u00fcl\u00e7in started working with Postgres at a startup company in 2012 and was amazed at how powerful Postgres truly is! Over the years, she has actively contributed to the PostgreSQL community by organizing conferences, delivering talks, and engaging as a dedicated community member. In recognition of her commitment, G\u00fcl\u00e7in was elected to the PostgreSQL Europe Board in 2017.\r\n\r\nFueled by her passion for PostgreSQL automation and cloud technologies, G\u00fcl\u00e7in took on the role of Cloud Services Manager and led the cloud development efforts at 2ndQuadrant, which was later acquired by EDB in 2020. Committed to fostering diversity and inclusion, she is an integral part of Postgres Women, advocating for increased representation of women in technical communities.\r\n\r\nCurrently, G\u00fcl\u00e7in is a Staff Database Engineer at Xata, where she continues to explore her interests in PostgreSQL. In addition to her engineering work, she is one of the co-founders of Kadin Yazilimci (Women Developers of Turkey) and has led the core team for more than 10 years. In 2023, she launched Diva: Dive into AI as a Kadin Yazilimci initiative and has been part of the organizing team since.\r\n\r\nShe is now recognized as a PostgreSQL Contributor by the Postgres project. Being part of PostgreSQL Europe Diversity Committee, she looks forward to serving the community and contributing to the project's longevity and health. G\u00fcl\u00e7in lives in Prague and is the co-founder and organizer of the monthly Prague PostgreSQL Meetup.", "public_name": "G\u00fcl\u00e7in Y\u0131ld\u0131r\u0131m Jelinek", "guid": "1f84d441-7ade-5564-a7b9-8d27386e5dc3", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/B8AKHE/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DXNPMT/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DXNPMT/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/DXNPMT/resources/Gulcin_Yildirim_Je_4tLeOKO.pdf", "type": "related"}]}, {"guid": "7f5653c1-e116-50ba-b55d-24a88b8e28eb", "code": "U3GWMN", "id": 58330, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/U3GWMN/Entwurf_1_11_RaqHexO.png", "date": "2025-06-16T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-58330-how-i-sidestepped-being-glue", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/U3GWMN/", "title": "How I Sidestepped \u2018Being Glue\u2019", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "We all do things in our day to day work that are deemed \u2018non-promotable\u2019 - these are tasks that are crucial for project success but might not get you promoted. This is commonly known as glue work, a term coined by Tanya Reilly. Being glue doesn\u2019t mean an end to your career, and it isn\u2019t something that you can\u2019t recover from.", "description": "Abstract:\r\nWe all do things in our day to day work that are deemed \u2018non-promotable\u2019 - these are tasks that are crucial for project success but might not get you promoted. This is commonly known as glue work, a term coined by Tanya Reilly. Your first instinct about this could be to drop these tasks immediately, but this might not be the best approach. Being glue doesn\u2019t mean an end to your career, and it isn\u2019t something that you can\u2019t recover from. During this talk, I will use a personal experience to illustrate how I narrowly avoided the trap of being permanently stuck with glue work and how to salvage a situation where you might be in a similar predicament.\r\n\r\nOutline:\r\n1. Introduce the concept of glue work\r\n\r\n2. Talk about the specific work I was doing and why I started getting concerned\r\n- I didn\u2019t want to do work that wasn\u2019t deemed promotable\r\n- As a woman in engineering, I needed to be extra careful about not doing \u2018administrative\u2019 or \u2018secretarial\u2019 work\r\n\r\n3. How I dealt with it\r\n- Realizing that this could become an issue for my career progression.\r\n- Instead of being reactive and immediately stopping doing these tasks, thinking things through.\r\n- Dealing with it proactively: I immediately talked to my manager and tried understanding how they perceived this work to be.\r\n- Talked to a mentor at my company (a staff engineer) on what their take was. They helped identify a couple of things:\r\n\r\na. If the work that you\u2019re doing isn\u2019t being reflected in your performance evaluations, then that\u2019s a red flag.\r\nb. Try to think about why you\u2019re constantly doing this work? Is it because roles aren\u2019t well defined in your company? They talked about how this job was being done by the project lead in other teams. This helped me realize that the project lead role was very loosely defined within my team, which was part of the reason I was experiencing this.\r\n\r\n4. Long term fixes:\r\n- Moving forward, make sure I was as heavily invested in the implementation stages of the project (which requires more technical skills than soft skills).\r\n- Knowledge sharing and mentoring other engineers so the glue work I was doing could be done on a rotational basis within the team.\r\n\r\nKey takeaways:\r\n1. Learn to reflect on your day to day work and identify \u2018glue work\u2019.\r\n2. Instead of being reactive to \u2018glue work\u2019, learn to reflect on how you can mitigate the risks that come with glue work and stay on track with getting promoted.\r\n3. Learn how to leverage your manager, and team members to reduce the \u2018glue work\u2019 you\u2019re doing.", "recording_license": "", "do_not_record": false, "persons": [{"code": "VRVEXV", "name": "Fatima Taj", "avatar": "https://program.berlinbuzzwords.de/media/avatars/VRVEXV_4dJZM1B.jpeg", "biography": "Fatima is a Senior Software Engineer at Yelp with a deep passion for mentoring early-career tech professionals. She has successfully guided many individuals through their first steps in the tech industry, helping them overcome challenges and achieve their career goals. In addition to her mentorship, Fatima is a prominent voice in the tech community, with a substantial following on LinkedIn, where she shares actionable insights on career development and growth.\r\n\r\nAn experienced speaker, Fatima has presented at leading conferences including Developer Week 2024, the Southern California Linux Expo (Scale) 2023 and 2024, NDC Copenhagen Developer Festival 2023, Women of Silicon Roundabout 2022, cdCon+GitOpsCon 2023 (as a keynote speaker), Momentum 2024, and the Black is Tech Conference in 2022 and 2023. She has also spoken at over 80 hackathons across North America. Her sessions are renowned for their practical, hands-on advice, making her a sought-after speaker on topics related to career progression and professional growth in the tech industry.\r\n\r\nFatima holds a master's degree in Data Science from HEC Montreal and a bachelor\u2019s degree in Mathematics from the University of Waterloo, Canada.", "public_name": "Fatima Taj", "guid": "5918e4be-51b5-560c-8436-557450a3c1b3", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/VRVEXV/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/U3GWMN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/U3GWMN/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/U3GWMN/resources/Fatima_Taj__How_I__15YPq45.pdf", "type": "related"}]}, {"guid": "cadd67f0-f3ce-5ed0-9bb5-50d3ff62cde5", "code": "GLM8BQ", "id": 62561, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/GLM8BQ/Entwurf_1_25_ChXWvpk.png", "date": "2025-06-16T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-62561-melting-icebergs-direct-access-to-kafka-data-via-iceberg", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/GLM8BQ/", "title": "Melting Icebergs: Direct access to Kafka Data via Iceberg", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Data in organizations is traditionally split between operational and analytical estates. Join us for an account of our journey combining Apache Kafka and Apache Iceberg to create a solution that addresses both estates with one data source.", "description": "An organisation's data has traditionally been split between the operational estate, for daily business operations, and the analytical estate for after-the-fact analysis and reporting. The journey from one side to the other is today a long and torturous one. But does it have to be?\r\n\r\nIn the modern data stack Apache Kafka is your defacto standard operational platform and Apache Iceberg has emerged as the champion of table formats to power analytical applications. Can we leverage the best of Iceberg and Kafka to create a powerful solution greater than the sum of its parts?\r\n\r\nYes you can and we did!\r\n\r\nThis isn't a typical story of connectors, ELT, and separate data stores. We've developed an advanced projection of Kafka data in an Iceberg-compatible format, allowing direct access from warehouses and analytical tools.\r\n\r\nIn this talk, we'll cover:\r\n\r\n* How we presented Kafka data for Iceberg processors without moving or transforming data upfront\u2014no hidden ETL!\r\n* Integrating Kafka's ecosystem into Iceberg, leveraging Schema Registry, consumer groups, and more.\r\n* Meeting Iceberg's performance and cost reduction expectations while sourcing data directly from Kafka.\r\n\r\nExpect a technical deep dive into the protocols, formats, and services we used, all while staying true to our core principles:\r\n\r\n* Kafka as the single source of truth\u2014no separate stores.\r\n* Analytical processors shouldn't need Kafka-specific adjustments.\r\n* Operational performance must remain uncompromised.\r\n* Kafka's mature ecosystem features, like ACLs and quotas, should be reused, not reinvented.\r\n\r\nJoin us for a thrilling account of the highs and lows of merging two data giants and stay tuned for the surprise twist at the end!", "recording_license": "", "do_not_record": false, "persons": [{"code": "W873GY", "name": "Tom Scott", "avatar": "https://program.berlinbuzzwords.de/media/avatars/W873GY_8JcSSnX.jpeg", "biography": "Long time enthusiast of Kafka and all things data integration, Tom has more than 10yrs experience (5yrs+ Kafka) in innovative and efficient ways to store, query and move data. Currently working at Streambased, Tom is building multi tenant, on-prem and cloud Kafka services to attack common Kafka pain points and break down barriers to starting your data journey.", "public_name": "Tom Scott", "guid": "f7db3d07-e5b9-5bd3-abdb-46a8c60a96f0", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/W873GY/"}, {"code": "E8ETVJ", "name": "Roman Kolesnev", "avatar": "https://program.berlinbuzzwords.de/media/avatars/E8ETVJ_eAaQxvq.jpg", "biography": "Roman is a Principal Software Engineer at Streambased. His experience includes designing and building business critical event streaming applications and distributed systems in the financial and technology sectors.", "public_name": "Roman Kolesnev", "guid": "76810c28-4388-5248-9cdc-691ea6f7dc7b", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/E8ETVJ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/GLM8BQ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/GLM8BQ/", "attachments": []}], "Palais Atelier": [{"guid": "ea35abff-a11e-5b60-a8ac-2f66676745d4", "code": "3XUUYV", "id": 70522, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/3XUUYV/Entwurf_1_56_HEamUyl.png", "date": "2025-06-16T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz25-70522-mixture-of-encoders-a-vector-native-approach-to-search", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3XUUYV/", "title": "Mixture of Encoders: A Vector-Native Approach to Search", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Mixture of Encoders is a vector-native alternative that models both structured and unstructured data in a unified embedding space. We will introduce the method, show how it powers natural language search and real-time recommendations, and share open-source tools and benchmarks for replacing complex hybrid stacks.", "description": "Filters, hybrid search, rank fusion, re-ranking. Most retrieval systems today are stitched together from separate components, each tuned in isolation. There is no systematic way to integrate structured data into vector search. Ask anyone maintaining a mature Elasticsearch deployment with 100+ boosts and hand-written scoring logic whether they can still evaluate retrieval quality end to end and iterate quickly. The answer is almost always no.\r\n\r\nTo address this, you need models that understand both your unstructured and your structured data. That includes numeric, categorical, relational, spatial, and temporal metadata, all of which are critical for powering modern search, recommendations, and agentic retrieval systems. These signals drive both end-user precision and business impact. At M&S(Marks and Spencer), we solved this problem using a set of custom pipelines, but the process required significant development effort and lacked a unified framework. There is a better way.\r\n\r\nWe call our approach the Mixture of Encoders. It is a vector-native alternative to hybrid search that brings structure to retrieval by embedding each data type with a specialised encoder and composing them into a unified vector space. Text, images, categories, numerical features, and contextual signals all become searchable through a single query. This enables nuanced, real-time retrieval across modalities without relying on filters or post-processing stages.\r\n\r\nIn this talk, we will introduce the technique and show how it supports natural language query decomposition, dynamic modality weighting, and session-aware ranking, all within a single retrieval step. We will share how this approach has been deployed in production, powering retrieval in high-churn environments and contributing over $10M in incremental revenue through improved discovery and recommendation quality. To support adoption, we are also releasing open source datasets for benchmarking real-world information retrieval tasks, along with open source demo implementations that show how to apply the Mixture of Encoders to your own data and use cases.\r\n\r\n---\r\nThis talk is sponsored by <a href=\"https://www.superlinked.com\">Superlinked</a>.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8TWA3E", "name": "Filip Makraduli", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8TWA3E_7CofXyz.jpeg", "biography": "Filip Makraduli is a machine learning engineer and developer advocate with a strong background in AI systems, vector search, and large language models (LLMs). He holds a Master\u2019s degree in Biomedical Data Science from Imperial College London. Currently, Filip works as a founding developer relations engineer at Superlinked, where he focuses on building real-time, multi-attribute search and recommendation systems. His work emphasizes the use of multi-encoder architectures to enhance retrieval quality and reduce reliance on reranking strategies. In the past, Filip worked as a data scientist at Marks & Spencer, where he contributed to AI-driven solutions for retail. He has also held machine learning engineering roles across several UK-based startups, focusing on applied AI and product-oriented ML development. In addition to his industry work, Filip has been active in the open-source community, particularly around LLM tooling and pipelines. He has delivered various talks on practical machine learning applications, including a presentation on AI-powered music recommendation systems titled \u201cWhen music just doesn\u2019t match our vibe, can AI help?\u201d Filip is passionate about bridging the gap between cutting-edge AI research and real-world applications, particularly in the areas of personalization, search, and recommendation systems. He also has a strong interest in the business side of technology, especially how product, research, and engineering decisions align with go-to-market strategies, developer adoption, and long-term commercial value.", "public_name": "Filip Makraduli", "guid": "b174cce8-4408-57e5-9710-20d885eb5831", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/8TWA3E/"}, {"code": "9VPWF9", "name": "Ben Gutkovich", "avatar": "https://program.berlinbuzzwords.de/media/avatars/9VPWF9_s8wEwP4.jpg", "biography": "Ben is the Co-Founder & COO of Superlinked.com, a compute and data engineering framework for turning data into vector embeddings, designed for building GenAI-powered RAG, Search, Recommender, and Analytics systems, while retaining control and maximising retrieval quality.\r\n\r\nPreviously, Ben supported C-level executives at large multi-national tech and media corporations with Growth and Operations Strategy as a Manager at McKinsey & Company and led Business Development at easyCar Club (acquired by Turo). Ben holds a bachelor's degree in Computer Science and an MBA from London Business School.", "public_name": "Ben Gutkovich", "guid": "31602732-8110-53ee-8403-b214315db089", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/9VPWF9/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3XUUYV/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/3XUUYV/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/3XUUYV/resources/Filip_Makraduli___seDaZG8.pdf", "type": "related"}]}, {"guid": "6284df42-4061-5f06-9a18-63a3d51c3939", "code": "HXUTN9", "id": 65485, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/HXUTN9/Entwurf_1_30_7SHTJT0.png", "date": "2025-06-16T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-65485-quiet-on-set-building-an-on-air-sign-with-open-source-tech", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HXUTN9/", "title": "Quiet on Set: Building an On-Air Sign with Open Source Tech", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Learn how to build a custom On-Air sign using Apache Kafka\u00ae, Apache Flink\u00ae, and Apache Iceberg\u2122! See how to capture events like Zoom meetings and camera usage with Python, process data with FlinkSQL, analyze trends from your Iceberg tables, and bring it all together with a practical IoT project that easily scales out.", "description": "While many of us have adapted to work from home life, one major problem remains: finding an easy way to keep folks in your home away from your workspace when you\u2019re on an important call. Dust off your Raspberry Pi\u2013\u2013let\u2019s build a custom on-air sign with Apache Kafka\u00ae, Apache Flink\u00ae, and Apache Iceberg\u2122!\r\n\r\nWe\u2019ll begin by writing Python scripts to capture key events\u2013\u2013such as when a Zoom meeting is running and when a camera is being used\u2013\u2013and produce it into Kafka. The live data are then consumed by a Raspberry Pi script to drive the operation of a custom designed on-air sign. From there, you\u2019ll be introduced to the ins and outs of FlinkSQL for stream processing as we wrangle the data into a better format for downstream use. And, finally, we\u2019ll see Iceberg in action and learn how to use query engines to analyze meeting and recording trends.\r\n\r\nBy the end of the session, you\u2019ll be well-acquainted with this powerful trio of open source technologies and know how you could use the same scaffolding and scale out a simple, at-home project to millions of users and simultaneous events.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BQKRWQ", "name": "Danica Fine", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BQKRWQ_MHUF7uf.jpg", "biography": "Danica began her career as a software engineer in financial services and pivoted to developer relations, where she focussed primarily on open source technologies under the Apache Software Foundation umbrella such as Apache Kafka and Apache Flink. She now leads the open source advocacy efforts at Snowflake, supporting Apache Iceberg and Apache Polaris (incubating).\r\n\r\nShe can be found on X, Bluesky, and Mastodon), talking about tech, plants, and baking @TheDanicaFine.", "public_name": "Danica Fine", "guid": "6d1075a1-ba4b-5270-abc2-38f3f2b1b25d", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/BQKRWQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HXUTN9/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HXUTN9/", "attachments": []}, {"guid": "0f85893c-581f-5dff-8efa-c98b61f2c6db", "code": "LQK7AE", "id": 61593, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/LQK7AE/Entwurf_1_20_Wv4fpRz.png", "date": "2025-06-16T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-61593-performance-fault-tolerance-building-a-modern-database", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/LQK7AE/", "title": "Performance & Fault Tolerance: Building a Modern Database", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "What are some key concepts and design decisions behind modern, scalable, highly performant databases?\r\nLearn how a database delivers sub-millisecond 99 percentile latency at throughputs of millions of operations per second, at scale, and how you can use it.", "description": "What are some key concepts and design decisions behind modern, scalable, highly performant databases? \r\nScyllaDB was initially inspired by Apache Cassandra. Instead of using Java, it\u2019s written in C++, which gives it fine control over hardware and operating system resources. These design decisions, using a shard-per-core architecture and more, allow it to achieve x10 performance compared to other databases, with sub-millisecond 99-percentile latency at throughputs of millions of operations per second. \r\nIn this deep dive, you will learn about the core concept and architecture of a modern, close-to-the-hardware, distributed, scalable, fault-tolerant database and see some examples of its use. The talk covers autotuning, scalability, elasticity, high availability, and more.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TR9WGC", "name": "Guy Shtub", "avatar": "https://program.berlinbuzzwords.de/media/avatars/TR9WGC_XWjcsIZ.jpg", "biography": "Guy is experienced in creating products that people love. Previously, he co-founded two startups. Outside of the office, you can find him climbing, juggling, and generally getting off the beaten path. Guy holds a B.SC. degree in Software Engineering from Ben Gurion University.", "public_name": "Guy Shtub", "guid": "116bb60a-a8ff-5514-94fa-cb12d3d419a5", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/TR9WGC/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/LQK7AE/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/LQK7AE/", "attachments": []}, {"guid": "60de0dfc-cd46-50b5-b7d7-af7b1523d62c", "code": "SEM8GV", "id": 64372, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/SEM8GV/Entwurf_1_29_8uoUW1l.png", "date": "2025-06-16T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-64372-a-decade-of-lessons-in-open-source-licensing", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/SEM8GV/", "title": "A decade of lessons in Open Source licensing", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Drawing from experience reviewing over 1,000 open-source releases, I'll address common misconceptions, frequent compliance issues, and the evolution of policies to mitigate these challenges.. Attendees will gain practical insights to ensure smoother project releases and foster a compliant, collaborative, open-source community.", "description": "In this talk, I will provide insights into how developers and community members in the open-source community navigate legal and licensing policies. Over the past decade, I have reviewed over 1000 open-source releases for compliance with various licensing and distribution policies. I will discuss common misconceptions that open-source community members have about licensing, highlight frequent issues encountered during releases, and share how our policies and processes have evolved over time to help catch these issues. I will also outline areas that need improvement to align with emerging legislation and industry standards. This talk aims to equip developers and organisations with practical knowledge to navigate the legal landscape of open-source software more effectively, ensuring smoother project releases and fostering a more collaborative and legally compliant community.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DLGEPQ", "name": "Justin Mclean", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DLGEPQ_CiK3E9g.jpg", "biography": "Justin Mclean is a highly experienced professional with over 30 years in web application development, education, and community work, and is an active contributor to open source software. Justin is a renowned speaker at conferences worldwide and currently serves as the Community Manager at Datastrato. He mentors projects in the Apache Software Foundation and holds positions as VP of the ASF Incubator, and is an ASF board member.", "public_name": "Justin Mclean", "guid": "5c9335ec-9c24-522f-8641-9b6b22a2f149", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/DLGEPQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/SEM8GV/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/SEM8GV/", "attachments": []}, {"guid": "4100be12-67d5-54cd-880a-abb96d296309", "code": "YAX9UA", "id": 64383, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/YAX9UA/Entwurf_1_5_szDyi0t.png", "date": "2025-06-16T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz25-64383-gammaflow-denoise-classify-and-disentangle-spectral-data", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YAX9UA/", "title": "gamma_flow: Denoise, classify and disentangle spectral data!", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "gamma_flow is an open-source Python package for real-time spectral data analysis. Designed for speed and efficiency, it avoids large models, opting instead for a novel supervised dimensionality reduction approach. This enables seamless denoising, classification, and disentangling of single-label or multi-label spectra.", "description": "In many research fields, spectral measurements help to assess material properties. In this context, an area of interest for many researchers is the classification (automated labelling) of the measured spectra. Additionally, there may be a need to decompound multi-label spectra (linear combinations of different substances) and identify their constituents.\r\nAs proprietary spectral analysis software are often limited in their functionality and adaptability, a Python package was developed and will be presented in this talk. \r\n\r\ngamma_flow (Guided Analysis of Multi-label spectra by Matrix Factorization for Lightweight Operational Workflows) includes the\r\n- classification of test spectra to predict their constituents\r\n- denoising of test spectra for better recognizability\r\n- outlier detection to evaluate the model's applicability to test spectra\r\nIt is based on a dimensionality reduction model that constitutes a novel, supervised approach to non-negative matrix factorization (NMF). Hence, it exploits and adapts conventional data science methods rather than using extensive, energy-intensive models like neural networks. This results in a fast, robust and reliable automated analysis, leading to classification accuracies above 90%.", "recording_license": "", "do_not_record": false, "persons": [{"code": "VWSFUY", "name": "Viola R\u00e4dle", "avatar": "https://program.berlinbuzzwords.de/media/avatars/VWSFUY_oitxdwP.jpg", "biography": "Viola R\u00e4dle works at the interface between environmental and data science. She discovered her interest in environmental dynamics while studying physics at the University of Heidelberg. In her master's thesis, she researched groundwater systems and later deepened this topic through Bayesian data analysis. She expanded her Python skills as a junior researcher at HTWK Leipzig, where she worked on asphalt recycling and alternative methods of hydrogen production. Since 2023, she has been working as a data scientist at the Federal Environment Agency's AI Lab, where she supports authorities in the field of digitalization and data analysis. In addition to developing prototypes, where she is responsible for programming, project organization and science communication, she gives exciting and accessible lectures in the field of artificial intelligence.", "public_name": "Viola R\u00e4dle", "guid": "e32eb9fe-3d46-507b-a980-3030e521e222", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/VWSFUY/"}, {"code": "XTFH7G", "name": "Raphael Franke", "avatar": "https://program.berlinbuzzwords.de/media/avatars/XTFH7G_OyGTWYo.jpg", "biography": "Raphael Franke is a Data Scientist at the Application Lab for Artificial Intelligence and Big Data at the German Environment Agency. With an academic background in mathematical statistics and data analysis he specializes in applying AI to real-world environmental challenges. His interests lie in probabilistic time series forecasting and leveraging data-driven insights for sustainable impact.", "public_name": "Raphael Franke", "guid": "b24cd439-87c5-540e-b746-1a18d76344e1", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/XTFH7G/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YAX9UA/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YAX9UA/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/YAX9UA/resources/Viola_Radle__Raph_GHt3Jtx.pdf", "type": "related"}]}, {"guid": "7abac71e-984a-5025-a566-6cb9e51efb63", "code": "TBYJME", "id": 71750, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/TBYJME/Entwurf_2_wi0wu7g.png", "date": "2025-06-16T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-71750-from-search-to-insight-leveraging-opensearch-for-scalable-ai-driven-search-experiences", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/TBYJME/", "title": "From Search to Insight: Leveraging OpenSearch for Scalable, AI-Driven Search Experiences", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Modern applications demand search capabilities that go beyond basic text matching\u2014they need to be fast, accurate, personalized, and context-aware. This session demonstrates how OpenSearch's latest AI/ML enhancements and engine improvements enable organizations to build intelligent, scalable search experiences that meet these evolving needs.", "description": "Observability, log analytics, GenAI systems, and RAG pipelines must query massive volumes of semantic embeddings to retrieve relevant content instantly. Today\u2019s search systems often fall short in handling high-dimensional vector data and similarity search.\r\n\r\nOpenSearch 3.0 brings significant architectural improvements to address these challenges. Integrating Apache Lucene 10 and JVM 21, the platform delivers 20% faster queries than its 2.x predecessor and 10\u00d7 the throughput of 1.x versions. New features like GPU-accelerated vector indexing and concurrent segment search dramatically improve k-NN query performance while reducing operational costs.\r\n\r\nThe platform's expanded AI capabilities now include an advanced Vector Engine for approximate k-NN searches and neural sparse search for efficient text indexing. These innovations, combined with optimized embedding ingestion and query-time pruning, enable organizations to build performant, cost-effective search solutions that scale with their needs.\r\n\r\nWe'll explore practical applications of these features, demonstrating how OpenSearch 3.0 powers the next generation of AI-driven search experiences.\r\n\r\n---\r\nThis session is sponsored by <a href=\"https://opensearch.org\">OpenSearch</a>.", "recording_license": "", "do_not_record": false, "persons": [{"code": "R3NQ3P", "name": "Saurabh Singh", "avatar": "https://program.berlinbuzzwords.de/media/avatars/R3NQ3P_RMgu142.png", "biography": "Saurabh is a Software Development Manager at AWS leading the core search, release, and benchmarking areas of the OpenSearch Project. His passion lies in finding solutions for intricate challenges within large-scale distributed systems.", "public_name": "Saurabh Singh", "guid": "640465fc-b8d9-53df-af24-35f7210afd0f", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/R3NQ3P/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/TBYJME/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/TBYJME/", "attachments": []}, {"guid": "1b0938ce-f1ea-510b-b161-737febc03419", "code": "SBVMSZ", "id": 64496, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/SBVMSZ/Entwurf_1_27_Q9g9miP.png", "date": "2025-06-16T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-64496-qumat-apache-mahout-quantum-compute", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/SBVMSZ/", "title": "Qumat: Apache Mahout Quantum Compute", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this talk we present current progress on Mahout's new quantum compute layer named Qumat. We will give an overview of the project, explain why we built Qumat, and show its current state. We will present a demo of Qumat in action, and end with calls to action for researchers and engineers interested in using it and contributing to the project.", "description": "Following Mahout's core values of interoperability and providing tools for matrix arithmetic at scale, we have added a new layer (qumat) alongside our existing distributed matrix math framework (Samsara), that allows quantum researchers and developers to write code once and run it on any back-end available.\r\n\r\nAs with distributed compute systems like Spark and Flink, moving from one platform to another typically requires a complete code rewrite. This is prohibitive in most cases, but Samsara allows machine learning researchers and developers one unified interface to write code once and port instantly to another platform if it is deemed necessary.\r\n\r\nSimilarly for quantum computing, multiple vendors (IBM, GCP, and AWS to name a few) have their own libraries for accessing their cloud quantum compute services, such as qiskit, cirq, and braket. To give the same flexibility in the quantum area, qumat corrals all these libraries under one interface, allowing users to focus on building circuits and writing algorithms rather than adapting to one particular library.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8WJKRX", "name": "Andrew Musselman", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8WJKRX_FTwtyzJ.PNG", "biography": "Andrew works on data and analytics, and runs software teams for a living. He has contributed to the Apache Mahout project for over a decade and has been an ASF member for four years.", "public_name": "Andrew Musselman", "guid": "76e557bf-5183-5306-a538-fed1d031b657", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/8WJKRX/"}, {"code": "K387Y9", "name": "Trevor Grant", "avatar": "https://program.berlinbuzzwords.de/media/avatars/K387Y9_bp46TSx.jpg", "biography": "Trevor Grant is getting back into speaking at conferences after a hiatus from an otherwise prolific career that was put on pause during the pandemic. During his pause he became a father, published a book (Kubeflow for Machine Learning: From Lab to Production), had a second son, consulted for a while, went back to work for IBM Research, and became car free (this list is not ordered chronologically nor by significance).  He has been putzing around  generative AI since ~2017, and someday hopes to give a talk on his Star Trek Chat bots of 2018-2020. His primary open source interests at the moment is the `qumat` project of Apache Mahout, and the `gofannon` project of The AI Alliance", "public_name": "Trevor Grant", "guid": "0d7d580f-7cb9-5c8a-8827-c01a76ab893a", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/K387Y9/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/SBVMSZ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/SBVMSZ/", "attachments": []}, {"guid": "7df538cb-e8e9-52d4-94c5-41e50ea86d1f", "code": "ZGPRFQ", "id": 65581, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/ZGPRFQ/Entwurf_1_12_6THE0e4.png", "date": "2025-06-16T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-65581-do-what-i-mean-the-history-of-ai-and-program-synthesis", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/ZGPRFQ/", "title": "\"Do What I Mean\": The History of AI and Program Synthesis", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "We think of generating source code from a prompt as an AI-powered feature of modern IDEs, but the general problem has a rich history in research efforts and domain-specific programming systems. In this session, you'll learn about the history of program synthesis, its relationship to the history of AI, and what lessons this history has for us today.", "description": "Many programmers today rely on an AI-powered assistant in their editor, and many savvy users of language models have observed that LLMs are often better at writing code to solve a problem than at reasoning directly. However, relatively few developers know that generating correct programs from human specifications has been an active research area for over fifty years. \r\n\r\nIn this session, you'll learn about the fascinating history of this cross-disciplinary effort and see how it brings together topics from statistical machine learning, classical symbolic AI, programming language theory, program verification, and combinatorial search. We'll cover fundamental approaches, challenges, and historically-important applications; we'll also show some interesting parallels between the history of AI systems and the history of program synthesis.  We'll conclude with vital lessons from the history of program synthesis that can inform how we should build tomorrow's coding assistants \u2014 and how we can better use the ones available to us today.", "recording_license": "", "do_not_record": false, "persons": [{"code": "V7EF3R", "name": "William Benton", "avatar": "https://program.berlinbuzzwords.de/media/avatars/V7EF3R_KUX0tV5.jpg", "biography": "William Benton is passionate about making it easier for machine learning practitioners to benefit from advanced infrastructure and making it possible for organizations to manage machine learning systems. His recent roles have included defining product strategy and professional services offerings related to data science and machine learning, leading teams of data scientists and engineers, and contributing to many open source communities related to data, ML, and distributed systems. Will lives in the midwestern United States with his wife and three children and spends some of his spare time chasing light on bicycles or capturing it with cameras.", "public_name": "William Benton", "guid": "d0cb7874-0585-5c70-aab5-d818c70cc5ed", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/V7EF3R/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/ZGPRFQ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/ZGPRFQ/", "attachments": []}], "Frannz Salon": [{"guid": "6eb9d4e5-7600-5b9d-9e7a-83bcb9bb918d", "code": "H38JKG", "id": 63488, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/H38JKG/Entwurf_1_26_d4yvXlk.png", "date": "2025-06-16T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Frannz Salon", "slug": "bbuzz25-63488-going-local-first-a-primer", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/H38JKG/", "title": "Going Local-First: A Primer", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "The local-first paradigm promises transformative benefits \u2014 user-owned data, seamless offline capabilities, and instant interactions. But how do you get started? In this lightning talk, we\u2019ll cover the key concepts and show you how to begin your local-first journey.", "description": "Not every new trend in web development is destined to stick, but the local-first paradigm feels different. It\u2019s about more than offline capability \u2014 it\u2019s a shift toward user-owned data, instant interactions, and apps that work seamlessly, regardless of network conditions. This talk offers a short, approachable introduction to building local-first applications today, covering core principles, architectures, and tools.", "recording_license": "", "do_not_record": false, "persons": [{"code": "VHE7TK", "name": "Milo\u0161 Sutanovac", "avatar": "https://program.berlinbuzzwords.de/media/avatars/VHE7TK_VrO7SHL.png", "biography": "Milo\u0161 Sutanovac is a software engineer with nearly a decade of experience, including work with companies like BMW and Deutsche Telekom. He currently focuses on local-first architectures and building scalable, resilient applications. With a background in education, Milo\u0161 has mentored hundreds of students and enjoys sharing his knowledge through talks and workshops.", "public_name": "Milo\u0161 Sutanovac", "guid": "c0e3719f-2cd9-5399-a561-3f9fe5176b92", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/VHE7TK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/H38JKG/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/H38JKG/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/H38JKG/resources/Milos_Sutanovac__g_ytLlsps.pdf", "type": "related"}]}, {"guid": "4bded1f7-b84c-558a-96d5-cb5d2c373863", "code": "EGQNC8", "id": 59030, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/EGQNC8/Entwurf_1_46_ZYio5D7.png", "date": "2025-06-16T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-59030-the-dark-secrets-of-stream-processing", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EGQNC8/", "title": "The Dark Secrets of Stream Processing", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Stream processing systems promise fresh results, strong consistency, and S3-based cost savings. But pitfalls exist:\r\n\r\n* Backfilling takes too long due to incremental state maintenance.\r\n* Consistency causes system stalls during joins.\r\n* S3 costs spike with cache misses.\r\n\r\nThis talk explores these issues, mitigations, and hard truths.", "description": "Stream processing systems seem magical: they deliver much fresher results compared to batch processing, promise the highest levels of consistency, and leverage S3 to reduce state storage costs.\r\n\r\nBut is it too good to be true? In the world of data systems, there\u2019s no such thing as a free lunch. Every benefit comes with trade-offs.\r\n\r\nHere are three pitfalls that new stream processing practitioners often overlook:\r\n\r\n* Backfilling Takes Forever\r\nStream processing systems continuously maintain internal states to enable incremental computation. However, this comes at a cost: bootstrapping a streaming job\u2014or creating a materialized view in the database context\u2014can take an arbitrarily long time. The larger the historical data or the more complex the processing, the worse this problem becomes.\r\n\r\n* Consistency Isn't Free\r\nMany stream processing systems claim to offer \"strong\" consistency, even across multiple streaming jobs. However, this level of consistency has a price: system stalls during events like join amplifications or dependency mismatches. These bottlenecks can significantly impact real-time performance and overall system reliability.\r\n\r\n* S3 is Cost-Effective, Until It\u2019s Not\r\nModern stream processing systems often use S3 as the primary storage for maintaining states, as it promises lower costs compared to in-memory or on-disk alternatives. But here\u2019s the catch: S3 access costs can skyrocket when cache misses are too frequent. What starts as a cost-saving measure can quickly turn into a major expense.\r\n\r\nIn this talk, I\u2019ll dive deep into these three pitfalls, explaining their causes, possible mitigations, and the hard truths about unsolvable challenges. I\u2019ll share real-world examples of how these issues manifest and the \u201cbloody facts\u201d of how they can bite even the most experienced practitioners.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BH7KTW", "name": "Yingjun Wu", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BH7KTW_Rac9HrO.png", "biography": "Yingjun Wu is the founder of RisingWave Labs (https://www.risingwave.com/), a database company developing RisingWave, a distributed SQL database for stream processing. Before running the company, Yingjun was a software engineer at the Redshift team, Amazon Web Services, and a researcher at the Database group, IBM Almaden Research Center. Yingjun received his PhD degree from National University of Singapore, and was a visiting PhD at Carnegie Mellon University. He has been working in the field of stream processing and database systems for over a decade.", "public_name": "Yingjun Wu", "guid": "a705555f-19ac-55cd-843a-74d73eab0373", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/BH7KTW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EGQNC8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EGQNC8/", "attachments": []}, {"guid": "49af71ab-e255-5a3d-88b6-b8d93b0d355f", "code": "9BQWQ8", "id": 65505, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/9BQWQ8/Entwurf_1_51_7VFf8Va.png", "date": "2025-06-16T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-65505-state-of-native-access-in-apache-lucene", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/9BQWQ8/", "title": "State of native access in Apache Lucene", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Lucene 10 came out last year in October. One of the changes was about the minimum version requirement of Java 21 \u2013 this looks like it allows to introduce new features like native access to file system cache and therefore better memory mapping. Is it as easy as it sounds?", "description": "This talk will discuss the new features of Java 21 and how they can be used in Lucene 10. But it will also show the challenges that come from the fact that the many of the Lucene-relevant APIs are preview-only in that version. \r\nUwe will introduce the mechanisms used to provide access to the native layer of Java 21 but also the limitations. New features implemented using native APIs are `madvise`/`fadvise` kernel hints for the file system, but also preloading of pages required during search.\r\nThe limitations of the current system make it impossible to integrate modern Java APIs into the public API of Lucene, so Lucene 10 still has the same limitations like previous versions with regards to Java preview features. Discussions are ongoing to change the main branch to align to the \"[tip & tail](https://openjdk.org/jeps/14)\" model of development done in the OpenJDK community and start to integrate types like `MemorySegment` as first class citizen to Lucene's public API in the main (development) branch and release new versions with new minimum requirements more often. One example for this is a PR to offload calculations to a graphics card which was submitted to the Lucene project at the time of writing this proposal.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HRJC87", "name": "Uwe Schindler", "avatar": "https://program.berlinbuzzwords.de/media/avatars/HRJC87_fMJex7f.jpg", "biography": "Uwe is committer and PMC member of Apache Lucene and Apache Solr. His main focus is on development of Lucene Core. He implemented fast numerical search and is maintaining the new attribute-based text analysis API. He studied Physics at the University of Erlangen-Nuremberg and works as managing director for SD DataSolutions GmbH in Bremen, Germany, a company that provides consulting and support for Apache Lucene, Elasticsearch, and Apache Solr. He also works for \u201cPANGAEA \u2013 Publishing Network for Geoscientific & Environmental Data\u201d where he implemented the portal's geo-spatial retrieval functions with Lucene Java. Uwe had talks about Lucene at various international conferences like the previous Berlin Buzzwords, ApacheCon EU/US, Lucene Revolution, Lucene Eurocon, and various local meetups.", "public_name": "Uwe Schindler", "guid": "66a8fa69-bb21-5e95-bc1f-c8a092640daf", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/HRJC87/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/9BQWQ8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/9BQWQ8/", "attachments": []}, {"guid": "b53ae6d4-da10-53b8-90df-2bb6a75c76d5", "code": "DARWF8", "id": 64833, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/DARWF8/Entwurf_1_42_rTgALHZ.png", "date": "2025-06-16T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-64833-when-statefulsets-are-not-enough", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DARWF8/", "title": "When StatefulSets are not enough", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "K8s StatefulSets present significant hurdles for scaling and migrating large-scale cloud database workloads. We'll cover scaling strategies beyond vanilla StatefulSets and share lessons on executing zero-downtime live migrations using custom controllers, durable execution workflows, and tackling complex synchronization problems in ClickHouse Cloud.", "description": "This is a densely packed technical talk that teaches you Auto Scaling architecture, Kubernetes StatefulSets and their limitations, various scaling strategies and statefulset alternatives. We also look at building custom kubernetes controllers for the purpose of changing our orchestration code-path, and investigate leveraging durable execution workflows like Temporal for managing zero downtime migrations.\r\n\r\nYou will understand the Pros and Cons of Break-First and Make-First scaling models and which to use when. We focus on the challenges that prevent doing Make-first with traditional StatefulSets. We discuss open source projects such as Advanced StatefulSets, OpenKruise and a custom Multi-StatefulSet approach.  We go into the story of moving from one mode of orchestrating StatefulSet to another via a Live Migration, without breaking the running queries. Finally we end with some ClickHouse specific problems we encountered during the migrations and how we solved them.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EM8WQ7", "name": "Manish Gill", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EM8WQ7_bU6DEk5.jpg", "biography": "Manish Gill works at ClickHouse Inc, where he is managing the AutoScaling team for ClickHouse Cloud. He is based out of Berlin and is deeply interested in Databases and Cloud challenges and still considers himself new to Kubernetes.\r\n\r\nIn a past life, he worked in an ML research team doing Traffic prediction for at Global Scale and was a Data Engineer for more than half a decade before that.", "public_name": "Manish Gill", "guid": "4fd48c5a-ef36-5893-83dc-73ab9cf815ee", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/EM8WQ7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DARWF8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DARWF8/", "attachments": []}, {"guid": "9110e610-9b83-5cfd-9c77-914c7ac1b97c", "code": "J93LCV", "id": 63251, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/J93LCV/Entwurf_1_32_1fElk11.png", "date": "2025-06-16T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Frannz Salon", "slug": "bbuzz25-63251-reproducibility-in-embedding-benchmarks", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/J93LCV/", "title": "Reproducibility in Embedding Benchmarks", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Reproducibility in embedding benchmarks is challenging, especially with embedding models that are instruction-tuned and increasingly large. Learn how MTEB tackles prompt variability, scaling issues, and large datasets to ensure fair and consistent evaluations, setting a standard for benchmarking in embeddings.", "description": "Reproducibility in embedding benchmarks is no small feat. Prompt variability, growing computational demands, and evolving tasks make fair comparisons a challenge. The need for robust benchmarking has never been greater. \r\n\r\nThe Massive Text Embedding Benchmark (MTEB) addresses these challenges with a standardized, open-source framework for evaluating text embedding models. Covering diverse tasks like clustering, retrieval, and classification, MTEB ensures consistent and reproducible results. Extensions like MMTEB (multilingual) and MIEB (image) further expand its capabilities.\r\n\r\nIn this talk, we\u2019ll explore the quirks and complexities of benchmarking embedding models, such as prompt sensitivity, scaling issues, and emergent behaviors. We\u2019ll show how MTEB simplifies reproducibility, making it easier for researchers and industry practitioners to measure progress, choose the right models, and push the boundaries of embedding performance.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ABV3RR", "name": "Isaac Chung", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ABV3RR_DGDipeD.png", "biography": "My focus is on making AI systems usable, scalable, and maintainable. I'm currently a Staff Data Scientist at Zendesk QA, working on LLM-powered features that see millions of conversations a day. \r\n\r\nPreviously at Clarifai, I helped build and maintain multimodal retrieval systems in production. My background is in Aerospace Engineering and Machine Learning and I hold undergraduate (B.A.Sc in EngSci) and graduate (M.A.Sc) degrees from the University of Toronto.\r\n\r\nIn my spare time, I am a maintainer for MTEB, I like to see the world, and do a bit of swim/bike/run racing.", "public_name": "Isaac Chung", "guid": "7b2cecc4-1faa-52f2-964d-2c318a393619", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/ABV3RR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/J93LCV/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/J93LCV/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/J93LCV/resources/Isaac_Chung__Repro_sLSNfT5.pdf", "type": "related"}]}, {"guid": "94d2de79-efe2-571e-9885-f24ef9d349ad", "code": "CMBUQB", "id": 65493, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/CMBUQB/Entwurf_1_40_i3Hx0tw.png", "date": "2025-06-16T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-65493-best-practices-for-running-databases-on-kubernetes", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CMBUQB/", "title": "Best Practices for Running Databases on Kubernetes", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Running open source databases on Kubernetes? Learn best practices for high availability, security, backups, and disaster recovery. Discover key pitfalls to avoid and see how Operators simplify database management for MySQL, MongoDB, and PostgreSQL in Kubernetes environments.", "description": "So you\u2019re looking to run your Open Source Database on Kubernetes. What best practices should you follow and what pitfalls should you avoid ? In this presentation we will look at how to run stateful applications on Kubernetes overall as well as what is particularly important for databases - we will cover high availability, security, backups and disaster recovery. Finally we will show how these practices can be implemented with Percona Operators for MySQL, MongoDB, PostgreSQL - one of the leading solutions to run Open Source Databases on Kubernetes", "recording_license": "", "do_not_record": false, "persons": [{"code": "QGCFRJ", "name": "Peter Zaitsev", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QGCFRJ_5sNnZcj.jpg", "biography": "Peter Zaitsev is an entrepreneur and co-founder of Percona, Coroot, FerretDB and other tech companies. As one of the leading experts in Open Source strategy and database optimization, Peter has applied his technical knowledge and entrepreneurial drive to contribute as a board member and advisor to several open source startups. Additionally, Peter is the co-author of the book \"High Performance MySQL: Optimization, Backup and Replication,\" one of the most popular books on MySQL performance.", "public_name": "Peter Zaitsev", "guid": "6dcee61b-5dbd-5315-a326-5a6d4e695c8a", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/QGCFRJ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CMBUQB/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CMBUQB/", "attachments": []}, {"guid": "21e9101c-49b3-578b-8ab5-44c9629317df", "code": "YGPGNY", "id": 62364, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/YGPGNY/Entwurf_1_6_Oju7YbT.png", "date": "2025-06-16T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-62364-apache-iceberg-ingestion-with-apache-nifi", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YGPGNY/", "title": "Apache Iceberg ingestion with Apache NiFi", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "With Apache NiFi, a multimodal data pipelining tool, you can assemble existing and/or custom Java & Python processors into a variety of flows. Watch a rich data pipeline be constructed from Kafka, stored using the Apache Iceberg table format and consumed from Trino.", "description": "A cornerstone requirement of an Icehouse (Iceberg + Trino) is data ingestion. One approach is to leverage Apache NiFi. NiFi, a multimodal data pipelining tool, has a multitude of processors that can be assembled into a flow to address your specific scenarios. NiFi's low-code/no-code approach allows data engineers to rapidly build, deploy, and monitor their data ingestion & transformation pipelines. NiFi also allows custom processor development with a variety of languages, including Java and Python.\r\n\r\nThis presentation will iterate through a few common approaches and ultimately demonstrate a rich data pipeline that sources data from Kafka, performs typical transformation processing (including enrichment), and loads data into a high-performance Iceberg table that will be consumed via Trino.", "recording_license": "", "do_not_record": false, "persons": [{"code": "M8QLKK", "name": "Lester Martin", "avatar": "https://program.berlinbuzzwords.de/media/avatars/M8QLKK_5Mpl1YO.JPG", "biography": "Lester Martin is a seasoned developer advocate, trainer, blogger, and data engineer focused on data pipelines & data lake analytics using Trino, Iceberg, Hive, Spark, Flink, Kafka, NiFi, NoSQL databases, and, of course, classical RDBMSs.  Check out Lester's blog at https://lestermartin.blog.", "public_name": "Lester Martin", "guid": "2e76688f-658c-5ca2-84d5-7fd9fad7940a", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/M8QLKK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YGPGNY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YGPGNY/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/YGPGNY/resources/Lester_Martin__Ice_rGq8uOb.pdf", "type": "related"}]}, {"guid": "0b2e8d27-61d0-5f74-a6f3-3ce56f6022b2", "code": "JPFG8A", "id": 65379, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/JPFG8A/Entwurf_1_34_UKGZ8p2.png", "date": "2025-06-16T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-65379-exploring-reranking-depth-in-modern-search-pipelines", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JPFG8A/", "title": "Exploring reranking depth in modern search pipelines", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "The use of semantic reranking on top of a \u2018cheaper\u2019 retrieval step is common in modern search applications. The reranking depth represents the number of documents that we select to retrieve and feed into the reranking model. We experiment with different models and datasets and we present our findings including some counterintuitive ones.", "description": "The use of semantic reranking on top of a \u2018cheaper\u2019 retrieval step becomes more and more common in modern search applications. It offers a different cost quality profile to semantic retrieval trading indexing time compute for retrieval time compute. The depth represents the number of documents that we select to retrieve and feed into the reranking model in order to optimise their ordering. Intuitively, there is a \u201cnatural\u201d trade-off between the uplift we can achieve by operating on an increased pool of candidates and the associated cost of running \u201cexpensive\u201d semantic rerankers for longer. In this presentation we start by investigating the behaviour of different models across different scenarios and we present our observations including some counterintuitive ones. Then, we attempt to explain the emergence of certain patterns and finally we revisit the \u201cefficiency vs effectiveness\u201d trade-off from two different perspectives.\r\n\r\nHere, is an outline of the talk:\r\n\r\nFirst, we analyse the retrieval performance as a function of the reranking depth and we identify three main patterns: \r\n- Fast increase followed by saturation: this is the most common scenario where larger reranking depth leads to increased performance \r\n- Fast increase to a maximum then decay: this is the first \u201ccounter-intuitive\u201d result where reranking is beneficial until a certain depth but then performance degrades\r\n- Steady decay: this is the case where the reranker actually worsens the ordering of the results provided by the retriever - it\u2019s the least common scenario but still a counter-intuitive result\r\n\r\nSecond, we dive into these three classes and we attempt to explain the observed behaviour. For the first pattern we design a curve fitting procedure which provides a surprisingly good fit. For the other two cases we discuss some potential underlying causes for the performance decline.\r\n\r\nThird, we connect our findings to existing works in the industry or academia and we highlight some of the dataset characteristics that seem relevant to the observed results\r\n\r\nFourth, we show how the interplay of the scores between positive(or relevant) and negative(or irrelevant) documents can explain the emergence of the patterns\r\n\r\nFinally, we revisit the \u201cefficiency vs effectiveness\u201d tradeoff . We start with a \u201clatency-free\u201d analysis where we focus only on the evolution of our performance metric and examine the possibility of using a smaller reranking depth without losing much of the gains. We also show how this correlates with the recall performance of the first-stage retriever. Then, we incorporate the latency cost in order to present a more realistic scenario and explain the trade-offs under different budget constraints.\r\n\r\nThis talk is relevant to the audience because:\r\n- Retrieval performance remains critical in modern applications such as RAG. \r\n- Highlights the importance of domain-specific evaluation", "recording_license": "", "do_not_record": false, "persons": [{"code": "CYTFGU", "name": "Athanasios Papaoikonomou", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CYTFGU_o4LOdJD.png", "biography": "Senior ML Engineer / NLP at Elastic", "public_name": "Athanasios Papaoikonomou", "guid": "090d4c65-d3d8-591c-a974-c1d64e30201a", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/CYTFGU/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JPFG8A/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JPFG8A/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/JPFG8A/resources/Thanos_Papaoikonom_M3tfVAX.pdf", "type": "related"}]}]}}, {"index": 3, "date": "2025-06-17", "day_start": "2025-06-17T04:00:00+02:00", "day_end": "2025-06-18T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "f93eda63-aba3-5c99-9d52-f018f0dae5da", "code": "HYMXUP", "id": 65316, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/HYMXUP/Entwurf_1_31_Fwr1Nmu.png", "date": "2025-06-17T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz25-65316-breaking-search-for-fun-and-profit", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HYMXUP/", "title": "Breaking Search For Fun and Profit", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "With a little experience it's easy to find site search queries that don't work. With live examples picked from a variety of high-profile websites, I'll show you how to easily break search - and discuss what we mean by 'broken', the different kinds of failure and what they reveal about the underlying search engine and how we might improve it.", "description": "Even the most sophisticated search system will fail in some cases - we simply can't predict all the possible queries the user might try, what they are actually trying to achieve and how they might express their needs. Let's try together to break search on a number of well-known websites - but not just for fun! There are different kinds of 'broken' - zero result searches, irrelevant results, system errors - we'll describe each of these and show examples, creating a classification system for search failures. We'll then talk about what underlying issues with search we might be able to reveal and how they could be fixed to improve overall search quality.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3MCCUJ", "name": "Charlie Hull", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3MCCUJ_nBIHDxD.jpg", "biography": "I am a leading figure in the search industry, known for an honest, neutral and pragmatic viewpoint; I have held multiple roles including senior consultant, strategic advisor, project manager, sales & marketing director, conference organiser & speaker, trainer, writer & mentor. I am deeply connected with the business & technology of website and enterprise search engines with particular experience of small, high-value consulting companies. My past experience in software engineering gives me a highly informed perspective on search technology with a particular focus on open source platforms such as Lucene, Apache Solr, Elasticsearch and OpenSearch.  More recently I have helped several companies use modern AI techniques to supercharge search. I co-wrote Searching the Enterprise, ran the Haystack conference for 5 years and held leadership positions at OpenSource Connections and Flax.", "public_name": "Charlie Hull", "guid": "d4e6be87-f5d7-5719-ac4f-f0df4a854ca4", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/3MCCUJ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HYMXUP/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/HYMXUP/", "attachments": []}, {"guid": "3684cb1a-bce0-5f03-8204-6bd94faa4beb", "code": "UAFNDR", "id": 64799, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/UAFNDR/Entwurf_1_15_5PEtBSZ.png", "date": "2025-06-17T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-64799-hybrid-search-on-hybrid-models-at-scale", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UAFNDR/", "title": "Hybrid search on hybrid models, at scale", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "We present an extensible hybrid search solution using Elasticsearch, built on a multi-index architecture and allowing the integration of multiple embedding models. Our approach addresses the challenges of searching a vast and heterogeneous collection, using different chunking granularity and offering an alternative to reciprocal rank fusion.", "description": "Over the last few years we have been pushing to the limits our full-text search solution for the French Audiovisual Institute. However, some areas of our immense corpus are still inaccessible, either because the multimedia content lacks textual annotations, or because the automatic transcriptions are not self-sufficient for an efficient full-text search.\r\n\r\nSemantic search appears as a natural complement, but the scalability of the implementation reveals specific challenges in capacity planning and chunking strategies to accommodate different embedding models.\r\n\r\nNevertheless, when it comes to merging the benefits of both text and vector search methods, the success of the hybrid search approach relies essentially on the reranking algorithm. To address this, we developed an alternative to the reciprocal rank fusion based on our needs, specifically tailored for a multi-index architecture and integrating multiple embedding sets.\r\n\r\nIn this talk, we share our experience in building an extensible hybrid search solution, covering everything from complex functional modeling to cluster architecture design. Attendees will gain practical insights into handling billions of vectors in real-world scenarios, such as within large graph data structures. Additionally, we will explore the challenges of hybrid reranking, discussing the limitations of standard fusion techniques and the rationale behind our novel approach.\r\n\r\nWhile relevance evaluation is still ongoing, our modular architecture enables continuous iteration, ensuring the adaptability to the rapid evolution of embedding models and vector optimizations. This flexibility positions our solution to remain at the forefront of large-scale semantic search, balancing precision, scalability, and efficiency.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JNFKMQ", "name": "Radu Pop", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JNFKMQ_bCBicJL.JPG", "biography": "Radu provides consulting services as a Solutions Architect at Adelean. He handles projects around Elasticsearch and Adelean\u2019s A2 search technology. He oversees the integration and evolution of search engines within large e-commerce platforms, marketplaces, or organizations' data lakes. Prior to joining Adelean, Radu acquired solid experience in web archiving, operating large-scale crawling systems in the context of several European research projects. He holds a PhD in Computer Science and an MSc in Distributed Systems.", "public_name": "Radu Pop", "guid": "adbcba05-e957-5634-87ae-766e99e33c4c", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/JNFKMQ/"}, {"code": "CQLPDE", "name": "Pietro Mele", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CQLPDE_BqInQg7.jpg", "biography": "Italian, adopted by France not long ago, I am a constant learner, dedicated to computer science and discovery\u2014whether uncovering solutions or gaining insights.\r\n\r\nSpeaker at : \r\n\r\n- ElasticON 2023 - Searching through large graphs using Elasticsearch\r\n- Devoxx France 2023 - Cloning CHATGPT with ElasticSearch and HuggingFace\r\n- 10th Meetup Search & Data - Construire une API conversationnelle au dessus d'un moteur de recherche\r\n- Haystack US 2023 - Dive into NLP with the Elastic Stack\r\n- VoxxedDays Luxembourg 2023 - Cloner ChatGPT avec Hugging Face et Elasticsearch\r\n- DevoxxMorocco 2023 - Conversational Search - Unleashing the Power of Voice Search, Question Answering, and LLMs\r\n- DevFest Toulouse 2023 -  Cloner ChatGPT avec Hugging Face et Elasticsearch\r\n- 11th Meetup Search & Data - Exploration of an Open Source Rag System\r\n- Devoxx France 2024 - Mettre en place un RAG Open Source en 30 minutes\r\n- Devoxx France 2024 - Construire son Assistant Intelligent avec Hugging Face et Elasticsearch\r\n- OpensearchCon EU 2024 - Implementing an open-source RAG with OpenSearch\r\n- VoxxedDays Luxembourg 2024 - Home Assistant sous surveillance\r\n- Devoxx Morocco 2024 - A practical guide about prompt engineering\r\n- 1st OpenSearch France UG - To the discovery of OpenSearch AI superpowers!\r\n- Big Data Europe 2024 - Exploring Large Graphs at the Heart of the French National Audiovisual Institute\r\n-ElasticON 2025 - Billion vectors baby\r\n-Devoxx UK 2025 - Exploring Large Graphs at the Heart of the French National Audiovisual Institute\r\n-OpensearchCon EU 2025 - Monitoring a smart home with Opensearch", "public_name": "Pietro Mele", "guid": "26efca5d-0c58-5034-804b-9a0312f01178", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/CQLPDE/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UAFNDR/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UAFNDR/", "attachments": []}, {"guid": "51263952-e793-57d1-9f5d-80f4958aa353", "code": "YQJEPH", "id": 65394, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/YQJEPH/Entwurf_1_8_fcfCyVo.png", "date": "2025-06-17T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-65394-minicoil-sparse-neural-retrieval-done-right", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YQJEPH/", "title": "miniCOIL: Sparse Neural Retrieval Done Right", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this talk, we present miniCOIL \u2014 our attempt to make a sparse neural retrieval model as it should be \u2014 combining the benefits of dense and lexical retrieval without propagating their drawbacks. We will share how to design and train a lightweight model that is performant on out-of-domain data and demonstrate its capabilities.", "description": "Production search solutions often need the benefits of exact matching and semantic similarity \u2014 who wouldn\u2019t want to have it all?\r\nThe most famous to-go approach is hybrid search, which combines old but gold lexical methods with dense retrieval models. Hybrid search is famous for a reason; however, due to its dual component nature, taking the best of both worlds, it also takes the worst \u2014 propagates all the intricacies of vector search (heavy vectors, capricious indexes) and limitations of lexical approaches (low recall).\r\nA less famous solution is sparse neural retrieval \u2014 models, which make exact matching semantically aware, can distinguish \u201ca fruit bat\u201d and \u201ca baseball bat\u201d. You might know sparse neural retrieval for SPLADE, a leader in sparse neural benchmarks & a heavy model creating not-so-sparse vectors with its query/document extension mechanisms.\r\nSparse neural retrieval seems pitch-perfect from afar: inverted indices and semantical understanding combined. It\u2019s perhaps overlooked since many attempts to make it lightweight & performant on out-of-domain data failed.\r\nminiCOIL is our shot to give sparse neural retrieval more deserved attention \u2014 a lightweight model understanding words\u2019 meaning within the context, performant on out-of-domain datasets and easy to adapt to custom data.\r\nIn this talk, after an introduction in the context of sparse neural retrieval, we will show the architecture behind miniCOIL and demonstrate its capabilities.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DQMD97", "name": "Evgeniya Sukhodolskaya", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DQMD97_uVXHLSm.jpg", "biography": "Developer Relations at Qdrant with 7 years of IT experience across software engineering, machine learning, and technical management, and 3 years in Developer Relations. Holds a Master\u2019s in Machine Learning, Data Analytics, and Data Engineering. Passionate about NLP, data-centric AI, and the role of vector databases in advancing AI technologies.", "public_name": "Evgeniya Sukhodolskaya", "guid": "76b95d48-3f05-5de6-a988-646d836fb2e3", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/DQMD97/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YQJEPH/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/YQJEPH/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/YQJEPH/resources/Evgeniya_Sukhodols_qhMf54p.pdf", "type": "related"}]}, {"guid": "3b000086-b251-59be-9ba3-609cbe424d52", "code": "GFBXSY", "id": 61248, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/GFBXSY/Entwurf_1_23_hdfAic6.png", "date": "2025-06-17T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-61248-flinkcdc-streamlining-your-data-analytics-pipelines", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/GFBXSY/", "title": "FlinkCDC: Streamlining your data analytics pipelines", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Change Data Capture (CDC) is a powerful technique that enables organisations to react to data changes in real-time. In this talk we will explore FlinkCDC, a component of Apache Flink, and demonstrate how it leverages Flink's robust stream processing capabilities to provide CDC pattern.", "description": "Through practical demo, we'll see how FlinkCDC efficiently captures, transforms and loads data change across many systems with minimal latency, enabling seamless data integration and real-time analytics. Moreover, we'll look under the hood to learn more about its fault-tolerance mechanisms.\r\n\r\nWhether you're dealing with legacy systems or building modern data architectures, you will gain insights to implement efficient, reliable, and robust CDC solutions using FlinkCDC.", "recording_license": "", "do_not_record": false, "persons": [{"code": "AE8HH7", "name": "Muhammet Orazov", "avatar": "https://program.berlinbuzzwords.de/media/avatars/AE8HH7_qXi4Axv.png", "biography": "Muhammet is Software Engineer at Ververica, the original creators of Apache Flink\u00ae. He is member Engine team that develops various Flink engines for different platforms. He is experienced in databases, distributed systems and started his journey in streaming systems at Ververica.", "public_name": "Muhammet Orazov", "guid": "9fdffa4d-5f5a-517c-bab0-b1358cbe19ff", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/AE8HH7/"}], "links": [{"title": "FlinkCDC Documentation", "url": "https://nightlies.apache.org/flink/flink-cdc-docs-master/", "type": "related"}, {"title": "FlinkCDC GitHub Repository", "url": "https://github.com/apache/flink-cdc", "type": "related"}], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/GFBXSY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/GFBXSY/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/GFBXSY/resources/Muhammet_Orazov_Be_QFUP7fg.pdf", "type": "related"}]}, {"guid": "e1622288-f020-5a3b-bbc5-51d0bf90828f", "code": "CHAZHA", "id": 64828, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/CHAZHA/Entwurf_1_39_nRhBTK5.png", "date": "2025-06-17T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-64828-mastering-real-time-anomaly-detection-with-open-source-tools", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CHAZHA/", "title": "Mastering real-time anomaly detection with open source tools", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "With data moving faster than ever, detecting problems as they happen is crucial. This talk covers how to build a real-time anomaly detection system using Apache Kafka for streaming, Apache Flink for processing, and AI for pattern recognition. Plus, we\u2019ll explore Apache Iceberg for storing historical data to refine models.", "description": "Detecting problems as they happen is essential in today\u2019s fast-moving world. This talk shows how to build a simple, powerful system for real-time anomaly detection. We\u2019ll use Apache Kafka for streaming data, Apache Flink for processing it, and AI to find unusual patterns. Whether it\u2019s spotting fraud, monitoring systems, or tracking IoT devices, this solution is flexible and reliable.\r\n\r\nFirst, we\u2019ll explain how Kafka helps collect and manage fast-moving data. Then, we\u2019ll show how Flink processes this data in real time to detect events as they happen. We\u2019ll also explore how to add AI to the pipeline, using pre-trained models to find anomalies with high accuracy. Finally, we\u2019ll look at how Apache Iceberg can store past data for analysis and model improvements. Combining real-time detection with historical data makes the system smarter and more effective over time.\r\n\r\nThis talk includes clear examples and practical steps to help you build your own pipeline. It\u2019s perfect for anyone who wants to learn how to use open-source tools to spot problems in real-time data streams.", "recording_license": "", "do_not_record": false, "persons": [{"code": "LLBXBT", "name": "Olena Kutsenko", "avatar": "https://program.berlinbuzzwords.de/media/avatars/LLBXBT_HE8avlV.png", "biography": "Olena is a Staff Developer Advocate at Confluent and a recognized expert in data streaming and analytics. With two decades of experience in software engineering, she has built mission-critical applications, led high-performing teams, and driven large-scale technology adoption at industry leaders like Nokia, HERE Technologies, AWS, and Aiven.\r\n\r\nA passionate advocate for real-time data processing and AI-driven applications, Olena empowers developers and organizations to use the power of streaming data. She is an AWS Community Builder, a dedicated mentor, and a volunteer instructor at a nonprofit tech school, helping to shape the next generation of engineers.\r\n\r\nAs an international speaker and thought leader, Olena regularly presents at top global conferences, sharing deep technical insights and hands-on expertise. Whether through her talks, workshops, or content, she is committed to making complex technologies accessible and inspiring innovation in the developer community.", "public_name": "Olena Kutsenko", "guid": "317c2014-3ceb-566c-80f7-b9f6e41f062d", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/LLBXBT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CHAZHA/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/CHAZHA/", "attachments": []}, {"guid": "fc620c2f-d451-566e-a9f7-6773f6d69904", "code": "VQJL9U", "id": 65991, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/VQJL9U/Entwurf_1_17_sN7kARA.png", "date": "2025-06-17T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-65991-go-beyond-basic-rag-with-agentic-behavior", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/VQJL9U/", "title": "Go Beyond Basic RAG with Agentic Behavior", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "RAG revolutionized AI by merging search and generation, and agentic behavior takes this search to the next level by enabling LLMs to make decisions and call tools. This talk covers agentic behavior's key features: tool integration and reasoning, along with a live demo.", "description": "Retrieval-Augmented Generation (RAG) has transformed how we build Q&A systems with Large Language Models (LLMs) by combining the strengths of search and generation. However, traditional RAG workflows are static and often struggle to handle the dynamic and complex demands of real-world applications, such as answering multi-step queries, integrating external APIs, or gracefully recovering from retrieval failures. Agentic behavior addresses these challenges by extending RAG pipelines, enabling LLMs to make decisions, integrate tools, and dynamically adapt workflows.\r\n\r\nIn this talk, we\u2019ll explore how agentic behavior enhances pipelines. We\u2019ll define what it means for a system to act as an \u201cagent\u201d and cover core concepts like routing, tool calling, and reasoning. Using hands-on examples implemented in Python, we\u2019ll walk through practical use cases, such as integrating external APIs and solving multi-step problems. Finally, we\u2019ll tackle challenges like transparency in complex systems and share how graph-based approaches can make these workflows more interpretable.", "recording_license": "", "do_not_record": false, "persons": [{"code": "PVSNUG", "name": "Bilge Y\u00fccel", "avatar": "https://program.berlinbuzzwords.de/media/avatars/PVSNUG_N0VnMiY.JPG", "biography": "She is a developer relations engineer at deepset and is passionate about RAG, LLMs, and all things Gen AI. She enjoys making complex AI concepts accessible to all and helps developers build powerful AI applications with Haystack and beyond.", "public_name": "Bilge Y\u00fccel", "guid": "f8c5b23a-9de7-56c2-8983-a22c3cce5eb0", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/PVSNUG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/VQJL9U/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/VQJL9U/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/VQJL9U/resources/Bilge_Yucel__Go_Be_gv9Ecj7.pdf", "type": "related"}]}, {"guid": "a9dfcaef-b8cc-536e-9e0f-e2cf659ea424", "code": "UABLUX", "id": 59528, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/UABLUX/Entwurf_1_13_MYBgw14.png", "date": "2025-06-17T16:00:00+02:00", "start": "16:00", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz25-59528-text-search-on-images-with-quantized-colpali", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UABLUX/", "title": "Text Search on Images with Quantized ColPali", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "ColPali is revolutionary\u2014here\u2019s why: it combines document retrieval with a vision-based large language model, allowing you to search directly within images without needing to extract text. However, running the full model on personal hardware can be challenging due to its computational demands. And thus we\u2019ve released a quantized version of ColPali.", "description": "ColPali is a late interaction model, that is the context remain intact. And it's finetuned on vision LLM, Pali Gemma to be able to perform text search on images. But what we did was to be able to bring it more towards consumer by quantizing the model, so you can perform search locally on your laptop.\r\n\r\nThe talk will cover:\r\nWhat is ColPali?\r\nWhat is Late-Interaction?\r\nHow can you deploy it locally?", "recording_license": "", "do_not_record": false, "persons": [{"code": "FTZPWC", "name": "Sonam Pankaj", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FTZPWC_fwShrFm.jpg", "biography": "Sonam is a GenerativeAI Evangelist. She is also the author of embedanything,  which is an opensource ingestion, inference and indexing solution in rust with more than 200k+ downloads and 500+ stars in past 9 months. She has previously worked in generative AI and conversational AI. She is also building StarlightSearch, a local and on-premise solution for search and agents in rust.", "public_name": "Sonam Pankaj", "guid": "baefcab1-2ffc-5372-97d8-749afc1fff0b", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/FTZPWC/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UABLUX/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UABLUX/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/UABLUX/resources/Sonam_Pankaj_-_Tex_ozbwSCA.pdf", "type": "related"}]}, {"guid": "782df7a5-6d14-537d-9f33-5ae2a44bf1bb", "code": "X9Q3Q7", "id": 65600, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/X9Q3Q7/Entwurf_1_3_L3yNLiy.png", "date": "2025-06-17T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz25-65600-flink-jobs-as-agents-stream-processing-for-agentic-ai", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/X9Q3Q7/", "title": "Flink Jobs as Agents \u2013 Stream Processing for Agentic AI", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Apache Flink is uniquely positioned to serve as the backbone for AI agents, providing them with stream processing as a powerful new tool. We'll explore how Flink jobs can be transformed into \"Agents\"\u2014autonomous, goal-driven entities that dynamically interact with data streams, trigger actions, and adapt their behavior based on real-time insights.", "description": "We\u2019ll showcase Flink jobs as AI agents through two key stream processing & AI use cases: 1) financial planning & detection of spending anomalies, as well as 2) forecasting demand & supply chain monitoring for disruptions.\r\n\r\nAI agents need business context. We\u2019ll discuss embedding foundation models with schema registries and data catalogs for contextual intelligence while ensuring data governance and security. We\u2019ll integrate Apache Kafka event streams with data lakes in open-table formats like Apache Iceberg, enabling AI agents to leverage real-time and historical data for consistency and reasoning. We\u2019ll also cover latency optimization for time-sensitive use cases while preventing hallucinations.\r\n\r\nFinally, we\u2019ll demonstrate an open-source conversational platform on Apache Kafka, where multiple AI agents are assigned to a business process, continuously process real-time events while optimizing for their individual goals, interacting, and negotiating with each other.\r\n\r\nBy combining Flink and Kafka, we can build systems that are not just reactive but proactive and predictive, paving the way for next-generation agentic AI.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HNMPC7", "name": "Steffen Hoellinger", "avatar": "https://program.berlinbuzzwords.de/media/avatars/HNMPC7_gSCmcZX.png", "biography": "Steffen Hoellinger is the co-founder and CEO of Airy, an innovative AI startup focused on building open source data infrastructure that combines the power of data streaming, stream processing, and AI. With a deep passion for the power of real-time, AI-driven insights, Steffen leads Airy in providing scalable, efficient solutions that empower enterprises to harness the full potential of generative AI and advanced machine learning and help shape the future of business.", "public_name": "Steffen Hoellinger", "guid": "7e24e851-6f32-52e2-b9f5-de5386a9c88f", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/HNMPC7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/X9Q3Q7/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/X9Q3Q7/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/X9Q3Q7/resources/Steffen_Hollinger-_EWziiiv.pdf", "type": "related"}]}, {"guid": "fd029c97-db73-53fb-9251-e6c072842b79", "code": "JJ3FM7", "id": 68467, "logo": null, "date": "2025-06-17T17:15:00+02:00", "start": "17:15", "duration": "00:15", "room": "Kesselhaus", "slug": "bbuzz25-68467-closing-session", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JJ3FM7/", "title": "Closing Session", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Join us as we wrap up Berlin Buzzwords.", "description": "", "recording_license": "", "do_not_record": false, "persons": [{"code": "LWMKUK", "name": "Berlin Buzzwords Team", "avatar": null, "biography": null, "public_name": "Berlin Buzzwords Team", "guid": "be54fbab-5192-5dce-8c8d-f50df81e263c", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/LWMKUK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JJ3FM7/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JJ3FM7/", "attachments": []}], "Maschinenhaus": [{"guid": "2ccf7cf1-5d3d-50ea-8d12-571234a24dd9", "code": "EGTSQD", "id": 64770, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/EGTSQD/Entwurf_1_48_uSx9uyh.png", "date": "2025-06-17T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz25-64770-contexts-machines-how-document-parsing-shapes-rag-results", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EGTSQD/", "title": "Contexts & Machines: How Document Parsing Shapes RAG results", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "How different document parsing and chunking strategies impact RAG pipeline performance? Using real-life documents and LLM-generated question/answer pairs, we assess multiple methods \u2013 both open-source and commercial \u2013 showing that parsing quality significantly affects response accuracy and that the best approach may depends on the question type.", "description": "Retrieval-Augmented Generation (RAG) pipelines have shown their effectiveness in exploring complex documents. However, their performance hinges on the quality of the retrieved context, which depends on well-structured document inputs. Real-world documents often contain unstructured elements - images, tables, multi-column text, etc. - making parsing and chunking a critical challenge. Poor document processing can degrade retrieval quality, increasing the risk of hallucinations in LLM responses.\r\n\r\nIn our talk, we will report on the results of a study conducted to  evaluate different PDF parsing and document chunking strategies \u2013 spanning both open-source and commercial-grade solutions \u2013 to determine their impact on RAG performance. Using a dataset of complex documents and LLM-generated question/answer pairs, we apply several evaluation metrics to quantify how different parsing techniques affect the relevance of retrieved information and response accuracy. Our findings reveal that parsing and chunking strategies significantly shape RAG output quality and that the most effective approach may depend on the nature of the queries. By highlighting the interplay between document processing and RAG performance, this study provides actionable insights for building more reliable knowledge retrieval systems.", "recording_license": "", "do_not_record": false, "persons": [{"code": "RE7CD9", "name": "Alessio Vertemati", "avatar": "https://program.berlinbuzzwords.de/media/avatars/RE7CD9_68OLBMr.jpg", "biography": "I'm a software architect and developer. I started as developer on a wide range of technologies spanning from networks, automation and web before becoming technological advisor for knowledge management and document management projects. I'm approaching AI from a deterministic perspective.", "public_name": "Alessio Vertemati", "guid": "587c4444-daba-5c0c-b628-5a0b4ef78095", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/RE7CD9/"}, {"code": "FHD9YS", "name": "Andrea Ponti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FHD9YS_dSIRK8E.JPEG", "biography": "I am a Data Scientist with a Master's degree in Computer Science. I combine academic rigour with hands-on industrial experience, using cutting-edge technologies at the intersection of research and practice.\r\n\r\nMy research focuses on the optimisation of black-box functions using advanced Bayesian methods. From an industrial perspective, I specialise in the development of versatile machine learning solutions, with a focus on foundation models and Large Language Models (LLMs, aka what's behind ChatGPT).\r\n\r\nI am fluent in Italian and English and can converse with AI models.", "public_name": "Andrea Ponti", "guid": "5f14dabc-c4f7-56f6-b4b5-821f4125d477", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/FHD9YS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EGTSQD/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EGTSQD/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/EGTSQD/resources/Alessio_Vertemati__R6PBXT7.pdf", "type": "related"}]}, {"guid": "1d1346a6-b8dd-53eb-9cfc-b7f7c38cc21f", "code": "FNXN8K", "id": 64879, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/FNXN8K/Entwurf_1_19_K0jvY53.png", "date": "2025-06-17T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-64879-observability-for-all", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/FNXN8K/", "title": "Observability for All!", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Observability is the ability to measure the state of the whole system. OpenTelemetry can be used to instrument applications and diagnose issues. But frontend instrumentation is often an afterthought.\r\n\r\nJoin me as I show how OTel, RUM agents and Synthetic Monitoring can help us identify and diagnose issues in all layers of our applications.", "description": "## Background\r\n\r\nBefore joining Elastic as a Developer Advocate, I spent over 10 years working for a bank as a frontend engineer. I have felt the pain of trying to diagnose issues and errors in UIs using logs and diving into minified JavaScript code.\r\n\r\nIn that time, the state of DevOps and SRE has established many practices to help developers instrument their applications to identify unexpected behaviour and performance issues. These practices are generally, backend focused. By combining backend tracing with frontend tracing and metrics, we can better understand how our application behaves and where the issue lies.\r\n\r\n## Outline\r\n\r\nI will discuss how to combine logs, metrics and traces from application services with tools for frontend instrumentation, specifically:\r\n\r\n1. An overview of key observability signals (for anyone unfamiliar with them), and why logs are insufficient in diagnosing issues in our UIs.\r\n2. An examination of how RUM agents work, using the Elastic RUM agent as an example, and the metrics and tracing information they capture that relate to the observability pillars including Google Core Web vitals, latency metrics and traces.\r\n3. Examples showing how front-to-back tracing can be achieved using OpenTelemetry instrumentation combined with existing RUM agents. I'll also touch on the state of the Client Implementation (RUM) approach within the CNCF OpenTelemetry community.\r\n4. An overview of RUM metrics that can be captured to help track usage, potentially as KPIs, and how they can be collected.\r\n5. An outline of what Synthetic Monitoring is, using Playwright and Elastic Synthetics as an example. I'll also cover how it can be used with alerting and SLOs to alert SREs of potential issues in our applications.\r\n\r\n## Target Audience\r\n\r\nI believe the following individuals would be interested in this talk:\r\n\r\n1. UI Developers interested in observing their applications and unsure how to instrument their applications or the tools currently available.\r\n2. DevOps and SRE engineers looking to monitor frontends as part of a wide system-estate.\r\n3. More experienced frontend engineers or designers looking for tools to measure application performance as a regular best practice compared to ad-hoc profiling of web applications.\r\n4. Tech leads and team leads looking for ways to be alerted to potential application issues and behaviours that impact the user experience.", "recording_license": "", "do_not_record": false, "persons": [{"code": "FH8XY7", "name": "Carly Richmond", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FH8XY7_dRJ62KB.jpeg", "biography": "Carly is a Developer Advocate and Manager at Elastic, based in London, UK. Before joining Elastic in 2022, she spent over 10 years as a technologist at a large investment bank, specialising in front-end web development and agility. She is a UI developer who occasionally dabbles in writing backend services, a speaker, and a regular blogger.\r\n\r\nShe enjoys cooking, photography, drinking tea, and chasing after her young son in her spare time.", "public_name": "Carly Richmond", "guid": "e3b21874-4e92-5fb6-a920-a4cd848039a5", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/FH8XY7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/FNXN8K/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/FNXN8K/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/FNXN8K/resources/Carly_Richmond__Ob_GRF9rnm.pdf", "type": "related"}]}, {"guid": "4139aad9-d151-5d42-8bee-8f68acff66a5", "code": "WYLNU9", "id": 64859, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/WYLNU9/Entwurf_1_2_F7d7j6F.png", "date": "2025-06-17T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-64859-from-culture-to-open-source-build-value-driven-communities", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WYLNU9/", "title": "From Culture to Open Source: Build Value-driven Communities", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "A great open-source community isn\u2019t just about code\u2014it\u2019s about people. A strong company culture fosters engagement and growth. This session explores how values like kindness, collaboration, and developer experience (DX) shape both our company and community. Learn practical insights on fostering inclusion, engagement, and long-term impact.", "description": "A great open-source community isn\u2019t just about code\u2014it\u2019s about people. For us \u2013 Weaviators \u2013 we believe a strong company culture of kindness and collaboration is essential for building an inclusive and engaged open-source ecosystem.\r\n\r\nJoin us to learn what you can do to ensure your culture extends beyond your organization and how your values translate into external impact.\r\nBased on real-life examples and tactics,, we will explore:\r\n- The connection between company culture and open-source communities.\r\n- How we foster an internal culture that supports (developer) engagement, and what you could copy from us.\r\n- Practical steps for creating a welcoming, inclusive, and sustainable community.\r\n- The role of developer experience (DX) in driving long-term participation.\r\n\r\nThis session is for you\u2014whether you're an open-source community builder, a company leader, or someone eager to learn how to create a company culture that strengthens both your organization and your community. You'll walk away with actionable insights to help you cultivate a culture that fuels open-source success.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZSHGDJ", "name": "Marion Nehring", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ZSHGDJ_fgeIeQR.jpg", "biography": "I am Marion - Community Manager and DX User Research Lead at Weaviate, tech innovation enthusiast, and lover of fantasy books! I am a highly positive personality with a great sense of humor and a strong human-centered and growth-minded approach to everything.\r\n\r\n\ud83d\udc9e During my 20 years in tech my biggest passion was (and still is) uniting people and tech in order to tackle everyday challenges, grow innovation, and drive change for a sustainable future with the help of technology.\r\n\r\nSo I am getting very excited when people (especially developers) come together to build the next big thing, be creative, and help each other be their most successful and the best version of themself.", "public_name": "Marion Nehring", "guid": "986fb191-0d38-5619-9386-5b2191e6a089", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/ZSHGDJ/"}, {"code": "QL7PGJ", "name": "Jessie de Groot", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QL7PGJ_3AGGvfU.jpg", "biography": "Jessie is VP of People & Culture at Weaviate, a remote-first and open source AI-native start-up. Jessie is passionate about everything related to creating great people programs and sustaining a strong remote company culture. \r\n\r\nJessie loves to talk about remote-first culture, everything people-related, traveling, interior design, coffee and food.", "public_name": "Jessie de Groot", "guid": "7c4c2e1d-9634-58cc-b5d7-031b1a4c6388", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/QL7PGJ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WYLNU9/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WYLNU9/", "attachments": []}, {"guid": "4730ac87-3ca2-53bf-8c50-936b7f23f6e2", "code": "FTSXAX", "id": 63350, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/FTSXAX/Entwurf_1_21_POXumCN.png", "date": "2025-06-17T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-63350-what-s-new-in-the-opensearch-project-and-ecosystem", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/FTSXAX/", "title": "What\u2019s New in the OpenSearch Project and Ecosystem", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Discover how OpenSearch powers search and observability at scale! Now part of The Linux Foundation, OpenSearch is evolving with vector search, NLP, and real-time analytics. Join this session to explore its latest innovations, performance boosts, and expanding ecosystem\u2014directly from the project's Chief Evangelist.", "description": "Audience that is new to OpenSearch will get a brief introduction to the project.\r\nMore advanced audience will get Key Updates and Features\r\nIn addition, the talk will cover Integrations and Ecosystem Growth, highlighting potential for collaboration.\r\n\r\nThe talk will be delivered by the chief advocate of OpenSearch, and will provide an authoritative take on the project and its vision and roadmap.\r\n\r\nMain topics covered include:\r\n-Introduction to OpenSearch\r\n-Evolving OpenSearch: Key Updates and Features\r\n-Real-Time Analytics and Observability\r\n-Integrations and Ecosystem Growth\r\n-Future Roadmap and Vision", "recording_license": "", "do_not_record": false, "persons": [{"code": "Q8N9EV", "name": "Dotan Horovits", "avatar": "https://program.berlinbuzzwords.de/media/avatars/Q8N9EV_gboZIXX.jpg", "biography": "Horovits lives at the intersection of technology, product and innovation. With over 20 years in the hi-tech industry as a software developer, a solutions architect and a product manager, he brings a wealth of knowledge in cloud and cloud-native architectures, big data solutions, DevOps practices and more. \r\n\r\nHorovits is an international speaker and thought leader, as well as an Ambassador of the Cloud Native Computing Foundation (CNCF). He runs the successful OpenObservability Talks podcast, and is a sought writer. \r\n\r\nCurrently working as senior developer advocate for the Open Source Strategy & Marketing team at AWS, Horovits evangelizes on the OpenSearch open source project by the Linux Foundation.", "public_name": "Dotan Horovits", "guid": "b5745369-b304-5896-827c-c35f504ace80", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/Q8N9EV/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/FTSXAX/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/FTSXAX/", "attachments": []}, {"guid": "5d197a62-0a6d-510c-ab92-3ade72d9f973", "code": "KT9BAG", "id": 58999, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/KT9BAG/Entwurf_1_36_H9A3GpJ.png", "date": "2025-06-17T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-58999-cross-domain-enterprise-search-content-diversity-at-scale", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/KT9BAG/", "title": "Cross Domain Enterprise Search - Content Diversity at Scale", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk will focus on learnings gathered when building an enterprise search platform with multi modal content - ranging from highly domain specific content to images to unstructured content. Problems of extraction, inference and relevance shall be discussed, while showcasing cross domain search at scale.", "description": "Cross domain search is a long lasting problem -- from the challenges of ingesting variety of data with different structures, content, noise and extraction strategies to generating multiple ground truth golden data sets to benchmark individual corpus' relevance. Coupled with the challenge of cross domain relevance across multi modal content, with no defined mechanism to normalise scores across individual queries across different content, to the challenges of domain specific terminology, to the challenges of cross modal embedding generators and language specific challenges, the list goes on.\r\n\r\nThis talk will focus on learnings of building an enterprise search system, which literally deals with more than 10 different types of content at the same time, and scales into billions of documents. Attendees can expect to learn novel techniques involved in cross domain ranking, content curation, content extraction and natural language query processing.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KBTXZL", "name": "Atri Sharma", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KBTXZL_9OTe0Wp.jpg", "biography": "Distributed systems and information retrieval guy", "public_name": "Atri Sharma", "guid": "6b1a3e7b-e6f3-5452-9186-8032cd0da689", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/KBTXZL/"}, {"code": "KYHPK7", "name": "Abhishek Singh", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KYHPK7_WudzAsK.jpg", "biography": "I turn complex engineering challenges into scalable, high-impact solutions\u2014while building rockstar teams along the way. With a deep expertise in search, recommender systems, and distributed systems, I thrive at the intersection of machine learning, engineering, and business growth.\r\nIf you like to geek out over AI, search, or the magic of data-driven decisions, let\u2019s connect!", "public_name": "Abhishek Singh", "guid": "50a166f7-1ab8-56e8-87fa-14ec0527ba0c", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/KYHPK7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/KT9BAG/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/KT9BAG/", "attachments": []}, {"guid": "2203d5d6-0648-547f-8726-8a9d7b2ac21e", "code": "7YSV83", "id": 65521, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/7YSV83/Entwurf_1_58_ou1MAId.png", "date": "2025-06-17T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz25-65521-streamlining-search-quality-search-relevance-workbench", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/7YSV83/", "title": "Streamlining Search Quality: Search Relevance Workbench", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Robust Search Evaluation is both a \u201cmust have\u201d for any modern day Search team and an \u201cafter thought\u201d that never gets the team\u2019s full attention. This is especially true with the various open source search engines.  Most teams build their own data collection and eval tools. Some use standalone open source tools. We present a better solution!", "description": "In this talk we will lay out the history of Search Evaluation, why it\u2019s critical in today\u2019s AI powered world, and make the case for why Search Evaluation needs to be part and parcel of any modern Search Engine.  We will share some lessons from building multiple Search Evaluation toolsets, including the popular open source tool Quepid, and why we felt we needed to build the Search Relevance Workbench as an integrated suite.  We will show how SRW collects user click behavior using the User Behavior Insights open standard, and how click data is combined with labeled data to measure search quality.   We\u2019ll show how you can use that information to run optimizers like Learning to Boost and Hybrid Search Optimizers that replace traditional manually tuned algorithms.  You will leave understanding how SRW is different from previous tools, and how you can take advantage of it with your own search engine (not just OpenSearch) as well.", "recording_license": "", "do_not_record": false, "persons": [{"code": "P7BUHK", "name": "Eric Pugh", "avatar": "https://program.berlinbuzzwords.de/media/avatars/P7BUHK_MGO9TMz.png", "biography": "Eric Pugh is the co-founder of OpenSource Connections. Today he helps OSC\u2019s clients, especially those in the ecommerce space, build their own search teams and improve their search maturity, both by leading projects and by acting as a trusted advisor.\r\n\r\nHe is an active maintainer on the OpenSearch Documentation project, and is focused on expanding the suite of Search Relevance features in the OpenSearch Project.\r\n\r\nFascinated by the craft of software development, Eric Pugh has been involved in the open source world as a tester, developer, committer and user for the past twenty years. He is a member of the Apache Software Foundation and co-authored the book Apache Solr Enterprise Search Server, now on its third edition.\r\n\r\nOpenSource Connections mission to empower the world\u2019s search teams comes directly from Eric\u2019s belief in the open source software movement, and the importance of educating people to succeed with it, so that people own their technology.\r\n\r\nWhen not thinking about search, Eric likes to get his hands dirty by building furniture. His next project is a reproduction Danish modern couch, using just hand tools!", "public_name": "Eric Pugh", "guid": "8c2d90c4-e473-561d-b315-26e8a1263945", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/P7BUHK/"}, {"code": "FYJLGL", "name": "Stavros Macrakis", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FYJLGL_SVQMP4e.jpeg", "biography": "Stavros Macrakis is the senior technical product manager for OpenSearch focusing on document and e-commerce search. He has worked on search for 20 years and is passionate about search relevance.", "public_name": "Stavros Macrakis", "guid": "a7e14829-5b90-5761-8bb6-5a8c50a66862", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/FYJLGL/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/7YSV83/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/7YSV83/", "attachments": []}, {"guid": "c67f5936-f621-5c1d-acc4-ff769ad43103", "code": "DNQ8EY", "id": 62908, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/DNQ8EY/Entwurf_1_43_hucokpp.png", "date": "2025-06-17T16:00:00+02:00", "start": "16:00", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz25-62908-siphon-modern-data-stack-with-sf-ch-iceberg", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DNQ8EY/", "title": "Siphon : Modern Data Stack with SF-CH & Iceberg", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Tired of waiting for batch jobs? See how we transformed our data pipeline using Apache Iceberg to stream quality data into Snowflake and Clickhouse simultaneously. Learn about our battle-tested architecture, performance gains, and how we maintain data consistency across dual analytics engines", "description": "Ever wondered how to stream data reliably to multiple warehouses without compromising data quality? We'll show you how Siphon uses Apache Iceberg's time travel and ACID properties to ensure data consistency across Snowflake and Clickhouse. Dive into our journey from batch to streaming - covering architecture evolution, data quality frameworks, and performance optimizations. We'll share our battle-tested patterns for handling schema evolution, managing data contracts, and implementing quality gates. Learn how we achieved sub-minute latency while preventing bad data from corrupting our warehouses. Perfect for data engineers and architects looking to modernize their data infrastructure with real-world proven solutions.", "recording_license": "", "do_not_record": false, "persons": [{"code": "VRAU8K", "name": "Ved Prakash", "avatar": "https://program.berlinbuzzwords.de/media/avatars/VRAU8K_whKx3EI.jpg", "biography": "A Staff Data Engineer with over 15 years of experience in building enterprise data products. Currently pioneering the development of Siphon, a real-time data streaming product that enables reliable data delivery across Snowflake and Clickhouse using Apache Iceberg. Specializes in transforming traditional data pipelines into scalable data products with emphasis on reliability, observability, and user experience.\r\nTheir product engineering journey includes developing self-service data platforms, automated data quality frameworks, and real-time analytics solutions using Snowplow, Monte Carlo, and cloud-native technologies. They've successfully led the productization of data infrastructure across GCP and AWS, implementing infrastructure-as-code practices with Terraform and continuous delivery pipelines.\r\nPassionate about building data products that deliver immediate business value, they focus on creating intuitive, reliable data solutions that empower organizations to make data-driven decisions with confidence. Their product-first approach combines technical expertise with user-centric design to deliver data solutions that scale.", "public_name": "Ved Prakash", "guid": "c4cc4bd6-7b42-56f9-b72a-93d6d7810b94", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/VRAU8K/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DNQ8EY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DNQ8EY/", "attachments": []}], "Palais Atelier": [{"guid": "19dbe2c6-aacd-5850-b7dc-ffdd45ef9bbb", "code": "ZUXSBZ", "id": 63347, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/ZUXSBZ/Entwurf_1_14_pZZZOES.png", "date": "2025-06-17T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-63347-evolution-of-uber-s-search-platform", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/ZUXSBZ/", "title": "Evolution of Uber's Search Platform", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Search is integral to Uber's core business and user experience. In this talk, we\u2019ll explore the unique challenges of Search at Uber and chart the evolution of Uber\u2019s Search Platform\u2014from leveraging Elasticsearch to developing an in-house solution, and finally, innovating in collaboration with the OpenSearch community.", "description": "Search powers critical functionality across all Uber products, including product discovery in the Uber Eats app, seamless pickup and drop-off experiences in Uber Rides, and real-time geospatial matching for drivers and riders. However, this comes with unique technical challenges such as real-time updates, geospatial awareness, and semantic search at scale.\r\n\r\nOver the years, Uber\u2019s Search Platform has undergone significant transformation:\r\n - Initially built entirely on Elasticsearch, we faced challenges related to scalability and feature limitations.\r\n - To address these, Uber developed a custom, in-house solution tailored to meet our unique needs.\r\n - Recognizing the importance of open standards and community-driven innovation, we later embraced OpenSearch, collaborating with its vibrant community to contribute enhancements and ensure long-term sustainability.\r\n\r\nIn this talk, we will discuss:\r\n - The unique technical requirements of Search at Uber.\r\n - The architectural evolution of our platform in response to business growth and new challenges.\r\n - The strategic shift toward collaborating with the open-source ecosystem to foster innovation and scalability.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZPKG3Y", "name": "Yupeng Fu", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ZPKG3Y_qRP04V1.JPG", "biography": "Yupeng Fu is a Principal Engineer in the Platform Engineering organization at Uber. He leads the Search and Real-time Data Platforms. Yupeng is also an active contributor to open-source projects. He is an OpenSearch TSC member and Apache Pinot PMC.", "public_name": "Yupeng Fu", "guid": "158dec16-5c88-54b4-bc92-ad77ef7a78b7", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/ZPKG3Y/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/ZUXSBZ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/ZUXSBZ/", "attachments": []}, {"guid": "c06a4f2d-7a64-5157-a464-6b5ad9f61a91", "code": "L9HNUE", "id": 71568, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/L9HNUE/Entwurf_2_1_R9kLb6R.png", "date": "2025-06-17T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-71568-vespa-ai-s-personalized-search-advanced-ranking-tensor-framework", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/L9HNUE/", "title": "Vespa.ai\u2019s Personalized Search: Advanced Ranking & Tensor framework", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Modern search demands scalable personalization. Discover Vespa\u2019s multi-stage ranking and tensor framework for hybrid queries, multimodal retrieval and real-time ML Learn how to deploy low-latency, high-relevance search systems at petabyte scale.", "description": "Today\u2019s applications require search engines to unify text, vectors, and business logic with millisecond latency at petabyte scale. It\u2019s not easy to balance speed, relevance, and personalization for a large user population and a billion scale item base. Vespa.ai, the open-source engine powering Yahoo, Perplexity, Qwant, Vinted, Spotify addresses this through multi-stage ranking  with close to data tensor operations and easy to understand custom functions. \r\nVespa\u2019s phased architecture enables high performance due to the ability to filter candidates via hybrid retrieval (text + multi vector + filters) before applying ML models for precision or logic for personalisation. Its tensor framework enables multimodal (text/image/video) and multivector queries with real-time individual personalization, scaling beyond 100k QPS with milliseconds latency.\r\nYou will learn Vespa.ai configuration concepts and ideas how all the building blocks (LLMs, VLMs, embedding models, sparse and dense representations for items and users) can be connected together.\r\n\r\n---\r\nThis session is sponsored by <a href=\"https://vespa.ai\">Vespa.ai</a>.", "recording_license": "", "do_not_record": false, "persons": [{"code": "GGXRHR", "name": "Piotr Kobziakowski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GGXRHR_DQQ0yUa.png", "biography": "Piotr Kobziakowski is a Senior Principal Solutions Architect at Vespa.ai, where he leverages over 20 years of expertise in software architecture, network security, big data, and search technologies to design scalable AI-driven solutions for global enterprises. Based in Warsaw, Poland, he specializes in advising organizations on data, analytics and search applications. \r\nPrior to joining Vespa.ai in October 2024, Kobziakowski held progressive technical roles at Elastic, where he architected search and analytics solutions for telecommunications. His career spans across industry leaders like Akamai, Nominum, Cloudmark and Bytemobile, with a focus on optimizing large-scale data and analytics infrastructure and security systems.\r\nPiotr\u2019s approach combines hands-on technical advisory with strategic problem-solving, \r\nthrough delivering workshops and customized training programs. He is recognized for translating complex technical concepts into actionable roadmaps, enabling enterprises to operationalize technology capabilities.  A frequent speaker at many events related to GenAI, Data and Analytics.", "public_name": "Piotr Kobziakowski", "guid": "b4ba3792-4b1c-58f6-acb0-ff31fb102dad", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/GGXRHR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/L9HNUE/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/L9HNUE/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/L9HNUE/resources/Piotr_Kobziakowski_UMxtzNK.pdf", "type": "related"}]}, {"guid": "397b1437-18ca-571b-8e37-e90ed137e632", "code": "JHUSGA", "id": 65586, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/JHUSGA/Entwurf_1_33_uDDYrLU.png", "date": "2025-06-17T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-65586-how-not-to-evaluate-your-rag", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JHUSGA/", "title": "How [not] to evaluate your RAG", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "How do you know if your RAG system is actually working? We\u2019ll share a real-world case study on evaluating RAG in production\u2014tackling messy data, chunking fails, and unexpected chatbot behavior\u2014so you can measure quality with confidence.", "description": "Judging search relevance seems straightforward: the higher a relevant product ranks, the better your search system works. But when it comes to RAG, things get complicated\u2014there\u2019s no ranking, no traditional documents, just an LLM-generated response to a query. So how do you know if it\u2019s any good? Is there an objective way to measure progress, or are you just guessing?\r\n\r\nIn this talk, we\u2019ll share a real (if not exactly glamorous) case study of building and evaluating a production RAG system for a fintech company. We\u2019ll cover the headaches of working with a small and noisy corpus, chunking gone wrong, handling low-resource languages (plus users who think your support chatbot is their therapist), and the different frameworks (like RAGAS) to evaluate a RAG system\u2014so you\u2019re not flying blind.", "recording_license": "", "do_not_record": false, "persons": [{"code": "NGCSXL", "name": "Roman Grebennikov", "avatar": "https://program.berlinbuzzwords.de/media/avatars/NGCSXL_fa3ugxj.png", "biography": "A principal ML engineer and an ex startup CTO working on modern search and recommendations problems. A pragmatic fan of open-source software, functional programming, LLMs and performance engineering.", "public_name": "Roman Grebennikov", "guid": "45e06948-5b1f-55ba-a894-653eb73a9f1e", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/NGCSXL/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JHUSGA/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/JHUSGA/", "attachments": []}, {"guid": "753d67c7-e5e4-52da-af55-dbbbee5d94e2", "code": "DYUCFN", "id": 64475, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/DYUCFN/Entwurf_1_45_qf2yvjX.png", "date": "2025-06-17T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-64475-why-chatbots-still-fail-the-hidden-pitfalls-of-rag", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DYUCFN/", "title": "Why Chatbots Still Fail: The Hidden Pitfalls of RAG", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Many knowledge chatbots and search engines use RAG. Despite their popularity, these chatbots are often worse than ChatGPT and frustrate users by failing to answer even the simplest questions. In my talk, I reveal how ineffective chunking strategies are a key culprit and demonstrate how to refine chunking to build more reliable RAG systems.", "description": "Large Language Models (LLMs) have experienced a significant boom. Among the most popular use cases are intelligent knowledge search engines, or put simply - chatbots. Whether on an airline's website, facing customers, or as the new search tool in your company's intranet, chatbots are everywhere. However many applications fall short of expectations. The system used behind many knowledge applications is called Retrieval Augmented Generation (RAG), in which a specific database is connected with an LLM. Many strategies have emerged to enhance RAG performance, but the core is often overlooked\u2014the data itself. In my presentation I will explain RAG as the foundation of intelligent knowledge applications, its pitfalls and caveats. Using a RAG's first step - chunking - as an example, I show what is necessary to improve the reliability and robustness of RAG systems and what you absolutely have to do before you can trust your own chatbot.", "recording_license": "", "do_not_record": false, "persons": [{"code": "PHBCKN", "name": "Lewin von Saldern", "avatar": "https://program.berlinbuzzwords.de/media/avatars/PHBCKN_pAitAqg.jpg", "biography": "Entrepreneur in the AI space, ex-McKinsey", "public_name": "Lewin von Saldern", "guid": "ae2131c0-ab12-568a-8b63-6cd46da56f05", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/PHBCKN/"}, {"code": "7WCSVU", "name": "Jennifer Gaubatz", "avatar": "https://program.berlinbuzzwords.de/media/avatars/7WCSVU_eyO8MlF.png", "biography": "Entrepreneur in the AI Space, ex-McKinsey, Medical Doctor", "public_name": "Jennifer Gaubatz", "guid": "22243efe-33b7-5280-839c-63918953752c", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/7WCSVU/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DYUCFN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/DYUCFN/", "attachments": []}, {"guid": "8212a3f6-c128-54ed-aca9-1a139234e281", "code": "EHWVZJ", "id": 65357, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/EHWVZJ/Entwurf_1_49_EiLw86R.png", "date": "2025-06-17T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-65357-visual-literacy-complex-document-retrieval-with-vlms", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EHWVZJ/", "title": "Visual Literacy: Complex Document Retrieval with VLMs", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Traditional document retrieval systems struggle with visually rich documents as they discard visual elements during text extraction. This talk shows how vision language models (VLMs) can address these limitations and presents a new benchmark for evaluating document retrieval systems across languages, domains, and document types.", "description": "The field of document retrieval has traditionally relied on text-based approaches, which have served well for simple text documents but show significant limitations when dealing with visually complex documents. Many real-world documents contain crucial information embedded in diagrams, charts, plots, tables, and intricate layouts that conventional systems fail to properly process. Thus, if we query these systems with information that is only included in visual elements (for example, \"How much did the average temperature in Germany increase from 1990 to 2025?\"), they will fail to retrieve relevant documents even if they contain plots or charts with the exact answer.\r\n\r\nVision Language Models (VLMs) offer a new way to approach document retrieval. By processing both text and visual elements together, these models can better understand documents as a whole, seeing how text works together with graphics and layout. This is especially useful for technical documents, research papers, financial reports, and educational materials where images and diagrams are key to understanding the content.\r\n\r\nIn this talk, we will explore how VLMs can be effectively applied to document retrieval tasks. We'll explain how to fine-tune these models for handling complex documents, including important considerations for data preparation, model architecture choices, and training strategies. We'll also present a new benchmark for testing document retrieval systems across different languages, domains, and document types. This benchmark provides a framework for comparing traditional and VLM-based retrieval systems, enabling practitioners to make informed decisions for their specific use cases.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DXVXQJ", "name": "Saba Sturua", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DXVXQJ_99lJUti.jpg", "biography": "Saba is an ML Research Engineer in the Model Training team at Jina AI, where he develops state-of-the-art text and multimodal embedding models, focusing on enhancing search capabilities.", "public_name": "Saba Sturua", "guid": "a6f0c60d-9760-5e9a-8f74-527fe22e8739", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/DXVXQJ/"}, {"code": "JRWXZH", "name": "Isabelle Mohr", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JRWXZH_SeabK7J.jpg", "biography": "Isabelle is a Machine Learning Engineer at Jina AI, where she develops and trains embedding models, working closely with her team to push the boundaries of what\u2019s possible. Passionate about knowledge sharing, she regularly gives talks on machine learning and NLP, inspiring and connecting with others in the field.", "public_name": "Isabelle Mohr", "guid": "9245bd73-b981-5d32-9819-7fb620038c3f", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/JRWXZH/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EHWVZJ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EHWVZJ/", "attachments": []}, {"guid": "c3e0a257-5b52-5bab-9461-77f756cd8c39", "code": "8JNKJC", "id": 65250, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/8JNKJC/Entwurf_1_59_wQr2hrY.png", "date": "2025-06-17T16:00:00+02:00", "start": "16:00", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz25-65250-delay-accounting-an-underrated-feature-of-the-linux-kernel", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/8JNKJC/", "title": "Delay accounting: an underrated feature of the Linux kernel", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "This talk delves into delay accounting, an often-overlooked feature that provides valuable insights into CPU time shortages and application latency. Attendees will learn how to leverage these kernel metrics for better performance analysis and system optimization.", "description": "Understanding whether a process is truly starved of CPU time isn\u2019t as simple as looking at traditional metrics like CPU usage or Load Average. Few realize that the Linux kernel has built-in mechanisms to precisely measure how long each task waits for kernel resources. This talk delves into delay accounting, an often-overlooked feature that provides valuable insights into CPU time shortages and application latency. Attendees will learn how to leverage these kernel metrics for better performance analysis and system optimization.", "recording_license": "", "do_not_record": false, "persons": [{"code": "FFZVXN", "name": "Nikolay Sivko", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FFZVXN_or6k2lU.png", "biography": "Nikolay Sivko, co-founder and CEO of Coroot, aims to simplify troubleshooting in production for developers. He is passionate about Site Reliability Engineering practices, observability, and open source. Previously, he was the head of the Engineering group at a large technology company and founded an observability tool development company in Russia, which he successfully acquired. Currently, he resides in Turkey, focusing on developing a startup with an international market orientation.", "public_name": "Nikolay Sivko", "guid": "7bc94834-e914-5eb6-a46a-a498caad54ce", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/FFZVXN/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/8JNKJC/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/8JNKJC/", "attachments": []}, {"guid": "05063e2d-0b21-59ef-b199-10d822f32e12", "code": "EZYSGS", "id": 60038, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/EZYSGS/Entwurf_1_37_9WwYDhh.png", "date": "2025-06-17T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz25-60038-advancing-multi-modal-search-capabilities-in-search-pipeline", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EZYSGS/", "title": "Advancing Multi-Modal Search Capabilities in Search Pipeline", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Exploring the integration of machine learning inference processors in OpenSearch pipelines, focusing on multi-modal search capabilities, we demonstrate how these processors enhance ingest, search request, and response processes for text, image, and audio data, significantly improving search and analytical capabilities in multi-modalities worlds.", "description": "The integration of machine learning (ML) inference processors within search pipeline architecture represents a significant advancement in search and analytics technology in OpenSearch. This presentation delves into the implementation and impact of these processors across three critical stages: ingest, search request, and search response.\r\n\r\nWe begin by examining the ML inference ingest processor, which allows for real-time enrichment of data as it enters the system. This processor can generate embeddings, classify content, or extract features from various data types, including text, images, and audio. We'll demonstrate how this enhances data quality and searchability from the point of ingestion.\r\n\r\nNext, we explore the ML inference search request processor, which dynamically modifies search queries based on ML model outputs. This powerful feature enables context-aware query expansion, semantic understanding, and even cross-modal query translation. For instance, we'll show how a text query can be used to search for relevant images or how an audio input can be transformed into a text-based search.\r\n\r\nThe ML inference search response processor is then discussed, highlighting its ability to rerank, filter, or augment search results using ML models. This can significantly improve result relevance, especially in multi-modal scenarios where traditional ranking algorithms may fall short.\r\n\r\nThroughout the presentation, we'll showcase practical examples of these processors in action, demonstrating their application in various use cases such as:\r\n\r\nVisual similarity search in e-commerce catalogs\r\nAudio transcription and searchability in media archives\r\nCross-lingual document retrieval in multilingual databases\r\nSentiment-based filtering in social media analytics\r\n\r\nWe'll also address the technical considerations of implementing these processors, including model selection, performance optimization, and scalability concerns. The presentation will touch upon the flexibility of using both locally hosted and externally connected ML models, allowing organizations to leverage AI capabilities within their search infrastructure.\r\n\r\nFinally, we'll discuss the future potential of this technology, including the possibility of more advanced multi-modal interactions, real-time learning models, and the integration of large language models for even more sophisticated search and analytics capabilities.\r\n\r\nThis presentation aims to provide attendees with a comprehensive understanding of how ML inference processors can revolutionize multi-modal search in OpenSearch, offering insights into both the current state of the technology and its future directions.", "recording_license": "", "do_not_record": false, "persons": [{"code": "PTHZGQ", "name": "Dhrubo Saha", "avatar": "https://program.berlinbuzzwords.de/media/avatars/PTHZGQ_s05TEaZ.png", "biography": "Dhrubo Saha is a machine learning engineer at Amazon Web Services (AWS) interested in machine learning algorithms, large language models, and distributed systems.", "public_name": "Dhrubo Saha", "guid": "93d456fc-66dc-5884-a5f6-f1917d0efcd0", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/PTHZGQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EZYSGS/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/EZYSGS/", "attachments": []}], "Frannz Salon": [{"guid": "82aa1f26-a76a-5c4a-8b6f-c51ac384c6f7", "code": "B9MTTQ", "id": 65548, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/B9MTTQ/Entwurf_1_38_ydCUQkE.png", "date": "2025-06-17T09:30:00+02:00", "start": "09:30", "duration": "01:10", "room": "Frannz Salon", "slug": "bbuzz25-65548-more-than-just-the-tip-of-the-iceberg", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/B9MTTQ/", "title": "More Than Just The Tip Of The Iceberg", "subtitle": "", "track": null, "type": "Workshop", "language": "en", "abstract": "A comprehensive workshop in which you will gain practical knowledge about how to deploy, configure, interact with and use advanced features of Apache Iceberg. Presented using a local coding environment based on Jupyter notebooks and a Docker Compose stack.", "description": "In recent years, several table formats for large datasets have emerged to help data engineers deal with complexity of handling substantial amounts of data in a flexible, performant and safe way. One of the most popular among those formats is Apache Iceberg.\r\n\r\nIn this workshop, you will gain up-to-date, hands-on experience on how to work with Iceberg. Using a local coding environment based on Jupyter Notebooks and a Docker Compose stack, you are going to:\r\n\r\n1. Learn about required components of a data processing system that uses Iceberg.\r\n2. Practice examples of how to update and query Iceberg using several query engines and libraries.\r\n3. Use advanced features of Iceberg, like flexible partitioning scheme, time travel or dataset branching.\r\n4. Learn about optimisation techniques and configuration \"levers\" you can pull to improve the overall performance and query speed of workloads using Iceberg.\r\n5. Peek under the hood of an Iceberg dataset, to understand its metadata and ways it improves query speed and supports data audits and lineage.\r\n\r\nThis workshop is recommended for Data Engineers, Analytics Engineers and Machine Learning Engineers wanting to improve their data pipelines and data processing workflows.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BM87Y8", "name": "Michal Gancarski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BM87Y8_pH9MSkG.jpeg", "biography": "Michal Gancarski is a software and data engineer with over ten years of experience gained freelancing, working for startups and at Zalando, where he helped to build some of the core components of the company's data lake. Currently employed as a staff data engineer for GROPYUS Technologies GmbH, he focuses on knowledge graphs and RDF datasets, helping the company disrupt and optimise the residential construction industry.\r\n\r\nIn addition to that, Michal is an Apache Iceberg instructor with video and live trainings on this table format published on the O'Reilly learning platform.\r\n\r\nMichal holds a degree in Mathematics and Economics, as well as a graduate diploma in Data Science, both completed at the University Of London, under the academic direction of the London School of Economics.", "public_name": "Michal Gancarski", "guid": "24232ea5-72f2-5687-81a5-fefb17100c8b", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/BM87Y8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/B9MTTQ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/B9MTTQ/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/B9MTTQ/resources/Michal_Gancarski_M_ZcKmCv5.pdf", "type": "related"}]}, {"guid": "c7620b06-237f-5899-bb94-a5990523cffc", "code": "VPXKLT", "id": 65267, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/VPXKLT/Entwurf_1_16_TZSWhir.png", "date": "2025-06-17T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-65267-flavors-of-postgresql-and-you-how-to-choose-a-postgres", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/VPXKLT/", "title": "Flavors of PostgreSQL\u00ae and you: how to choose a Postgres", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Postgres continues to be widely used, and Postgres-derived closed source databases such as AlloyDB and AWS Aurora and have gained popularity in recent years.  In this talk, you\u2019ll learn about the architecture of these radically different kinds of systems, what each of these companies means when they say \u201cPostgres-comptatible\u201d and how to choose one!", "description": "Who this is for: \r\nThis talk is ideally suited for engineers looking to either migrate an existing database to something new, or those wanting an overview of the Postgres-derived database landscape.\r\n\r\nRelevance: \r\nNearly all the major cloud computing providers provide some sort of \u201cPostgres-compatible\u201d relational database service, but the choice isn\u2019t as simple as picking whichever one your cloud provider offers. Some provide deep integration for AI/ML workloads, and others are serverless databases that aren\u2019t Postgres-related at all. Combined with low awareness of more recent additions to open source Postgres\u2019 feature set, many developers aren\u2019t sure how to proceed in the Postgres in a way that best reflects their needs. \r\n\r\nTalk outline: \r\n- Intro: What makes Postgres Postgres-y? How has the open source community dealt with forks, rewrites and extensions over time, and how is that relevant to our discussion of \u2018modern\u2019 Postgres-derived databases? \r\n- The meat: comparing and contrasting various Postgres-derived databases, understanding their feature sets, what makes them unique and what use cases they\u2019re particularly well suited for\r\n    - Google\u2019s AlloyDB Omni, AI/ML capabilities and columnar engines\r\n    - Amazon Aurora and Neon, both serverless Postgres-compatible databases, and what we mean by \"Postgres compatible' \r\n    - TimescaleDB, PostGIS and other specialized extensions of Postgres, and why open source is cool and allows for infinite extensibility\r\n    - And of course open source Postgres, and what makes its most recent features relevant in 2025\r\n\r\nConclusion: \r\nHow the open source nature of Postgres has led to its continued evolution and relevancy in the data landscape, allowing it to evolve to meet new use cases like realtime data analytics and AI/ML. \r\n\r\nWhat the audience will learn: \r\nThe feature sets of a variety of Postgres alternatives, what features are best suited for certain use cases, how some of those features (for instance, AlloyDB\u2019s columnar engine) stack up against databases dedicated to those features (for instance, vs. ClickHouse for columnar data), and how open source project licensing affects the creation of all these new alternatives.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KVW9PZ", "name": "Celeste Horgan", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KVW9PZ_81Wgoiy.png", "biography": "Celeste is a Developer Educator at Aiven, a managed database services company heavily invested in the PostgreSQL ecosystem. She has been involved in open source software as a technical writer and contributor for the Kubernetes project since 2020, and has had her work on inclusive language in tech featured in the New York Times.", "public_name": "Celeste Horgan", "guid": "10bff601-53ec-5708-bde9-89c09049598a", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/KVW9PZ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/VPXKLT/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/VPXKLT/", "attachments": []}, {"guid": "a6730750-d1c1-5aa6-bf5f-baeab88e4265", "code": "PNQJZQ", "id": 65293, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/PNQJZQ/Entwurf_1_22_pnbKA1m.png", "date": "2025-06-17T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-65293-data-quality-management-the-good-the-bad-and-the-messy", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/PNQJZQ/", "title": "Data Quality Management: The Good, The Bad, and The Messy", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Not all data is good\u2014some is bad, and much is messy. Poor data quality affects customers, employees, and decisions. This session traces issues from symptoms to root causes and explores strategies to fix them. Managing data quality is like battling a seven-headed beast, but with the right approach, you can turn chaos into clarity.", "description": "We all recognize that high-quality data is essential for driving value in analytics, AI, and business operations. Yet, in reality, not all data is good\u2014some is bad, and much of it is just plain messy. While organizations acknowledge the importance of data quality, choosing the right approach to improve it remains a challenge. How can you systematically turn messy, unreliable data into a trustworthy asset?\r\n\r\nIn this session, we take a fresh approach to data quality management. Instead of tackling issues in isolation, we start downstream\u2014examining the real-world symptoms of poor data quality as experienced by customers and employees. From there, we trace problems back to their upstream root causes and explore practical solutions to address them. However, resolving data quality issues is rarely straightforward\u2014it\u2019s like battling a seven-headed beast, where fixing one issue often reveals several others. To tackle this effectively, we introduce data quality management strategies tailored to different levels of organizational maturity.\r\n\r\nWhat You\u2019ll Learn:\r\n\u2705 The Data Quality Triangle: Symptoms, Root Causes, and Solutions\r\n\u2705 Why solving data quality issues feels like battling a seven-headed beast\r\n\u2705 Practical data quality management strategies for different maturity levels", "recording_license": "", "do_not_record": false, "persons": [{"code": "SSTGVM", "name": "Jan Meskens", "avatar": "https://program.berlinbuzzwords.de/media/avatars/SSTGVM_RT635DL.png", "biography": "Jan Meskens is a seasoned data consultant with over a decade of experience in various data consulting roles. Through his consulting firm, Sievax, Jan has been pivotal in helping companies successfully integrate and implement data-driven strategies.\r\n\r\nIn academia, Jan shares his expertise with students at University College, where he teaches courses focused on artificial intelligence and data-centric topics. Beyond his consultancy and teaching, he actively contributes to the broader data community by writing insightful articles\r\non Medium and presenting on data-related subjects at numerous conferences, meetups, and workshops.\r\n\r\nHolding a PhD in Human-Computer Interaction, Jan brings a unique perspective to the fields of data and artificial intelligence. His guiding principle is clear: making data usable and understandable for everyone within an organization leads to valuable insights and\r\noutcomes.", "public_name": "Jan Meskens", "guid": "372d6ccd-9d20-56ae-8a93-a9112481d64b", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/SSTGVM/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/PNQJZQ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/PNQJZQ/", "attachments": [{"title": "Slides", "url": "/media/bbuzz25/submissions/PNQJZQ/resources/Jan_Meskens_-_Data_7etFNzK.pdf", "type": "related"}]}, {"guid": "3dd38de0-e7fd-5ad2-9eb1-1179fb347cba", "code": "KKCRNG", "id": 59390, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/KKCRNG/Entwurf_1_35_jNyldvx.png", "date": "2025-06-17T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-59390-analysing-public-kafka-data-from-nasa-satellites", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/KKCRNG/", "title": "Analysing Public Kafka Data from NASA Satellites", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This session builds on the foundational OSS technologies of the modern Lakehouse\u2014Apache Kafka, Spark, Unity Catalog and MLFlow\u2014and shows how everyone can analyze supernova data coming from NASA's satellites and analyze data streams with natural language and plot their own map cosmic events.", "description": "Experience how cosmic events become streaming data in this tech-focused demo, running on the Databricks Lakehouse. Using foundational OSS technologies (Apache Kafka, Apache Spark\u2122, Unity Catalog, MLflow), we'll capture and analyze supernova data streams in real time. While this is a pure tech talk with reusable open-source code, you'll naturally grasp unified lakehouse concepts along the way.", "recording_license": "", "do_not_record": false, "persons": [{"code": "SWN3QG", "name": "Frank Munz", "avatar": "https://program.berlinbuzzwords.de/media/avatars/SWN3QG_vcD98KG.jpg", "biography": "Frank Munz solves large-scale data and AI challenges at Databricks. He authored three computer science books, built up technical evangelism for Amazon Web Services in Germany, Austria, and Switzerland, and once upon a time worked as a data scientist with a group that won a Nobel prize.\r\n\r\nFrank has presented at top-notch conferences on every continent (except Antarctica, due to its inhospitable climate). His speaking engagements include Devoxx, Kubecon, and Java One.\r\n\r\nHe is renowned for his world-class demos, which often showcase innovative and interactive applications of technology. Some notable examples include:\r\n\r\n* Once Frank spit into a test tube, got his DNA analyzed and shared it with attendees using OSS Delta Sharing to let them explore his personal coffee metabolism snip. \r\n* Last year at the Data+AI Summit, Frank created a crowd-sourced distributed earthquake detection system that ingested streaming data from 250 attendees\u2019 phone motion sensors at a rate of 100 million IoT events per day.\r\n\r\nHe holds a Ph.D. with summa cum laude in Computer Science from TU Munich where he worked on Supercomputing in brain research (a system that allows better diagnosis for children with epilepsy possibly undergoing brain surgery)", "public_name": "Frank Munz", "guid": "2739bf9a-a43e-5600-afde-cec4822e520f", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/SWN3QG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/KKCRNG/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/KKCRNG/", "attachments": []}, {"guid": "26b4a3e8-6fc2-5d1c-828c-cc6707f8a673", "code": "UDDJ7T", "id": 65582, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/UDDJ7T/Entwurf_1_DX8hhH0.png", "date": "2025-06-17T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-65582-all-the-dataops-all-the-paradigms", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UDDJ7T/", "title": "All the DataOps, all the paradigms", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Data warehouses, lakes, lakehouses, streams, fabrics, hubs, vaults, and meshes. We sometimes choose deliberately, sometimes influenced by trends, yet often get an organic blend. But the choices have orders of magnitude in impact on operations cost and iteration speed. Let's dissect the paradigms and their operational aspects once and for all.", "description": "I have seen dozens of data platforms and noticed how architectural choices are often made without regarding the operational consequences, resulting in excessive operational burden and slow development. These choices have huge impact on effectiveness of data-centric organisations and separate disruptive companies from legacy enterprises. I will explain how the common operational procedures \u2013 deployment, failure handling, late data, data quality problems, bug remediation \u2013 have different impact depending on data processing paradigm, and how to handle them with minimal cost and latency where possible. I will also cover when and how to bridge between the paradigms. I will finally share some innovations that we have discovered further improves development iteration speed and operational efficiency.\r\n\r\nI have found that the distinction between different data processing paradigms is often not clear, and that their differences in practice is not concisely explained anywhere. This presentation is an attempt to create that explanation.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HUXQYG", "name": "Lars Albertsson", "avatar": "https://program.berlinbuzzwords.de/media/avatars/HUXQYG_ku8U5xI.jpg", "biography": "Lars Albertsson is the founder of Scling, a data engineering startup based in Stockholm. Scling provides customer tailored data engineering, analytics, and artificial intelligence implementations. Lars is a frequent conference speaker on data engineering and data strategy. Before founding Scling, Lars has worked at Google, Spotify, Schibsted, and as an independent consultant, helping organisations create business value from data processing and AI.", "public_name": "Lars Albertsson", "guid": "703b40f4-9485-5763-a724-c7863aefe36f", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/HUXQYG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UDDJ7T/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/UDDJ7T/", "attachments": []}, {"guid": "1f2db93d-93a2-5620-afa6-ec7f75beb769", "code": "WFWTFL", "id": 66184, "logo": "https://program.berlinbuzzwords.de/media/bbuzz25/submissions/WFWTFL/Entwurf_1_18_MEGZKXQ.png", "date": "2025-06-17T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz25-66184-how-to-train-a-fast-llm-for-coding-tasks", "url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WFWTFL/", "title": "How to train a fast LLM for coding tasks", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Coding LLMs are now part of our daily work, making coding easier. In this talk, we share how we built an in-house LLM for AI code completion in JetBrains products, covering design choices, data preparation, training, and model\u2019s evaluation.", "description": "In this talk, we present our approach to training a code completion model using Mellum, our new open-source model, as an example. Mellum powers in-file code completion in AI-enabled JetBrains IDEs. We'll walk through the entire process, from designing the model and preparing the dataset \u2014 with emphasis on the permissiveness of using data \u2014 to the training process and evaluation strategies. Attendees will gain insights into state-of-the-art techniques and the challenges we faced and discover practical approaches to optimizing AI models for real-world coding environments. This talk is relevant for developers and ML Engineers interested in ML feature development and custom model training.", "recording_license": "", "do_not_record": false, "persons": [{"code": "RP87NY", "name": "Ivan Dolgov", "avatar": "https://program.berlinbuzzwords.de/media/avatars/RP87NY_kFMDurL.jpeg", "biography": "Senior MLE@JetBrains\r\n\r\n- Training models which write code-related things", "public_name": "Ivan Dolgov", "guid": "83f45357-8e2b-5333-8960-bd2c3a5c09d5", "url": "https://program.berlinbuzzwords.de/bbuzz25/speaker/RP87NY/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WFWTFL/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz25/talk/WFWTFL/", "attachments": []}]}}]}}}