{"$schema": "https://c3voc.de/schedule/schema.json", "generator": {"name": "pretalx", "version": "2026.1.1"}, "schedule": {"url": "https://program.berlinbuzzwords.de/bbuzz26/schedule/", "version": "0.3", "base_url": "https://program.berlinbuzzwords.de", "conference": {"acronym": "bbuzz26", "title": "Berlin Buzzwords 2026", "start": "2026-06-07", "end": "2026-06-09", "daysCount": 3, "timeslot_duration": "00:05", "time_zone_name": "Europe/Berlin", "colors": {"primary": "#3d3182"}, "rooms": [{"name": "Kesselhaus", "slug": "4656-kesselhaus", "guid": "b2d7567f-1142-58dd-aabe-79c430a1b795", "description": null, "capacity": null}, {"name": "Maschinenhaus", "slug": "4657-maschinenhaus", "guid": "feaeb78e-bed3-5efe-a10e-2e84442f0fc7", "description": null, "capacity": null}, {"name": "Palais Atelier", "slug": "4658-palais-atelier", "guid": "fdb7f58b-e7b7-5f38-b6c8-73521465941c", "description": null, "capacity": null}, {"name": "Frannz Salon", "slug": "4659-frannz-salon", "guid": "e1f82bb9-38e1-5c6e-bfa2-43e633286c38", "description": null, "capacity": null}], "tracks": [], "days": [{"index": 1, "date": "2026-06-07", "day_start": "2026-06-07T04:00:00+02:00", "day_end": "2026-06-08T03:59:00+02:00", "rooms": {"Palais Atelier": [{"guid": "f6c16018-2a2e-5583-8c16-04815c1c1a03", "code": "NUWNJU", "id": 96937, "logo": null, "date": "2026-06-07T14:30:00+02:00", "start": "14:30", "duration": "03:00", "room": "Palais Atelier", "slug": "bbuzz26-96937-barcamp", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/NUWNJU/", "title": "Barcamp", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Barcamps are informal sessions, a kind of \"un-conference\", with a schedule decided on the day. It is all driven by the interests and expertise of those who attend so each one is different, but ours are always great!", "description": "Although the barcamp doesn't have a strict schedule, it won't be completely devoid of structure! 
#bbuzz barcamps are dynamic events, focused on the overall Berlin Buzzwords topics, tackling the same challenges but in a different format. At the barcamp each session runs for 30 minutes giving enough time to get into the meat of a topic, but without a chance of anyone getting bored. These are participatory sessions and more inclusive than regular conference talks, with everyone taking part. You can help by leading the session, by giving some insights, by asking some great questions, or maybe just with your enthusiasm.\r\n\r\nThe barcamp will be coordinated and moderated by Nick Burch.\r\n\r\nRegistration starts from 2:30pm", "recording_license": "", "do_not_record": false, "persons": [{"code": "97HYST", "name": "Nick Burch", "avatar": "https://program.berlinbuzzwords.de/media/avatars/97HYST_759PqjE.webp", "biography": "Nick has been involved in Open Source for longer than he cares to remember. He leads the Engineering team at Saible, which is trying to ensure that people working in construction projects actually get paid. 
That involved some hard technical challenges, lots of integrations, and rather a lot of cat-herding!", "public_name": "Nick Burch", "guid": "02fe34a8-176c-520f-a723-b897478d00b2", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/97HYST/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/NUWNJU/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/NUWNJU/", "attachments": []}]}}, {"index": 2, "date": "2026-06-08", "day_start": "2026-06-08T04:00:00+02:00", "day_end": "2026-06-09T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "536b7e80-c5e8-5617-a8e6-48cb3a587fd9", "code": "7X787J", "id": 96932, "logo": null, "date": "2026-06-08T09:30:00+02:00", "start": "09:30", "duration": "00:05", "room": "Kesselhaus", "slug": "bbuzz26-96932-opening-session", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7X787J/", "title": "Opening Session", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Join us as we kick off Berlin Buzzwords 2026!", "description": "-", "recording_license": "", "do_not_record": false, "persons": [{"code": "SKTAV7", "name": "Paul Berschick", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MN8NMT_i2ySCF9.webp", "biography": "Paul was first involved in the organization of Berlin Buzzwords as an intern in 2015 and has been a part of the team ever since. He's now the managing director of Plain Schwarz and together with his team also organizes events like FOSS Backstage or Scala Days. 
\r\nPaul describes himself as a Free and Open Source Software enthusiast and in his spare time you will find him listening to cricket on the radio or deeply immersed in a good book \u2013 sometimes even both.", "public_name": "Paul Berschick", "guid": "007b1a1d-5bb1-5af8-a5e7-067a25a47035", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/SKTAV7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7X787J/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7X787J/", "attachments": []}, {"guid": "38f5d5d6-41c5-5387-a5cb-c38d3baae697", "code": "TMP3LK", "id": 95358, "logo": null, "date": "2026-06-08T09:35:00+02:00", "start": "09:35", "duration": "00:45", "room": "Kesselhaus", "slug": "bbuzz26-95358-building-resilience-the-next-decade-of-open-source", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TMP3LK/", "title": "Building Resilience: The Next Decade of Open Source", "subtitle": "", "track": null, "type": "Keynote", "language": "en", "abstract": "Over 25 years, open source has become vital digital infrastructure. However, its future relies on human resilience, not just code. To combat burnout, funding gaps, and new regulations, we must move beyond old methods and address sustainability through global policy, security, and community health.", "description": "Quietly over the course of 25 years, open source software evolved from a domain perceived as that of only hobbyists into the invaluable backbone of modern digital infrastructure. Sustaining that success for the future will require more than code. It requires resilience: a trait not of technology, but of people. Of community. With increasing regulation around the world, evolving cybersecurity requirements, burnt out contributors, and stagnant corporate participation and funding, how do we ensure the ecosystem's continued success? The things that have worked for the first decades will not be the things that keep us going. 
Let's look together at sustainability not only as a funding problem, but also from the perspectives of global policy changes, security, and other intertwined issues that face open source in the coming years.", "recording_license": "", "do_not_record": false, "persons": [{"code": "QXYN9Z", "name": "Ruth Suehle", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JTUF7K_8o6iO7K.webp", "biography": "Ruth Suehle is Director of Open Source at SAS, where she is building a nearly 50-year-old analytics, data management, and AI software company\u2019s first open source program office. She is also president of the Apache Software Foundation and a member of the Open Source Initiative (OSI) board of directors. Ruth has helped build open source communities for nearly two decades, much of which she spent in the OSPO at Red Hat. Co-author of Raspberry Pi Hacks (O\u2019Reilly, December 2013) and former editor of Red Hat Magazine and opensource.com, she is a frequent writer, currently as core contributor at GeekMom.com(previously of WIRED), where she covers the adventures of motherhood and fandom.", "public_name": "Ruth Suehle", "guid": "755ae7bd-c33b-5aea-95ac-b867f03f03ca", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/QXYN9Z/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TMP3LK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TMP3LK/", "attachments": []}, {"guid": "edf61342-0fe9-5de9-b2f2-9daee0a96271", "code": "KTRN8U", "id": 90614, "logo": null, "date": "2026-06-08T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz26-90614-low-resource-languages-as-stress-tests-for-nlp-data", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/KTRN8U/", "title": "Low-Resource Languages as Stress Tests for NLP Data", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Low-resource languages expose weaknesses in NLP systems that are 
often hidden by benchmark data. Drawing on experience annotating fieldwork data, this talk shows how ambiguity and annotation decisions reveal fundamental data quality issues relevant to real-world NLP pipelines.", "description": "This talk is an experience report on annotating language data in a low-resource setting and what this process reveals about data quality in NLP pipelines. Rather than treating low-resource languages as edge cases, the talk frames them as stress tests that make structural data issues visible early and clearly.\r\n\r\nThe session outlines what linguistic fieldwork data looks like before it becomes \u201ctraining data,\u201d highlighting ambiguity, context dependence, and variation that cannot always be resolved through additional labeling. It then focuses on the annotation decisions required when categories are underspecified or multiple analyses are plausible, and connects these challenges to familiar issues in applied NLP, such as label noise, brittle representations, and unexpected model behavior.\r\n\r\nThe goal is to share practical lessons from linguistic data work that help NLP practitioners reason more realistically about annotation, uncertainty, and robustness. Attendees will gain concrete insights into why \u201cclean data\u201d is often an illusion and how early data decisions shape downstream systems.", "recording_license": "", "do_not_record": false, "persons": [{"code": "RVV3BF", "name": "Priscilla Lola Adenuga", "avatar": "https://program.berlinbuzzwords.de/media/avatars/RVV3BF_lhf6mmf.webp", "biography": "Priscilla Lola Adenuga works with language data at the intersection of linguistics and NLP. Her background is in syntactic analysis and linguistic fieldwork, with hands-on experience annotating low-resource language data. 
She is interested in data quality, annotation practices, and how insights from linguistics can inform more robust and realistic NLP systems.", "public_name": "Priscilla Lola Adenuga", "guid": "e0d5d46e-49bc-5440-87da-7534f343bf98", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/RVV3BF/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/KTRN8U/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/KTRN8U/", "attachments": []}, {"guid": "1c863a11-2608-5e9c-bc1d-92947d18377d", "code": "ANZCWN", "id": 91237, "logo": null, "date": "2026-06-08T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91237-dynamic-broker-side-filtering-for-kafka", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ANZCWN/", "title": "Dynamic Broker-Side Filtering for Kafka", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "KAFKA-6020 has been open for 7 years. This talk demos broker-side filtering for Kafka with sub-millisecond latency (p99 < 25ms). Live demo with working code shows how it reduces network costs, simplifies consumers, and enables new use cases. Real-world validation from financial services and logistics deployments.", "description": "Watch a live implementation of broker-side filtering that solves a 7-year-old debate. You'll see working code, performance benchmarks, and real production deployments from financial services and logistics. Leave with a Kafka-compatible solution you can deploy immediately on StreamNative's platform with sub-millisecond filtering that cuts network traffic by 60-80%. 
Perfect for real-time analytics and compliance monitoring.", "recording_license": "", "do_not_record": false, "persons": [{"code": "Z3K83Y", "name": "David Kjerrumgaard", "avatar": "https://program.berlinbuzzwords.de/media/avatars/Z3K83Y_gvDM08F.webp", "biography": "David Kjerrumgaard is a Developer Advocate at StreamNative and a committer on the Apache Pulsar project. He is recognized for his expertise in real-time data streaming, messaging systems, and big data technologies. As author of Pulsar in Action and co-author of Practical Hive, he has established himself as a leading voice in the streaming data ecosystem.\r\n\r\nAn accomplished international speaker, David presents at conferences worldwide on big data, streaming technologies, and agentic AI. His technical contributions extend beyond the stage\u2014he actively contributes to Apache NiFi and maintains his committer status on Apache Pulsar, directly advancing these open-source platforms.", "public_name": "David Kjerrumgaard", "guid": "079e56a3-6122-5876-9c37-9126212fbc01", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/Z3K83Y/"}, {"code": "FZKCJS", "name": "\u00c1lvaro Rodr\u00edguez", "avatar": "https://program.berlinbuzzwords.de/media/avatars/R9WMKJ_hGpmJHG.webp", "biography": "I am a Spaniard customer-oriented engineer living in Switzerland. 
Working on StreamNative since 2022 as Solutions Engineer.\r\n\r\nI have 20 years of experience working at different levels, from C++ developer under Linux to security consultant.\r\n\r\nIn a previous life, I did a Master's in Neuroscience.", "public_name": "\u00c1lvaro Rodr\u00edguez", "guid": "0a9c2689-e27e-58fc-a78b-349b53f7955b", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/FZKCJS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ANZCWN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ANZCWN/", "attachments": []}, {"guid": "55a044bc-cc88-5dc2-af9e-7f7bd176b7f4", "code": "XULYFE", "id": 91508, "logo": null, "date": "2026-06-08T12:00:00+02:00", "start": "12:00", "duration": "00:45", "room": "Kesselhaus", "slug": "bbuzz26-91508-the-agent-era-how-ai-agents-are-reshaping-data-platforms", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XULYFE/", "title": "The Agent Era: How AI Agents Are Reshaping Data Platforms", "subtitle": "", "track": null, "type": "Panel", "language": "en", "abstract": "AI agents have quietly become some of the most demanding users of modern data platforms and most weren't built with them in mind. In this panel, leaders from Snowflake, Elastic, ClickHouse, and Xata share what agentic workloads actually look like in production: what broke, what had to be rebuilt, and where the architecture is heading.", "description": "Autonomous AI agents are becoming first-class users of data infrastructure and most data platforms weren't designed for them. This panel brings together engineers from Snowflake, Elastic, ClickHouse, and Xata to have an honest conversation about what that collision looks like in practice.\r\nEach platform brings a different angle: cloud-scale warehousing, search and observability, real-time analytics, and Postgres. 
They'll explore what it concretely means to make a data platform agent-ready, from query reliability to access control to the performance characteristics that agentic loops require.\r\n\r\nNote: I confirmed with a few guests, but once the panel is approved, I can confirm with other leaders from data platforms to join the panel.  \r\nIf we can schedule it on Monday, the Head of devrel from Elastic is able to join.", "recording_license": "", "do_not_record": false, "persons": [{"code": "7WXDEG", "name": "Monica Sarbu", "avatar": "https://program.berlinbuzzwords.de/media/avatars/7WXDEG_ZaCeqaY.webp", "biography": "She is the founder and CEO of Xata, a Postgres platform for modern development, backed by Index Ventures and the founders of Elastic, Confluent, Vercel, and Netlify. Before Xata, she founded Packetbeat, an open source network monitoring solution that was acquired by Elastic in 2015. At Elastic, Packetbeat became Beats, the observability data shipper that surpassed 300 million downloads in its first two years and is used by organizations of all sizes worldwide.\r\nMonica is also the founder of Tupu.io, a non-profit providing free mentorship to underrepresented people breaking into tech.", "public_name": "Monica Sarbu", "guid": "68d7a08c-58ad-5d65-a291-fa0ba2b770ab", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/7WXDEG/"}, {"code": "BQKRWQ", "name": "Danica Fine", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BQKRWQ_KJfRbhF.webp", "biography": "Danica began her career as a software engineer in financial services and pivoted to developer relations, where she focussed primarily on open source technologies under the Apache Software Foundation umbrella such as Apache Kafka and Apache Flink. She now leads the open source advocacy efforts at Snowflake, supporting Apache Iceberg and Apache Polaris (incubating). 
She can be found on X (Bluesky and Mastodon), talking about tech, plants, and baking @TheDanicaFine.", "public_name": "Danica Fine", "guid": "6d1075a1-ba4b-5270-abc2-38f3f2b1b25d", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/BQKRWQ/"}, {"code": "EFGBTQ", "name": "Philipp Krenn", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EFGBTQ_LofyOOy.webp", "biography": "Philipp lives to demo interesting technology. Having worked as a web, infrastructure, and database engineer for over ten years, Philipp is now the head of Developer Advocacy at Elastic \u2014 the company behind the Elastic Stack consisting of Elasticsearch, Kibana, Beats, and Logstash. Based in the heart of San Francisco, he is close to the cutting edge of technology without getting lost in the latest hype.", "public_name": "Philipp Krenn", "guid": "72a7db76-e027-575d-8a15-bdc76aa2f301", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/EFGBTQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XULYFE/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XULYFE/", "attachments": []}, {"guid": "95442500-b152-5712-b024-aac4e952f659", "code": "PLNTP9", "id": 91403, "logo": null, "date": "2026-06-08T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91403-agentic-retrieval-building-self-optimizing-search-systems", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/PLNTP9/", "title": "Agentic Retrieval: Building Self-Optimizing Search Systems", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Relevance feedback loops used to take months. AI agents can now compress the process to seconds. This talk explores agentic retrieval: systems where agents adjust scoring models, schema, and indexing in real time. 
Learn how to build retrieval infrastructure with verifiable APIs that enable agents to optimize their own search context.", "description": "Relevance feedback loops used to take months. Developers would capture interaction data, train models offline, and push updates through slow deployment cycles. The arrival of AI agents as a new class of search user has compressed this cycle to seconds. In agentic workflows, retrieval is no longer a single tool call that returns results; it is a tight, iterative loop where the agent refines its own queries, evaluates result quality, and tries again.\r\n\r\nThis talk goes beyond basic retrieval-augmented generation (RAG) to explore what comes next: *Agentic Retrieval*. We are entering a paradigm where agents don't just reformulate queries, but dynamically adjust the retrieval system itself, tuning scoring models, modifying schema configurations, and making indexing decisions to match the specific demands of a task. This is the logical extreme of the feedback loop: a self-reinforcing system where the agent optimizes its own context window.\r\n\r\nWe will present the infrastructure principles that make this possible, drawing on our work building agent-native retrieval at Hornet. 
The talk covers:\r\n\r\n* **Schema-first API design** that gives agents a structured, predictable interface to work with\r\n* **Verifiable state changes** that let agents confirm the effect of their own modifications\r\n* **RL-compatible feedback signals** that enable agents to self-correct rather than relying on human-in-the-loop tuning\r\n\r\nAttendees will leave with a concrete understanding of how to architect a retrieval stack where agents can tune their own environment in real time, and why the shift from human-facing search to agent-facing retrieval infrastructure demands fundamentally different design choices.", "recording_license": "", "do_not_record": false, "persons": [{"code": "THFGGN", "name": "Skip Everling", "avatar": "https://program.berlinbuzzwords.de/media/avatars/THFGGN_MUfhPOa.webp", "biography": "**Skip Everling** is Head of Developer Relations at Hornet.dev, with over two decades of experience across AI infrastructure, developer tools, and technical go-to-market. He helps developers use Hornet, the retrieval engine built for agents with a verifiable API surface that agents can configure, deploy, query, and optimize.", "public_name": "Skip Everling", "guid": "fc7af4cf-5ddf-5cb4-a098-a2da259b7083", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/THFGGN/"}, {"code": "RKQQ97", "name": "Jo Kristian Bergum", "avatar": "https://program.berlinbuzzwords.de/media/avatars/RKQQ97_OSyBwF9.webp", "biography": "Jo Kristian Bergum is the CEO of HORNET.dev and a 25-year veteran of the search industry, formerly serving as the Chief Scientist at Vespa.ai and a Distinguished Engineer at Yahoo.", "public_name": "Jo Kristian Bergum", "guid": "2a280da7-7a52-5f9f-b4fb-f222e19c540c", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/RKQQ97/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/PLNTP9/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/PLNTP9/", "attachments": []}, {"guid": 
"a0d535b4-22f4-512f-bedd-0133ab09de38", "code": "E7HE9V", "id": 90262, "logo": null, "date": "2026-06-08T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz26-90262-ultraviolet-turn-hidden-document-data-into-an-ai-advantage", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/E7HE9V/", "title": "Ultraviolet: Turn Hidden Document Data into an AI Advantage", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Every PDF hides a world of structure, metadata and embedded signals that can silently influence AI based processing. With ultraviolets, we reveal how those can be exploited for malicious purposes and even become powerful tools for smarter applications. Designing for both humans and machines become a vital aspect of AI experience design.", "description": "Artificial intelligence is no longer only something we build \u2014 it is something we design for. As AI systems increasingly mediate how users access information, make decisions, and interact with digital products, a new role is emerging: designing how intelligence itself is perceived, trusted, and behaves in real-world environments. This perspective becomes especially critical when AI systems depend on complex information artifacts such as documents. \r\n\r\nDocuments remain one of the primary means of information exchange across industries, with PDFs alone accounting for billions of files generated each year. Despite their ubiquity, PDFs are often treated merely as containers of visible text and images. In reality, they encapsulate a much richer and more complex internal structure, including annotations, cross-references, accessibility artifacts (such as alternate text), hidden or layered content, embedded attachments, metadata, and other non-obvious elements. 
These components are largely invisible to users, yet they can have a profound impact on downstream artificial intelligence systems.\r\n\r\nThis talk explores how agentic workflows, automated information extraction, and retrieval-augmented generation (RAG) can be influenced, or even exploited by the way PDF internals are interpreted. We examine the types of hidden information that can be found or intentionally included within PDFs, how parsers and document processing tools handle (or ignore) this information.\r\n\r\nWe further investigate the risks and opportunities associated with PDF metadata and hidden content. On one hand, poorly handled metadata can introduce vulnerabilities, including malicious data-injection attacks that target AI pipelines at the document layer. On the other hand, these same mechanisms may offer untapped potential: can documents embed structured signals, pre-computed representations, or even vector-like information that could enhance retrieval, indexing, or storage? Could documents themselves act as intelligent carriers of contextual knowledge?\r\n\r\nUsing practical examples, the talk aims to make \u201cvisible\u201d the \u201cinvisible\u201d layer behind visualized text and images, and its interaction with AI systems. 
Framed through the lens of AI experience design, we discuss what it means to make content truly AI-ready, why structure and intent matter when information is consumed by both humans and machines, and how responsible design can improve reliability, transparency, and control.\r\n\r\nParticipants will gain a deeper understanding of how hidden document structures affect AI behavior, how to safeguard pipelines against adversarial or accidental misuse, and how to responsibly leverage document internals to build more robust, trustworthy, and intentionally designed AI-powered knowledge systems.", "recording_license": "", "do_not_record": false, "persons": [{"code": "RE7CD9", "name": "Alessio Vertemati", "avatar": "https://program.berlinbuzzwords.de/media/avatars/RE7CD9_pivsuE0.webp", "biography": "I'm a passionate AI Engineer from Italy. I spend most of my time working with PHP and Python to build AI-powered experiences. Diving into the realms of documents is my second life.", "public_name": "Alessio Vertemati", "guid": "587c4444-daba-5c0c-b628-5a0b4ef78095", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/RE7CD9/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/E7HE9V/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/E7HE9V/", "attachments": []}, {"guid": "af8375ba-27da-5d98-b1a3-f53335a1cb10", "code": "WCAJ99", "id": 89327, "logo": null, "date": "2026-06-08T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-89327-how-apache-iceberg-enables-multi-engine-data-platforms", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/WCAJ99/", "title": "How Apache Iceberg Enables Multi-Engine Data Platforms", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "the session will cover operational best practices, including metadata management, file sizing, compaction strategies, and performance tuning at scale. 
Attendees will leave with practical guidance for designing & operating open, flexible, multi-engine data architectures built on Apache Iceberg, enabling faster analytics, lower operational flexibility", "description": "Modern data platforms increasingly rely on multiple compute engines to serve diverse workloads, from batch analytics to interactive SQL and streaming. Without a shared table layer, this flexibility often leads to duplicated data, inconsistent results, and operational complexity.\r\n\r\nApache Iceberg provides a common table abstraction that decouples storage from compute, enabling multiple engines such as Spark, Trino, and Flink to operate safely on the same data. This talk explores the architectural patterns that make multi-engine platforms possible, including metadata-driven concurrency, snapshot isolation, and schema evolution.\r\n\r\nWe\u2019ll discuss how to choose the right engine for different workloads, how catalogs act as the coordination layer, and what operational practices are required to maintain performance and consistency at scale. Attendees will leave with practical guidance for designing open, multi-engine data architectures built on Apache Iceberg", "recording_license": "", "do_not_record": false, "persons": [{"code": "X78MGU", "name": "Geetha Anne", "avatar": "https://program.berlinbuzzwords.de/media/avatars/X78MGU_9UwQLkO.webp", "biography": "Geetha is a Solutions Architect specializing in big data management, storage, Kubernetes and Durable execution, with expertise in cloud-native and on-premises solutions. 
She ensures customer success and maturity by delivering effective, simplified solutions.", "public_name": "Geetha Anne", "guid": "56637f67-50ce-5e50-9002-8bd119cbd0a2", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/X78MGU/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/WCAJ99/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/WCAJ99/", "attachments": []}, {"guid": "3a2f7314-1644-522e-be3c-4fb38628a181", "code": "GEHRDC", "id": 91484, "logo": null, "date": "2026-06-08T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91484-10x-couchdb-performance-gains-for-a-aaa-game-launch", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GEHRDC/", "title": "10x CouchDB Performance Gains for a AAA Game Launch", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "All software benchmarks and claims of performance are carefully crafted lies and this talk is no different.  Instead of giving you a quick \u201cdo steps one, two, three for a magic speedup\u201d, we aim to explain how we arrived at the changes we made and how we rigorously tested those changes to make sure we understand their impact.", "description": "This talk will take the attendee on a performance tuning journey. From benchmarking fundamentals as the foundation, we are going through six distinct steps of always finding the next bottleneck in a large distributed cluster setup of CouchDB. 
We will cover, in-depth, ways to measure and improve:\r\n\r\n- Disk I/O\r\n- HTTP request and response times\r\n- TCP Accept handling\r\n- CPU Utilisation and Process Scheduling in an Erlang system\r\n- Erlang cluster communication networking\r\n\r\nIn the end, our client successfully launched their latest version of a AAA sports game with capacity to spare.", "recording_license": "", "do_not_record": false, "persons": [{"code": "VQWSRT", "name": "Jan Lehnardt", "avatar": "https://program.berlinbuzzwords.de/media/avatars/VQWSRT_Sg17XSJ.webp", "biography": "Jan Lehnardt is a developer and businessperson from Berlin. He\u2019s the PMC Chair for Apache CouchDB and PouchDB as well as a CEO at Neighbourhoodie Software. He\u2019s been building scalable database solutions with CouchDB since 2007.", "public_name": "Jan Lehnardt", "guid": "d144f95e-1320-5122-a197-4b1b22781b80", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/VQWSRT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GEHRDC/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GEHRDC/", "attachments": []}, {"guid": "635ac3f7-17d0-53f7-8b3d-a745a5a4d4b4", "code": "SSYYQ8", "id": 97864, "logo": null, "date": "2026-06-08T17:20:00+02:00", "start": "17:20", "duration": "00:45", "room": "Kesselhaus", "slug": "bbuzz26-97864-ai-is-here-time-to-throw-away-our-search-engines", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/SSYYQ8/", "title": "AI is here \u2013 time to throw away our search engines?", "subtitle": "", "track": null, "type": "Panel", "language": "en", "abstract": "Why do we even need traditional search when AI can do everything? Or is it foolish to ignore simple, proven techniques for delivering great results? What's the best way to combine old and new? 
Join our panel of experts for a fun and provocative debate!", "description": "AI has revolutionised the world of search \u2013 first by giving us better ways to understand language, rewrite content and provide single answers, and latterly with augmented coding techniques & AI agents to configure our engines & run our searches for us.\r\n\r\nIf you're working on search applications today you're probably looking at AI techniques first - but there's decades of work behind the traditional search techniques that you can't afford to ignore. Our panel, which includes leading experts on both old-school and new search techniques, will help you decide how to combine the best of both worlds.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3MCCUJ", "name": "Charlie Hull", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3MCCUJ_NaCDQZj.webp", "biography": "I'm The Search Juggler, an expert search consultant who has been helping companies large and small build scalable, performant and accurate search applications for over 25 years. My clients have included governments, global e-commerce giants, law firms and startups. I co-host the London Search & AI Meetup and ran the Haystack conference series for 5 years. I'm an OpenSearch Ambassador, and a Vespa.ai Partner - but I work with many different search engines.", "public_name": "Charlie Hull", "guid": "d4e6be87-f5d7-5719-ac4f-f0df4a854ca4", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/3MCCUJ/"}, {"code": "JEQ97T", "name": "Atita Arora", "avatar": "https://program.berlinbuzzwords.de/media/avatars/TRRRA8_ADUm6VJ.webp", "biography": "Atita Arora is an open-source contributor and PMC at Apache OpenNLP, with a long-standing career dedicated to advancing search, information retrieval systems, and AI. She focuses on advancing search technologies that connect research to meaningful, real-world applications. 
A regular speaker at international conferences, she also co-leads Women in Search, advocating for diversity and inclusion across the tech community. Atita is an independent AI and Search consultant, advancing practical innovation in modern search systems, driven by the belief that innovation delivers its greatest value when shared and applied.", "public_name": "Atita Arora", "guid": "51065684-2404-50ee-a9b8-3e222ebbaddd", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/JEQ97T/"}, {"code": "RKQQ97", "name": "Jo Kristian Bergum", "avatar": "https://program.berlinbuzzwords.de/media/avatars/RKQQ97_OSyBwF9.webp", "biography": "Jo Kristian Bergum is the CEO of HORNET.dev and a 25-year veteran of the search industry, formerly serving as the Chief Scientist at Vespa.ai and a Distinguished Engineer at Yahoo.", "public_name": "Jo Kristian Bergum", "guid": "2a280da7-7a52-5f9f-b4fb-f222e19c540c", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/RKQQ97/"}, {"code": "E9MPXG", "name": "Dmitry Kan", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UBSNXT_ubYA7W3.webp", "biography": "Dmitry is currently in charge of managed OpenSearch product business at Aiven, where he leads both product and engineering supporting clients around the world. He previously served as Senior Product Manager at TomTom, Principal  AI Scientist at Silo AI / AMD, and Head of Search at AlphaSense.\r\n\r\nHe is the founder and host of the Vector Podcast.\r\n\r\nContributor to open source (Quepid, Luke), and member of the OpenSearch Search Technical Advisory Group (TAG). Applied and extended Apache Solr and Apache Lucene for 10 years and worked with Elasticsearch and OpenSearch for over 6 years. 
Dmitry believes in the power of live discussion with every practitioner that drives a wider understanding of where we are moving as the search community.", "public_name": "Dmitry Kan", "guid": "d261bf59-24b7-5bec-8d4d-0b69d5fcdfbe", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/E9MPXG/"}, {"code": "DQMD97", "name": "Evgeniya Sukhodolskaya", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DQMD97_y876bMD.webp", "biography": "Developer Advocate at Qdrant with 8 years of IT experience across software engineering, machine learning, and developer advocacy.\r\nHolds a Technical University of Munich master's degree in Data Analytics and Engineering.\r\nPassionate about NLP and Information Retrieval.\r\nBelieves in conference-, complaints- and memes-driven development:)", "public_name": "Evgeniya Sukhodolskaya", "guid": "76b95d48-3f05-5de6-a988-646d836fb2e3", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/DQMD97/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/SSYYQ8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/SSYYQ8/", "attachments": []}, {"guid": "45cbab43-3f96-579b-9eac-88c223d4e31f", "code": "TYGHXR", "id": 97767, "logo": null, "date": "2026-06-08T18:05:00+02:00", "start": "18:05", "duration": "03:00", "room": "Kesselhaus", "slug": "bbuzz26-97767-get-together", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TYGHXR/", "title": "Get-Together", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Join us for food and drinks at Palais Kulturbrauerei!", "description": "-", "recording_license": "", "do_not_record": false, "persons": [], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TYGHXR/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TYGHXR/", "attachments": []}], "Maschinenhaus": [{"guid": "111fdbc0-5162-55b1-9a03-6e2ed0e964c7", "code": "9STDXK", "id": 91254, "logo": null, 
"date": "2026-06-08T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz26-91254-opensearch-software-foundation-1-year-of-open-governance", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9STDXK/", "title": "OpenSearch Software Foundation: 1 Year of Open Governance", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "In this presentation, we will talk through moving a major open source project into a foundation and the benefits of open governance, and a vendor-neutral home has proven through a sustained growth in community contributions.", "description": "In September 2024, the OpenSearch community announced the formation of this new home for the project, the OpenSearch Software Foundation, and since then we\u2019ve successfully transitioned to the Linux Foundation's technical and governance stack. Our mission is to empower users to navigate the OpenSearch ecosystem, recruit skilled talent, and adopt the platform effectively, all while supporting sustainable open source innovation.\r\n\r\nOver this period, the OpenSearch community has demonstrated remarkable momentum. We\u2019ve seen more than 8,800 contributions, driven by a vibrant and growing community of over 3,300 individual contributors from more than 400 organizations. This surge in activity has placed OpenSearch among the top 20 most active projects across the entire Linux Foundation ecosystem by contributor engagement.\r\n\r\nWe\u2019ve collaborated closely with our community and members on key initiatives and foundational work during this transition. 
Join us to hear about the journey so far and the future path for the OpenSearch project and Foundation.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KJ9PXN", "name": "Kris Freedain", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KJ9PXN_s9x3BRh.webp", "biography": "Kris Freedain (he/him) is an OpenSearch Ambassador, Senior Community Manager for the OpenSearch Project & OpenSearch Software Foundation technical steering committee, and serves on the OpenSearch Software Foundation Marketing Committee. He has decades of experience in tech, but finds connecting people to be the most fulfilling part of being a community professional. Kris is also a Fediverse admin for the Fosstodon instance and serves as a Fosstodon Foundation Board Member. His hobbies include gardening, garage gym powerlifting, and meditation.", "public_name": "Kris Freedain", "guid": "6c5adfa1-814c-5980-8e15-f11239f9fb3e", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/KJ9PXN/"}, {"code": "SX73FU", "name": "Carlos Rolo", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FNRJZX_ZifVxIZ.webp", "biography": "Carlos Rolo, Principal Open Source Engineer, is a family man who loves doing random activities with his wife and kids, playing water polo, and exploring AI innovations alongside tinkering with home tech setups. With expertise in Rust, Python, and a strong foundation in Go, Carlos is a celebrated 2x Cassandra MVP, Opensearch Ambassador and has released a groundbreaking time series compressor (instaclustr/atsc) with 7 patents pending. 
Actively\r\nengaged in the Cassandra and Opensearch community, he enjoys sharing technical insights and fostering collaboration.", "public_name": "Carlos Rolo", "guid": "9763fe22-e661-5c96-ace2-caf6d611fa37", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/SX73FU/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9STDXK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9STDXK/", "attachments": []}, {"guid": "2bdb58d7-a6b5-59ed-abe8-b75c88888267", "code": "7ATP3V", "id": 90297, "logo": null, "date": "2026-06-08T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-90297-apache-solr-10-what-s-coming-up-for-vector-search", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7ATP3V/", "title": "Apache Solr 10: What's Coming up for Vector Search", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "With Apache Solr 10 out, there are plenty of goodies coming up for vector-search aficionados.\r\nFrom scalar and binary quantization to speed up your search and reduce the memory footprint, to early termination and hybrid approaches to navigate the HNSW graph.\r\nJoin us if you want to learn about the big steps forward of Apache Solr vector search!", "description": "Apache Solr 10 introduces many advancements in the realm of vector search, making many interesting Lucene features surface.\r\nStarting from scalar and binary quantization, this feature helps users in reducing both the query time and memory footprint at the cost of some accuracy and disk space: a welcome trade-off for those using Solr on massive amounts of vectors.\r\nEarly termination introduces the ability of speeding up certain queries that saturate a configurable threshold, and Seeded KNN gives the ability to start the HNSW graph exploration from a lexical result set, rather than random entry documents (core mechanism of the Solr vector search implementation).\r\nACORN 
filtering improves the way pre-filtering happens when you mix traditional keyword searches with knn queries, and the query combiner finally offers a comprehensive strategy to mix up query results, opening the door to a more flexible hybrid search.\r\nTo conclude with a cherry on top of the cake, we'll go through many bug fixes and minor improvements, still worth mentioning.\r\nThe audience is expected to get an overview of all the new interesting vector search features coming with Solr 10 and learn how to use them and benefit from them in their use cases.", "recording_license": "", "do_not_record": false, "persons": [{"code": "GJ3PTP", "name": "Alessandro Benedetti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GJ3PTP_7fNBvIJ.webp", "biography": "Alessandro Benedetti is an Apache Lucene/Solr committer and Solr chair of the PMC, Director at Sease Ltd.\r\nHe believes in Open Source as\u00a0a way to build a bridge between Academia and Industry\u00a0and facilitate the progress of applied research.\r\nAlessandro is a passionate R&D software engineer, continuously applying the latest trends in Information Retrieval and AI to solve search problems.\u2028\r\nHe\u2019s been working on Learning To Rank for years and more recently he\u2019s been exploring Generative AI techs like Large Language Models and Retrieval Augmented Generation.\r\nWhen he isn't on clients' projects, he contributes to the open-source community and presents at meet-ups and conferences such as ECIR, Search Solutions, Community Over Code, Haystack and Berlin Buzzwords.", "public_name": "Alessandro Benedetti", "guid": "bd8c60c2-a21e-5832-978a-2ca73e1cddd0", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/GJ3PTP/"}, {"code": "CBFBFW", "name": "Ilaria Petreti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CBFBFW_tBd7ngd.webp", "biography": "Data Scientist with a strong focus on integrating Machine Learning and Deep Learning into information retrieval systems. 
She has also worked on Search Quality Evaluation across multiple projects. She loves exploring new technologies, applying state-of-the-art solutions in Search and giving back to the community through technical talks and open-source contributions, particularly to Apache Solr.", "public_name": "Ilaria Petreti", "guid": "9495bfbd-3bd9-5666-a0d4-98c3a515b555", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/CBFBFW/"}, {"code": "MJTHGB", "name": "Anna Ruggero", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MJTHGB_HEvvFb3.webp", "biography": "I\u2019m a Research & Development Software Engineer and Search Consultant at Sease, where I help companies design and improve intelligent search solutions. I work with the most well-known search engines such as Apache Solr, Elasticsearch, OpenSearch and Vespa. I operate closely with clients to tackle complex search challenges, from relevance tuning and learning to rank to neural search and NLP integration. I enjoy diving into real-world problems, experimenting with new approaches, and finding the right balance between research and production-ready solutions. 
I also share insights with the search community through talks and collaborations.", "public_name": "Anna Ruggero", "guid": "53fa5d8d-d62e-5ef7-ab92-95248b0ef81d", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/MJTHGB/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7ATP3V/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7ATP3V/", "attachments": []}, {"guid": "bc89a602-bc85-51d4-b8ec-791ddfe1b29f", "code": "MCH7ZZ", "id": 91578, "logo": null, "date": "2026-06-08T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-91578-constant-time-aggregations-with-star-tree-in-opensearch", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MCH7ZZ/", "title": "Constant-Time Aggregations with Star-Tree in OpenSearch", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Discover how OpenSearch breaks linear scaling. Inspired by Apache Pinot, the Star-Tree index moves performance dependency from document count to field cardinality. Learn how we extended Lucene\u2019s DocValues to build multi-dimensional materialized views that deliver sub-second analytics on billion-scale datasets for observability workloads.", "description": "Traditional distributed search engines face a significant bottleneck: aggregation latency scales linearly with document count. As datasets grow to billions of records, this \"scan-on-query\" model fails to meet real-time requirements. Inspired by Apache Pinot and star-cube research, OpenSearch introduced the Star-Tree index to decouple performance from raw data volume.\r\n\r\nThis session dives into the engineering behind this transition. 
We will explore how we extended Lucene\u2019s DocValuesFormat to support multi-field materialized views directly within segment structures and shifted the performance dependency from total document count to the cardinality of indexed dimensions.\r\n\r\nWe will detail the implementation of \"star nodes\"\u2014wildcard structures representing aggregates across all values of a dimension\u2014and how they enable constant-time query pruning. Attendees will learn about the challenges of building a multi-field index in the single-field-centric ecosystems like Lucene, how to fine tune storage versus speed, and why this architectural shift achieved up to 1000x faster queries. Finally, we will discuss operational lessons learned, including limitations on why this structure is limited to append-only workloads and how it bridges the gap between traditional search and OLAP-style analytics.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KPYKMN", "name": "Sandesh Kumar", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KPYKMN_DospeYd.webp", "biography": "Sandesh is a Software Developer working on the OpenSearch Project, with a focus on enhancing search performance and cluster resilience. He is also a maintainer of OpenSearch (core) and hosts the Search Backlog & Triage Community Meeting every Wednesday, where he engages with the community to review open issues, prioritize feature requests, and drive improvements to OpenSearch's Search components.", "public_name": "Sandesh Kumar", "guid": "81afad9d-7e5b-56cd-8237-328251a41288", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/KPYKMN/"}, {"code": "DJA7SK", "name": "Shailesh Kumar Singh", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CRAGYA_IsHT7kL.webp", "biography": "Shailesh Kumar Singh is a Software Development Engineer at Amazon Web Services, working on OpenSearch. 
His work focuses on building high-performance analytics systems at scale, with contributions to aggregation optimization through Star Tree indexing and efficient data processing and compaction using Parquet. He is particularly interested in designing scalable systems that balance performance, storage efficiency, and real-world usability.\r\n\r\nHe holds a Bachelor\u2019s degree in Computer Science from BITS Pilani, with a minor in Finance, and is interested in scalable systems and fintech.", "public_name": "Shailesh Kumar Singh", "guid": "86dc0c95-cf11-558a-a5e7-048b09320135", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/DJA7SK/"}], "links": [{"title": "Star Tree Index", "url": "https://docs.opensearch.org/latest/search-plugins/star-tree-index/", "type": "related"}], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MCH7ZZ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MCH7ZZ/", "attachments": []}, {"guid": "ad09ee5e-0585-5177-a751-1cdb5ea0b1e2", "code": "TSMVSN", "id": 85808, "logo": null, "date": "2026-06-08T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-85808-turning-the-database-inside-out-again", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TSMVSN/", "title": "Turning the database inside out again", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "We rethink data systems by putting streams at the center. Expanding on Martin Kleppmann's: Turning the Database Inside Out, this talk shows how Apache Kafka and Apache Iceberg together provide durable storage, indexing, and rich views that eliminate brittle ETL and unify real-time and historical analysis. 
A new way to see databases\u2014and streams.", "description": "Over a decade ago, Martin Kleppmann\u2019s Turning the Database Inside Out challenged us to rethink data systems from first principles\u2014placing the event stream at the center of storage, computation, and truth. That vision sparked an entire ecosystem of event-driven architectures, real-time analytics systems, and stream-aware databases.\r\n\r\nBut what if that journey is still unfinished?\r\n\r\nThis talk explores the next leap: reimagining the database itself through the lens of streaming. Instead of treating the event log as a narrow integration pipe, we\u2019ll treat it as the core substrate for all data\u2014augmented with the essential primitives that traditional databases provide: long-term storage, indexing, and rich materializations. To get there, we move beyond simple append/consume patterns and embrace modern table formats and storage layers capable of making event data durable, queryable, and universally accessible.\r\n\r\nThe result is an architecture that collapses fragile pipelines, dissolves the boundary between real-time and historical processing, and provides a unified view of the world using widely adopted open standards (Apache Kafka and Apache Iceberg). This changes the question from \u201cwhat\u2019s happening right now?\u201d to \u201cwhat has happened across the entire lifespan of the system?\u201d.\r\n\r\nYou\u2019ll walk away seeing both the database\u2014and the stream\u2014through a fundamentally new lens.", "recording_license": "", "do_not_record": false, "persons": [{"code": "W873GY", "name": "Tom Scott", "avatar": "https://program.berlinbuzzwords.de/media/avatars/W873GY_RCa1iKu.webp", "biography": "Long-time enthusiast of Kafka and all things data integration, Tom has more than 15 years of experience in innovative and efficient ways to store, query and move data. 
Tom is currently CEO at Streambased, a company focused on unifying operational and analytical data estates into a single, consistent and efficient data layer.", "public_name": "Tom Scott", "guid": "f7db3d07-e5b9-5bd3-abdb-46a8c60a96f0", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/W873GY/"}, {"code": "E8ETVJ", "name": "Roman Kolesnev", "avatar": "https://program.berlinbuzzwords.de/media/avatars/E8ETVJ_YNIHEDN.webp", "biography": "Roman is a Principal Software Engineer at Streambased. His experience includes building business-critical event streaming applications and distributed systems in the financial and technology sectors.", "public_name": "Roman Kolesnev", "guid": "76810c28-4388-5248-9cdc-691ea6f7dc7b", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/E8ETVJ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TSMVSN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TSMVSN/", "attachments": []}, {"guid": "b02bbde7-8af3-55b8-9e34-1a96fe14721c", "code": "ULE3MU", "id": 90958, "logo": null, "date": "2026-06-08T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz26-90958-from-oltp-to-olap-is-postgresql-eating-analytics-too", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ULE3MU/", "title": "From OLTP to OLAP: Is PostgreSQL Eating Analytics Too?", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Can PostgreSQL become a serious analytics engine? With emerging columnar extensions, PostgreSQL is pushing beyond OLTP into OLAP territory. 
This talk explores the current columnar landscape, architectural trade-offs, and how far PostgreSQL can go compared to analytical engines like ClickHouse.", "description": "Traditionally a row-oriented OLTP system, PostgreSQL is now gaining columnar capabilities through extensions such as Citus, TigerData columnar, pg_duckdb and more built on PostgreSQL\u2019s pluggable storage layer. This raises a serious architectural question: can PostgreSQL evolve into a competitive analytical engine?\r\n\r\nIn this talk, we provide a structured overview of the current PostgreSQL columnar ecosystem \u2014 how these extensions work, what features they offer, and where they differ in terms of compression, execution model, and performance.\r\n\r\nWe place these developments in the broader context of modern database trends: HTAP ambitions, consolidation of data stacks, and the gravitational pull of PostgreSQL as a platform.\r\n\r\nFinally, we discuss selected performance observations and architectural considerations when comparing columnar PostgreSQL setups to established analytical systems such as ClickHouse from a technical exploration of trade-offs.\r\n\r\nIs PostgreSQL becoming a universal data platform, or are there structural limits to how far columnar extensions can take it?", "recording_license": "", "do_not_record": false, "persons": [{"code": "PUQ3CN", "name": "Daniel Seybold", "avatar": "https://program.berlinbuzzwords.de/media/avatars/PUQ3CN_S1IaCDz.webp", "biography": "Daniel began his career as a doctoral researcher in cloud computing, focusing on distributed databases in the cloud. 
After completing his PhD, he co-founded the Benchmarking-as-a-Service platform benchANT, where he is responsible for planning and executing cloud and database benchmarking projects.", "public_name": "Daniel Seybold", "guid": "3422e401-8927-55ec-ad63-19dbbbd019b0", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/PUQ3CN/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ULE3MU/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ULE3MU/", "attachments": []}, {"guid": "48553c6c-ea45-5385-aff3-48986a83b641", "code": "B9TVRQ", "id": 91405, "logo": null, "date": "2026-06-08T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-91405-streamling-lightweight-extensible-streaming-on-datafusion", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/B9TVRQ/", "title": "Streamling: Lightweight, Extensible Streaming on DataFusion", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Apache DataFusion is moving beyond batch into streaming. We built Streamling, a Rust streaming engine that uses DataFusion planning and Arrow RecordBatch streams for real-time SQL/WASM transforms. This talk covers how we built it, highlights key features (FFI plugins, WASM transforms, and dynamic tables), and shares production lessons.", "description": "Stream processing systems are complex. Our previous platform was Flink-based. We learned a lot from it, but wanted a lighter approach for workloads that do not need distributed stateful processing. At the same time, a growing ecosystem was emerging around Apache DataFusion and Arrow. 
We built Streamling to explore a specific point in this design space: a production streaming engine that stays intentionally simple, with no distributed shuffle and no stateful joins, and focuses on operational clarity, extensibility, and cloud-native deployment.\r\n\r\n**Part 1: The Engine Internals**\r\n\r\nA deep dive into how we extended DataFusion for streaming:\r\n\r\n- **Streaming SQL on DataFusion**: We use DataFusion's query planner, custom `TableProvider`s, and `ExecutionPlan` traits to process Kafka Avro data as continuous Arrow `RecordBatch` streams. \r\n- **Checkpoint coordination**: A lightweight Chandy\u2013Lamport style protocol (Marker \u2192 Ack \u2192 Finalizer) that guarantees at-least-once delivery. State is persisted via a pluggable backend system (in-memory, SQLite, or PostgreSQL in production), keeping checkpoint storage decoupled from the engine itself.\r\n- **Runtime extensibility**: WebAssembly script transforms (JS/TS via Extism), HTTP handler transforms, and an `abi_stable` plugin system provide FFI-safe, language-agnostic extension points without requiring engine forks.\r\n- **Dynamic tables**: Stateful lookup tables can be populated from streams or updated externally (for example, in Postgres), enabling deduplication and enrichment in SQL through custom UDFs without pipeline restarts.\r\n\r\n**Part 2: From Engine to Platform**\r\n\r\nHow we designed the system for production cloud deployment:\r\n\r\n- **Control/data plane separation**: The engine (data plane) is decoupled from orchestration (control plane), enabling both fully managed and BYOC (Bring Your Own Cloud) deployment models.\r\n- **Kubernetes-native lifecycle**: Pipeline management (create, pause, resume, restart), resource sizing, secret injection, and namespace isolation.\r\n- **Clean separation**: Why defining this boundary early keeps the engine portable and the platform flexible across deployment models.\r\n\r\n**Key takeaways for the audience:**\r\n\r\n1. 
DataFusion is proving to be a versatile foundation for streaming, not just batch. We'll share a brief overview of the landscape and where different projects sit.\r\n2. You don't need distributed stateful processing for many streaming workloads. Deliberately scoping down unlocks operational simplicity.\r\n3. Designing a clean control/data plane boundary from day one keeps your architecture flexible for different deployment models.\r\n\r\nThis talk is aimed at engineers building or evaluating streaming platforms, and anyone exploring DataFusion beyond batch analytics.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UUM7RC", "name": "Xiao Meng", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UUM7RC_fVGkCYG.webp", "biography": "Xiao Meng is a software engineer specializing in data infrastructure, stream processing, and SRE. He is the Streaming Team Lead at Goldsky, where he leads development of a declarative, serverless real-time data platform for blockchain data. 
Previously, he was an Expert Data Engineer at Activision/Demonware, building a real-time game telemetry platform for titles including Call of Duty.", "public_name": "Xiao Meng", "guid": "42ff95c0-aac8-5938-b7e3-8c2695542047", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/UUM7RC/"}, {"code": "FXFKNE", "name": "Rafael Aguiar", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FXFKNE_nosAvei.webp", "biography": "Rafael Aguiar has battle scars from building streaming engines, distributed systems, and real-time analytics in production.\r\nWhen not at work, he\u2019s probably hiking somewhere.", "public_name": "Rafael Aguiar", "guid": "5bfb1583-994e-5dde-a3fd-bad5a3aae5be", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/FXFKNE/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/B9TVRQ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/B9TVRQ/", "attachments": []}, {"guid": "a3a2956f-927b-5fb5-8922-2134f4ed242a", "code": "77KQHG", "id": 90428, "logo": null, "date": "2026-06-08T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-90428-search-is-back-solving-the-context-crisis-for-ai-agents", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/77KQHG/", "title": "Search is Back: Solving the \"Context Crisis\" for AI Agents", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Why do smart agents make dumb mistakes? The culprit is context, an old problem with new solutions. Let's fast-forward through 20 years of search evolution to fix the missing link in today\u2019s Agentic AI. \r\nWe\u2019ll demonstrate how to combine Knowledge Graphs and Vector Search to build reliable, context-aware applications using open-source tools.", "description": "The challenges of building effective AI systems today echo the early days of web search: we are still struggling to get the context right. 
But where we once had simple user queries, we now have complex agents demanding structured context that scales faster than any past user.\r\nThe key to context has always been finding ways to accumulate, structure, and recall relevant knowledge across interactions, at the point where knowledge graphs and vector search converge.\r\nIn this talk, we connect the dots between the old problem of user context and the new reality of Agentic AI, and show accessible ways to create context for both people and programs.\r\nWhat you will learn in this session while we walk through a solution built only with open-source tools:\r\n1) How to deliver meaningful context to people and agents\r\n2) The tradeoffs between common retrieval approaches\r\n3) Practical patterns you can apply to build more reliable AI applications", "recording_license": "", "do_not_record": false, "persons": [{"code": "KFGPJH", "name": "David Louis Hollembaek", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KFGPJH_acx39Hf.webp", "biography": "Nearly two decades working in Search. David started at Fast Search shortly before it was acquired by Microsoft then continued down the search rabbit hole working on NLP-powered information solutions, machine learning applications, and eventually AI strategy consulting. Now at Veeva Systems, he's building intelligent search and AI-powered SaaS applications for the life sciences.", "public_name": "David Louis Hollembaek", "guid": "e8be2a12-d1ad-5a88-bf18-42b46bc15621", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/KFGPJH/"}, {"code": "L7ZJWB", "name": "Vincent Pistor", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8EGMQS_HiCnZSR.webp", "biography": "Vincent is the VP Commercial at Cognee. He worked for 4+ years in venture capital, investing in 20+ AI, open-source, and infrastructure companies, at early-stage all over Europe. 
As an investor, he led the pre-seed round of Cognee and joined the company full-time as the commercial and growth lead by the end of 2025. He loves to talk about anything related to knowledge graphs, memory, context, world-models, or open-source. \r\n\r\nVincent has an academic background in economics and data sciences from LSE in London.", "public_name": "Vincent Pistor", "guid": "0b0310b1-9b4c-589f-8567-a11bca22719c", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/L7ZJWB/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/77KQHG/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/77KQHG/", "attachments": []}, {"guid": "f04a1a2e-554c-584e-b38b-89de266cec5e", "code": "XPADMH", "id": 91084, "logo": null, "date": "2026-06-08T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-91084-building-a-local-news-rag-the-quest-for-trustworthiness", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XPADMH/", "title": "Building a Local News RAG: The Quest for Trustworthiness", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "We will show you how we built a local newspaper RAG and all the problems that came up along the way. From trustworthiness to customer wishes, search optimization and generation problems. Local villages that LLMs know nothing about, content that is semantically the same, and outdated information are only a part of the journey we made.", "description": "Building a RAG system for a local newspaper is a high-stakes challenge where \"hallucinations\" aren't just bugs\u2014they are threats to the brand's main currency: trust. 
In this session, we share our unfiltered journey of moving beyond \"clean\" documentation into the messy reality of local news.\r\n\r\nWe\u2019ll dive into the \"Long Tail of Locality,\" exploring how to handle hyper-local contexts (villages and regional politics) that LLMs have nearly no knowledge of. We will discuss the problem of semantic collisions\u2014where standard hybrid search fails to distinguish between dozens of nearly identical weekly football reports\u2014and how we navigate complex customer expectations and unclear usage patterns.\r\n\r\nFrom the architectural nightmare of structuring legacy news data to the ongoing battle for factual reliability, this is a talk about what worked, what we still haven't fixed, and the hard lessons learned when \"state-of-the-art\" AI meets the local beat.", "recording_license": "", "do_not_record": false, "persons": [{"code": "99NGZC", "name": "Marcel Dokters", "avatar": "https://program.berlinbuzzwords.de/media/avatars/99NGZC_64AxkUt.webp", "biography": "Marcel Dokters is a data scientist at NOZ Digital, where he is building AI tools that support the daily workflows of journalists at the local newspaper Neue Osnabr\u00fccker Zeitung.", "public_name": "Marcel Dokters", "guid": "9c22943a-2606-5aeb-b076-d74e9cc228a6", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/99NGZC/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XPADMH/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XPADMH/", "attachments": []}], "Palais Atelier": [{"guid": "d8d5ee2d-60f3-51f8-9b1e-9ed1997071d5", "code": "9MJAUU", "id": 98655, "logo": null, "date": "2026-06-08T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz26-98655-relevance-feedback-inside-the-search-engine", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9MJAUU/", "title": "Relevance Feedback Inside the Search Engine", "subtitle": "", "track": null, "type": "Short 
Talk", "language": "en", "abstract": "How does searching for new information often look? Loops: query, review results for relevance, rewrite the query, repeat\u2026 Until success, or until the user churns / the token budget burns. \r\nThis talk introduces a new instrument for search pipeline builders: propagating query-results relevance right inside the search algorithm of a search engine.", "description": "The relevance of search results is a use-case-dependent, capricious metric. Without access to the full dataset and visibility into the search algorithm, getting relevant results means either guessing the right query formulation or search engineers squeezing out the reranking (or context) budget to compensate for the search algorithm's required simplicity at scale.\r\nWhat if your retriever could be guided by relevance feedback signals from a smart model (like a reranker or even a search agent) during the search process itself, achieving higher recall and discoverability of relevant results at a reasonable cost?\r\nIn this talk, I'll present our API for distilling relevance feedback from smart models directly into the vector search index.\r\n\r\n\u2014\r\nThis talk is sponsored by [Qdrant](https://qdrant.tech)", "recording_license": "", "do_not_record": false, "persons": [{"code": "DQMD97", "name": "Evgeniya Sukhodolskaya", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DQMD97_y876bMD.webp", "biography": "Developer Advocate at Qdrant with 8 years of IT experience across software engineering, machine learning, and developer advocacy.\r\nHolds a Technical University of Munich master's degree in Data Analytics and Engineering.\r\nPassionate about NLP and Information Retrieval.\r\nBelieves in conference-, complaints- and memes-driven development:)", "public_name": "Evgeniya Sukhodolskaya", "guid": "76b95d48-3f05-5de6-a988-646d836fb2e3", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/DQMD97/"}], "links": [], "feedback_url": 
"https://program.berlinbuzzwords.de/bbuzz26/talk/9MJAUU/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9MJAUU/", "attachments": []}, {"guid": "8b219e40-74c0-5f39-a0b2-4956d2a8cad8", "code": "M7T8T3", "id": 90528, "logo": null, "date": "2026-06-08T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-90528-mentoring-in-open-source-in-the-age-of-ai", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/M7T8T3/", "title": "Mentoring In Open Source in the Age of AI", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Open source mentorship changed overnight with AI tools. Contributors submitted polished code they couldn\u2019t explain, making learning harder to assess. This talk shares what we learned mentoring Outreachy contributors\u2014what failed, what worked, and what we\u2019re still figuring out.", "description": "We've both been mentoring open source contributors through Outreachy for a few years. Tilda coordinates mentors globally, and we've both been mentors and interns. We thought we had this figured out. Then AI showed up and broke everything.\r\n\r\nContributors started submitting perfect code they couldn't explain. PRs looked great, but ask someone to modify their own work and they'd freeze up. We realized people were using ChatGPT, and none of us\u2014including the contributors themselves\u2014could tell anymore what they'd actually learned versus what they'd generated.\r\nWe had to completely rethink how we mentor.\r\n\r\nWhat we tried that didn't work:\r\n- Asking \"Did you use AI?\" got us nowhere. People felt defensive or genuinely didn't know if they'd learned something.\r\n- Treating AI code like copy-pasted Stack Overflow didn't work either; the volume and polish were totally different.\r\n- Trying to detect AI-generated code was pointless. We don't care if they used AI. 
We care if they learned.\r\n\r\nWhat actually worked:\r\n- We changed our code review questions from \"Does this work?\" to \"Why this approach?\" and \"What happens if we change this?\" The answers told us everything.\r\n- We restructured tasks: less \"implement this feature\" and more \"solve this problem, explain your thinking, then build it.\"\r\n- We did more live pairing. It's hard to hide what you don't understand in real time.\r\n- We taught people to use AI for learning (asking it to explain concepts) instead of just generating solutions.\r\n\r\nThis isn't solved\u2014we're still figuring stuff out. But we've tried a lot and can tell you what worked and what didn't, and we know for sure that you'll leave with concrete techniques you can use immediately if you're mentoring contributors, teaching programming, or helping anyone learn when AI's in the picture.", "recording_license": "", "do_not_record": false, "persons": [{"code": "F9AMPQ", "name": "Tilda Udufo", "avatar": "https://program.berlinbuzzwords.de/media/avatars/F9AMPQ_BUYikFh.webp", "biography": "Tilda Udufo is a software engineer, developer advocate, and open source community organizer. She works with global mentorship and contribution programs, supporting hundreds of contributors and maintainers across multiple open source projects. Through her work, she focuses on making technical systems more accessible, sustainable, and human-centered.\r\n\r\nShe has reviewed and mentored thousands of contributions, helped design onboarding and feedback processes, and regularly teaches coding concepts to beginners. 
Tilda is especially interested in how understanding \u201chow things work\u201d behind the scenes leads to better learning, better debugging, and healthier communities.\r\n\r\nWhen she\u2019s not working on open source infrastructure, she enjoys exploring the intersection of code, design, and education.", "public_name": "Tilda Udufo", "guid": "82f67a9f-b180-58fa-8aef-59d5dde86183", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/F9AMPQ/"}, {"code": "3W8C9L", "name": "Busayo Ojo", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3W8C9L_L3ILMjG.webp", "biography": "Busayo Ojo is a passionate advocate for open-source and open source programs. Her work focuses on contributor onboarding, building inclusive communities, and writing documentation that actually helps people get started.", "public_name": "Busayo Ojo", "guid": "5cda09e0-821a-561e-8de1-d9c21d9f6615", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/3W8C9L/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/M7T8T3/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/M7T8T3/", "attachments": []}, {"guid": "af3ee5fd-0fdc-5d34-a16e-06bc276b2f2e", "code": "JYQZ8Y", "id": 86812, "logo": null, "date": "2026-06-08T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-86812-beyond-the-hype-when-apache-flink-solves-real-problems", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/JYQZ8Y/", "title": "Beyond the Hype: When Apache Flink Solves Real Problems", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "When does Apache Flink solve real problems versus add complexity? Explore use cases where Flink becomes essential such as fraud detection, CDC, real-time analytics versus when batch or Kafka Streams suffice. 
Compare stream engines (Flink, Spark) with platforms (Kafka, Pulsar) to confidently decide when streaming delivers value.", "description": "Apache Flink promises powerful stream processing, but when does that power translate to actual business value? This session provides the architectural clarity engineers need by focusing on specific use cases where Flink becomes essential versus scenarios where simpler alternatives suffice. Attendees will explore real-world problems that demand Flink's stateful processing and exactly-once guarantees\u2014fraud detection, real-time recommendations, CDC-driven data lakes\u2014contrasted with situations where batch jobs or Kafka Streams are better fits. The talk draws practical distinctions between stream processing engines (Flink versus Spark) and streaming platforms (Kafka with ksqlDB/Kstreams, Pulsar), clarifying when each architectural pattern shines. Engineers will leave equipped to confidently decide when streaming architecture delivers results and when it's unnecessary complexity.", "recording_license": "", "do_not_record": false, "persons": [{"code": "9RBACT", "name": "Naci Simsek", "avatar": "https://program.berlinbuzzwords.de/media/avatars/9RBACT_Ct5ln72.webp", "biography": "Naci Simsek is a Senior Customer Success Technical Manager at Ververica with over 17 years of experience in IT and Telecom. He began his career as a Customer Support Engineer at Nortel Networks, advancing through roles as Software Engineer, Engineering Team Lead, Project Manager, and Solutions Architect at Huawei. Over nearly a decade, he specialized in customer-facing big data solutions as a Platform Engineer, BI Engineer, and Data Engineer. 
In his current position, he supports customers in leveraging Apache Flink for real-time data streaming across on-premises and cloud environments.\r\n\r\nHe holds a Bachelor\u2019s degree in Computer Engineering from Ege University, an MBA from Bahcesehir University, and the PMP\u00ae certification.", "public_name": "Naci Simsek", "guid": "c2630fba-ff0a-5bdc-8caa-45434716b760", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/9RBACT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/JYQZ8Y/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/JYQZ8Y/", "attachments": []}, {"guid": "d10cf4d1-6e69-562e-a1f8-b77d1e86fac7", "code": "L98Q7L", "id": 85103, "logo": null, "date": "2026-06-08T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-85103-apache-spark-declarative-pipelines-in-action", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/L98Q7L/", "title": "Apache Spark Declarative Pipelines in Action", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Learn Spark 4.1's brand-new Declarative Pipelines, a paradigm shift replacing imperative code with simple declarations. We'll build a real-time data pipeline together, processing streaming ADS-B flight data from tens of thousands of aircraft overhead.", "description": "Spark Declarative Pipelines: Building Data Workflows with Spark 4.1's Game-Changing Feature\r\n\r\nApache Spark 4.1 introduces Spark Declarative Pipelines (SDP), a paradigm shift that transforms how data engineers design and maintain complex data workflows. This hands-on session provides a comprehensive introduction to SDP, demonstrating how declarative configuration can replace traditional imperative Spark code for common data pipeline patterns.\r\n\r\nI will present a live example using an open-sourced PySpark data source I built with OpenSky founders from Oxford and ETH Zurich. 
In just a few lines of code, you'll create a continuous data pipeline with streaming tables ingesting real ADS-B flight data from aircraft overhead\u2014from tiny Cessnas to massive Airbus A380s. No complex \"glue code\" for incremental ingestion\u2014just define what your pipeline should do while Spark figures out how to do it.\r\n\r\nUsing streaming tables and materialized views, we'll layer on AI-powered analytics, turning natural language questions like \"Show me flights above 30,000 feet over California\" into instant SQL queries against live crowdsourced IoT data. I'll demonstrate with a forever-free cloud environment where every attendee can replicate this example hands-on. Attendees will leave with practical knowledge to immediately begin experimenting with SDP and best practices for modernizing their pipeline development.", "recording_license": "", "do_not_record": false, "persons": [{"code": "SWN3QG", "name": "Frank Munz", "avatar": "https://program.berlinbuzzwords.de/media/avatars/SWN3QG_MYQSQca.webp", "biography": "I bring DevEx into products, tech into marketing, and storytelling into demos at Databricks. I presented at the top tier 1 conferences on every continent except Antarctica and built and delivered hands-on workshops for some ten thousand customers per year. \r\n\r\nI leverage AI tools to create compelling technical content, from voice-activated data queries using Databricks Genie to AI-generated demo content with synthetic speech, enhancing developer-focused marketing campaigns.\r\n\r\nI'm a published author with a Ph.D. in Computer Science (summa cum laude from TU Munich) with over 25 years of expertise in data & AI, cloud computing, and scientific research. Cloud Technologist of the Year (Oracle) and Developer Champion. \r\n\r\nAt AWS, I kickstarted developer relations in Central EMEA and tripled the size of the team. Presented at Devoxx, JavaOne, re:Invent, KubeCon, Oracle World and Data + AI Summit. 
\r\n\r\nI believe it\u2019s the combination of compelling storytelling and deep technical understanding that allows me to simplify complex concepts and create tech demos that truly resonate with audiences.", "public_name": "Frank Munz", "guid": "2739bf9a-a43e-5600-afde-cec4822e520f", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/SWN3QG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/L98Q7L/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/L98Q7L/", "attachments": []}, {"guid": "bf13717a-8384-518a-8e0c-cd08a5005855", "code": "ZCF89D", "id": 91473, "logo": null, "date": "2026-06-08T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-91473-why-choose-one-multi-engine-analytics-with-apache-wayang", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZCF89D/", "title": "Why Choose One: Multi-Engine Analytics with Apache Wayang", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Choosing the best engine for each data task sounds right, but in modern data stacks doing so requires expertise and effort. Apache Wayang, a recently graduated TLP, addresses this by decoupling logical dataflows from execution engines. From big data platforms to SQL and ML engines, Wayang enables cross-platform execution that maximizes performance.", "description": "Modern analytics pipelines frequently span databases, big data engines, and machine learning frameworks. Connecting these systems manually leads to complex orchestration, high data movement cost, and platform-specific rewrites. This challenge also appears in agent driven workflows where different steps of a task naturally map to different engines.\r\n\r\nApache Wayang is a recently graduated Apache Top Level Project that provides a unified data analytics framework for cross-platform execution. 
Pipelines are expressed with platform independent operators using Java, Scala, Python, or SQL APIs. A cross-platform optimizer then maps operators to execution backends such as Spark, Flink, JDBC databases, and ML systems, and produces execution plans that may span multiple engines. It models operator and data movement cost and supports runtime re optimization when estimates are wrong. In practice, this lets developers write a pipeline once and run it efficiently across multiple engines without hard-wiring platform choices.\r\n\r\nThe talk is technical and system focused, aimed at practitioners working with heterogeneous data stacks. It has three parts:\r\n\r\n1. Motivation (10-15min)\r\nWhy single engine execution is often not enough. Concrete ETL, ML, and agent-based workflows that require multiple systems and create optimization and integration challenges.\r\n\r\n2. System architecture and optimizer (20-25min)\r\nWayang\u2019s platform agnostic plans, operator mappings, cross-platform data movement handling, and stage-based execution model. How the cost-based optimizer inflates plans, evaluates alternatives, and selects mixed engine execution strategies. Brief coverage of SQL, ML, and multi-language UDF support.\r\n\r\n3. Project history, status, and next steps (5-10min)\r\nFrom multi year cross-platform analytics research to Apache and recent Top Level Project graduation. 
Extensibility for new platforms and current work on improved cost models and optimizer enhancements.\r\n\r\nAttendees will gain a practical understanding of how cross-platform analytics can be executed efficiently and how to design pipelines that are not locked to a single processing engine.", "recording_license": "", "do_not_record": false, "persons": [{"code": "9CSPHT", "name": "Zoi Kaoudi", "avatar": "https://program.berlinbuzzwords.de/media/avatars/9CSPHT_XN2x7FG.webp", "biography": "Zoi Kaoudi is an Associate Professor in the Computer Science Department at the IT University of Copenhagen (ITU) and the VP/PMC Chair of Apache Wayang. Her current research focuses on (i) leveraging machine learning techniques for data-intensive systems, (ii) improving the performance and ease of use of machine learning systems, and (iii) advancing knowledge graph embeddings with ontologies and logical reasoning. Before joining ITU, she held positions in various places around the world. She has worked as a Senior Researcher at the Technical University of Berlin, as a Scientist at the Qatar Computing Research Institute (QCRI), as a visiting researcher at IMIS-Athena Research Center, and as a postdoctoral researcher at Inria Saclay. She received her Ph.D. from the National and Kapodistrian University of Athens in 2011. She has co-authored articles in both database and ML communities and served as a member of the Program Committee for several international database conferences. 
She has received the best demonstration award at ICDE 2022 for her work on training data generation for learning-based query optimization.", "public_name": "Zoi Kaoudi", "guid": "f3034547-f823-5a47-a0f6-217e67b88aa6", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/9CSPHT/"}, {"code": "MSFKT3", "name": "Haralampos Gavriilidis", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MSFKT3_MsOD7H1.webp", "biography": "Haralampos (Harry) Gavriilidis is a data systems researcher and incoming postdoctoral researcher at ICSI / UC Berkeley, focusing on cross-platform and federated query processing, optimizer design, and execution across heterogeneous engines. He recently earned his PhD from TU Berlin, where he built systems for decentralized federated query processing and adaptive cross-system data transfer. His work has appeared at leading data management conferences such as SIGMOD, VLDB, and ICDE.\r\n\r\nBeyond papers and prototypes, Harry enjoys explaining complex data systems to real audiences. He has spoken at practitioner conferences such as PGConf and also won a science slam competition for making database research stage-friendly. He is also an active community volunteer, supporting events like Berlin Buzzwords, FOSS Backstage, and Flink Forward. 
Before academia, he worked as a full stack web developer.", "public_name": "Haralampos Gavriilidis", "guid": "6377826d-4045-5144-887e-e3e984444385", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/MSFKT3/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZCF89D/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZCF89D/", "attachments": []}, {"guid": "9b07ab11-c17b-5a4c-8f7f-9231837b9cb4", "code": "U3NJ9P", "id": 90845, "logo": null, "date": "2026-06-08T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-90845-event-driven-agents-with-complex-event-processing-in-flink", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/U3NJ9P/", "title": "Event-driven Agents with Complex Event Processing in Flink", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Event-driven Agents calling LLMs can be combined with Pattern Recognition and Anomaly Detection in Apache Flink in smart ways to increase cost efficiency, avoid hallucinations and enforce predictable, deterministic behavior. Specifically in a business process context, this architecture provides opportunities for continuous real-time process mining.", "description": "Specialized, event-driven AI Agents, in contrast to planning agents, provide unique value for continuously monitoring real-time event pipelines, business processes or technical logs in Apache Kafka. 
Streaming Agents can invoke LLMs directly from Flink for each event, but this approach can be very costly for high-volume Kafka topics, and lead to non-deterministic outcomes.\r\n\r\nWe showcase how Streaming Agents can be combined with Pattern Recognition and Anomaly Detection in Apache Flink in smart ways to increase cost efficiency, avoid hallucinations and enforce predictable, deterministic behavior.\r\n\r\nHigh-volume event pipelines can be filtered very efficiently with Complex Event Processing (CEP) as a core library in Apache Flink for pattern recognition of sequences of events, as well as traditional ML models with statistical approaches to detect anomalies for critical errors and business opportunities.\r\n\r\nStreaming Agents can then invoke LLMs in a second step, to classify or further analyze the detected patterns or anomalies, suggesting or triggering actions. Due to these specialized tasks, small models often perform great in this context to achieve deterministic outcomes.\r\n\r\nSpecifically in a business process context, this architecture provides opportunities for real-time process mining for ERP, manufacturing, supply chain and financial data to detect process issues and SLA violations earlier, reducing down time and saving costs by taking action immediately.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WJYBU3", "name": "Steffen Hoellinger", "avatar": "https://program.berlinbuzzwords.de/media/avatars/WJYBU3_j4bE6ts.webp", "biography": "Steffen is a Field CTO at Confluent, where he helps global enterprises harness the full potential of real-time data and AI by bringing Apache Flink and Apache Kafka to the core of their architectures. 
He partners with customers to modernize their data infrastructure, integrating AI models, metadata, data governance and data lineage to unlock new capabilities for agentic AI powered by continuous intelligence on streaming data.", "public_name": "Steffen Hoellinger", "guid": "54ae90a2-d9fa-574c-899d-cdaec5f1ffb9", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/WJYBU3/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/U3NJ9P/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/U3NJ9P/", "attachments": []}, {"guid": "dec4262a-9d1d-5cac-8f27-c2a50647ec34", "code": "3A9DSM", "id": 90504, "logo": null, "date": "2026-06-08T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-90504-floe-policy-based-table-maintenance-for-apache-iceberg", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/3A9DSM/", "title": "Floe: Policy-Based Table Maintenance for Apache Iceberg", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Iceberg maintenance procedures work. Orchestrating them across hundreds of tables is the problem. Floe is an open-source system that treats maintenance as policy: glob patterns, schedules, and health-driven triggers that gate operations on real table metrics. Supports 7 catalogs, executes via Spark or Trino.", "description": "Every Iceberg table needs maintenance, but catalogs don't execute and engines don't orchestrate. Teams end up with scripts that become DAGs that become technical debt. Nobody knows which tables are healthy, which are overdue, or what ran last.\r\n\r\nFloe is an open-source, policy-based maintenance system for Iceberg. Define rules with glob patterns, schedules, and health-driven triggers that gate operations based on real table metrics: small file percentage, snapshot count, delete file ratio, and partition skew. Priority resolves conflicts when patterns overlap. 
A maintenance debt score ranks tables by urgency so the most critical work runs first within your resource budget.\r\n\r\nFloe connects to REST, Polaris, Lakekeeper, Gravitino, DataHub, Hive Metastore, and Nessie catalogs, then delegates execution to Spark or Trino. A built-in dashboard shows table health trends, operation history, and policy coverage.\r\n\r\nThis talk covers the policy model, health-driven maintenance planning, and a live demo.", "recording_license": "", "do_not_record": false, "persons": [{"code": "YRGYZP", "name": "Neelesh Salian", "avatar": "https://program.berlinbuzzwords.de/media/avatars/YRGYZP_B5oF8Nf.webp", "biography": "Neelesh Salian builds data platforms. He has led lakehouse and distributed systems work at Datavant, Stitch Fix, dbt Labs, Salesforce, and Cloudera, with a focus on Spark, streaming, and Apache Iceberg in production. He created Floe to solve a problem he kept encountering: orchestrating Iceberg table maintenance at scale.", "public_name": "Neelesh Salian", "guid": "503d7948-aa30-50fa-844e-8fd2b9fb8a1c", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/YRGYZP/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/3A9DSM/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/3A9DSM/", "attachments": []}], "Frannz Salon": [{"guid": "2c2fdeec-abcf-57d2-b9fc-52271b78a62a", "code": "37ZLHV", "id": 91330, "logo": null, "date": "2026-06-08T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Frannz Salon", "slug": "bbuzz26-91330-towards-chunk-less-rag", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/37ZLHV/", "title": "Towards Chunk-less RAG", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Retrieval-Augmented Generation (RAG) systems rely on pre-chunked documents, tying retrieval to arbitrary boundaries. This talk explores an experimental approach that surfaces semantically relevant text spans, without chunking. 
We'll share surprising findings and examine whether this technique points toward a viable chunk-free retrieval paradigm.", "description": "RAG systems have become foundational for grounding LLM outputs in factual knowledge, but they share a common limitation: semantic search operates at the chunk level, not the token level.\r\n\r\nThis talk presents an experimental investigation into whether we can bypass chunking entirely by extracting token-level relevance directly from dense embedding models. The core insight is simple: by preventing the embedding pooling step and computing cosine similarity between every query token and document token, we can generate relevance heatmaps that highlight exactly which spans matter for a given query, from which we can extract relevant text spans.\r\n\r\nThe session will walk through the complete pipeline:\r\n\r\n* Extracting token-level embeddings from the last hidden layer of dense embedding models (specifically Qwen3-Embedding-0.6B)\r\n* Computing relevance matrices via normalized dot products between query and document token vectors\r\n* Collapsing multi-token query representations into per-document-token scores\r\n* Designing a clustering algorithm that identifies relevance peaks, groups nearby high-scoring tokens, and extends matches to semantic boundaries\r\n* Comparing results against purpose-built late-interaction models (ColBERT variants, Jina Embeddings v4)\r\n\r\nThe experimental results reveal that the extracted spans show strong F1 scores when evaluated against ground truth answers in test documents. 
And the comparison between models shows that, despite being trained for pooled sentence embeddings, Qwen3's token-level representations outperform ColBERT-style models specifically designed for multi-vector matching.\r\n\r\nHowever, the approach surfaces two major challenges: storage requirements balloon by roughly 900\u00d7 compared to traditional chunking and the model's decoder-only architecture creates attention patterns that bias relevance toward document endings.\r\n\r\nThis is explicitly experimental work shared in the spirit of exploring new directions, not presenting a production solution. The goal is to spark discussion about whether the chunking paradigm is a necessary constraint or an artifact of current tooling, what modifications to model training or inference could make span-level retrieval practical at scale, and the parallelism between this approach and promising knowledge graph retrieval strategies.", "recording_license": "", "do_not_record": false, "persons": [{"code": "QN79F7", "name": "Carles Onielfa", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QN79F7_S0PZDI4.webp", "biography": "Machine Learning Engineer at Progress", "public_name": "Carles Onielfa", "guid": "bbbc20b7-5553-5e76-8a9c-4769a90b2cde", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/QN79F7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/37ZLHV/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/37ZLHV/", "attachments": []}, {"guid": "65f5ffa3-6687-5bb9-a5ac-79a43ce9f13e", "code": "Y7YVKP", "id": 89187, "logo": null, "date": "2026-06-08T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-89187-no-0-day-required-just-target-the-ai-coding-assistant", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/Y7YVKP/", "title": "No 0-day required, just target the AI coding assistant!", "subtitle": "", "track": null, "type": "Talk", "language": "en", 
"abstract": "Discover how attackers can manipulate AI coding assistants through hidden text, typosquatting and code errors. Learn to detect concealed instructions and set up trusted dependencies to keep unsafe code out of your environment.", "description": "Do you trust your AI coding assistant? What if I told you that attackers have found ways to manipulate it and attack your code? With everyone now using AI coding assistants it\u2019s time to look at the risks!\r\n\r\nDuring this talk I\u2019ll show you several new techniques attackers are already using. This will range from hidden messages (ASCII smuggling) to abusing mistyping and characters that look the same (typosquatting). I will also show how an LLM can make mistakes when generating code (hallucinations). Did you know that a smart attacker can abuse this too?\r\n\r\nWhen you join this talk, you\u2019ll learn how to spot hidden text in your instruction file and prompts. I will also explain how to set up a trusted dependency repository to prevent the wrong code from entering your production environment!", "recording_license": "", "do_not_record": false, "persons": [{"code": "3XGYLQ", "name": "Leo Visser", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3XGYLQ_fVgTbrr.webp", "biography": "Since 2012 I\u2019ve been working in the field of IT in different positions. Now I am the product lead Automation + AI for the transformation department of OGD ict-diensten. I\u2019m responsible for the propositions regarding Power Platform, AI and Automation. Besides that I also consult a multitude of customers about these topics and their cloud platforms. Due to this broad range of topics I work with on a daily basis, I\u2019ve developed a passion for connecting them all together. When I\u2019m not working on improving specific systems I\u2019m looking into how the systems can work together to create even more value. 
I also write on my blog https://www.autosysops.com about the solutions I find.", "public_name": "Leo Visser", "guid": "b040a505-90fb-501a-8431-e4401b9e1d8c", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/3XGYLQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/Y7YVKP/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/Y7YVKP/", "attachments": []}, {"guid": "a2078363-f5d6-5126-bc99-b0199a3d286d", "code": "AT33SV", "id": 90997, "logo": null, "date": "2026-06-08T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-90997-oss-security-lessons-from-10-years-at-apache-solr", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/AT33SV/", "title": "OSS Security: Lessons from 10+ Years at Apache Solr", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "How are security decisions big and small made in a distributed open source community?  Come find out at this session where users will gain insights and examples (both good and bad) to take back to their own projects.", "description": "The security landscape is ever-evolving; as threats emerge and best practices shift, open source projects must balance backwards-compatibility and their own volunteer-driven nature against the practical needs of modern security. For Apache Solr, a project that began without built-in authentication or authorization, this journey has been particularly instructive.\r\n\r\nThis talk traces the evolution of security in Apache Solr from its early days through the present. We'll examine the major inflection points that shaped the project's security posture: the introduction of a pluggable authentication and authorization framework, the consideration of alternatives like Apache Shiro, formative CVE reports that exposed critical vulnerabilities, and significant deprecations like the Data Import Handler (\"DIH\") that sacrificed popular features for security. 
Along the way, we'll discuss the community processes and dynamics involved in each decision, along with the trade-offs of major choices (e.g. breaking changes vs. user safety).\r\n\r\nBy the end of this talk, attendees will understand how security priorities have evolved in a major open source project and gain insights and examples (both good and bad!) to take back to their own applications and projects.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8RMBAS", "name": "Jason Gerlowski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8RMBAS_pAhJVSw.webp", "biography": "Jason is a software developer born and raised in Pittsburgh, PA in the United States, where he's a proud husband and father of two.  He's worked with Apache Solr for 10+ years, with search experience going back even further.", "public_name": "Jason Gerlowski", "guid": "1bd8790f-b6c9-5232-8894-a42a279a091b", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/8RMBAS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/AT33SV/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/AT33SV/", "attachments": []}, {"guid": "a30a6a36-1ed2-5e91-8ccb-af86042b575b", "code": "LQBMVZ", "id": 86591, "logo": null, "date": "2026-06-08T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-86591-livecoding-data-visualisations-with-streamlit", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/LQBMVZ/", "title": "Livecoding Data Visualisations with Streamlit", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Streamlit is an open source Python library that lets you present data to people, without having to become a frontend developer. It's easy to learn, fast to build with, and should be in every data-wrangler's toolkit. In this talk you'll learn everything you need to know to get started.", "description": "It's far too hard to visualise data. 
If you've got some data you want to share with people, it shouldn't need a React expert just to generate a chart. It shouldn't take a 3-tier architecture to give people an interactive view they can explore. But all too often it does.\r\n\r\nThis is where Streamlit hits a design sweet spot. It's a simple framework that makes it incredibly easy to start with regular Python data-processing code, and get to a clean, professional visualisation, in minutes. Even if you're not a \"frontend person\", you can get a polished, interactive user interface in front of people with just a few extra lines of code.\r\n\r\nIn this live-coding session you'll learn everything you need to get started with Streamlit. We'll start completely from scratch, explore the core parts of Streamlit's API, and see how any backend developer or data scientist can actually *show* their work faster than you can say, \"JSON encoding\".", "recording_license": "", "do_not_record": false, "persons": [{"code": "KWNPB9", "name": "Kris Jenkins", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KWNPB9_tMVZMlh.webp", "biography": "Kris Jenkins is a Lead Developer Advocate for Snowflake, the host of the Developer Voices podcast, and a lifelong programmer. 
Over the course of his career he's been the CTO of a gold-trading company, a functional programming contractor specialising in Haskell, and a regular hackathon organiser.\r\n\r\nHe loves good software architecture, innovative new programming languages, and most of all, building stuff.", "public_name": "Kris Jenkins", "guid": "1b4d71cb-3fdb-5340-b283-e5fd90f480bd", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/KWNPB9/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/LQBMVZ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/LQBMVZ/", "attachments": []}, {"guid": "4f3dd611-151f-5224-a7a8-51fd63e5b933", "code": "ERWWUY", "id": 91311, "logo": null, "date": "2026-06-08T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Frannz Salon", "slug": "bbuzz26-91311-scientific-data-under-threat-in-today-s-america", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ERWWUY/", "title": "Scientific Data Under Threat in Today\u2019s America", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "A brief look at how scientific data came under political pressure during the presidency of Donald Trump, and how scientists and data repositories in Europe worked to protect public access to evidence-based research, ensuring access for data science in a well-structured way.", "description": "What happens when the data that is used by climate research, public health, and civil rights enforcement becomes politically inconvenient? 
This talk examines the vulnerability of scientific data during the presidency of Donald Trump, a period marked by the removal of government web pages, restrictions on agency communications, and proposed budget cuts to research institutions.\r\nThe talk highlights the role of [PANGAEA \u2013 Data Publisher for Earth & Environmental Science](https://www.pangaea.de/) in ensuring long-term preservation and open access to geoscientific datasets, demonstrating how trusted repositories can safeguard publicly funded research and make it globally accessible despite shifting political climates. By assigning persistent identifiers (DOIs), rich metadata, use of terminologies, standardized formats, and open licenses, data gets FAIR \u2014 Findable, Accessible, Interoperable, and Reusable \u2014 and readily integrable into computational and AI workflows, enabling large-scale analysis, machine learning applications, and reproducible science across disciplines.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HRJC87", "name": "Uwe Schindler", "avatar": "https://program.berlinbuzzwords.de/media/avatars/HRJC87_iOiv7er.webp", "biography": "Uwe is committer and PMC member of Apache Lucene and Apache Solr. His main focus is on development of Lucene Core. He implemented fast numerical search and is maintaining the new attribute-based text analysis API. He studied Physics at the University of Erlangen-Nuremberg and works as managing director for SD DataSolutions GmbH in Bremen, Germany, a company that provides consulting and support for Apache Lucene, Elasticsearch, and Apache Solr. He also works for \u201cPANGAEA \u2013 Publishing Network for Geoscientific & Environmental Data\u201d where he implemented the portal\u2019s geo-spatial retrieval functions with Lucene Java. 
Uwe has given talks about Lucene at various international conferences like the previous Berlin Buzzwords, ApacheCon EU/US, Lucene Revolution, Lucene Eurocon, and various local meetups.", "public_name": "Uwe Schindler", "guid": "66a8fa69-bb21-5e95-bc1f-c8a092640daf", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/HRJC87/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ERWWUY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ERWWUY/", "attachments": []}, {"guid": "90ebe69b-6257-5594-8536-2da30939fee9", "code": "YWLB7A", "id": 90600, "logo": null, "date": "2026-06-08T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-90600-let-llms-wander-engineering-rl-environments", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/YWLB7A/", "title": "Let LLMs Wander: Engineering RL Environments", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "What if, instead of learning only from examples, Language Models could explore crafted Environments, little worlds where they can act and improve autonomously? \r\n\r\nJoin me to see how **Reinforcement Learning Environments** work, how to build them with open-source tools, and how to use them to **evaluate and train LLMs/Agents**.", "description": "Since the release of reasoning Language Models like DeepSeek R1, improving model capabilities is moving beyond static examples (Supervised Fine-Tuning) to interaction via Reinforcement Learning.\r\n\r\nTo enable this, we need **RL Environments**: controlled worlds where models can act, get rewards, and learn.\r\nAn environment is more than a dataset. It is a piece of software that orchestrates interactions with the model, manages state, defines rewards, and verifies outcomes. \r\n\r\nIn this talk, I will walk you through my journey exploring this emerging space from a software engineering perspective.\r\n\r\n1. 
I will start by mapping classic Reinforcement Learning concepts to Language Models.\r\n\r\n2. I will then introduce **Verifiers**, an open-source library for building environments as software artifacts.\r\n\r\n3. Based on Verifiers, we\u2019ll see concrete **design patterns** that range from simple single-turn tasks, to multi-turn games, to environments for tool-using agents that interact with external systems.\r\n\r\n4. I\u2019ll share practical experiences using environments for **evaluation and training Small Language Models**.\r\n\r\nBy the end of the session, attendees will be able to start building their own Reinforcement Learning environments, little worlds for LLMs. I'll also share the joys, frustrations, and lessons learned along the way.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZVFVZ8", "name": "Stefano Fiorucci", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ZVFVZ8_6x21TPn.webp", "biography": "\ud83e\uddd1\u200d\ud83d\ude80 AI Engineer/explorer. Passionate about Language Models, open source, and knowledge sharing.\r\n\r\n\ud83d\udc68\u200d\ud83d\udcbb I work on the open-source Haystack LLM orchestration framework, contributing code, tutorials and demos.\r\n\r\n\ud83e\udded Fascinated by all things LLMs. From inference and orchestration (Agents, RAG) to post-training techniques. I frequently experiment with training small Language Models and Reinforcement Learning. 
I like sharing what I learn.", "public_name": "Stefano Fiorucci", "guid": "81ffb24d-2b43-5c82-8a09-59ce2a145e25", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/ZVFVZ8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/YWLB7A/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/YWLB7A/", "attachments": []}, {"guid": "e946079e-bf17-5d87-b032-f1af3c66c8c3", "code": "BGDMFD", "id": 87967, "logo": null, "date": "2026-06-08T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-87967-spruce-it-up-open-source-greenops-at-scale", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/BGDMFD/", "title": "SPRUCE it up! Open Source GreenOps at scale", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "GreenOps adoption is stalled by missing data from cloud providers. SPRUCE is an open-source, scalable platform built on Apache Spark that enriches cloud usage reports with open models to quantify carbon impact, build insights, and help teams reduce both emissions and cloud spend.", "description": "The environmental impact of ICT\u2014and cloud computing in particular\u2014is rapidly increasing, driven largely by the rise of AI workloads. While **FinOps** has become a standard practice for managing cloud costs, its environmental counterpart, **GreenOps**, is still struggling to gain traction. A key obstacle is the lack of transparent, actionable sustainability data from cloud service providers.\r\n\r\nIn this talk, we introduce [SPRUCE](https://opensourcegreenops.cloud/), a scalable open-source platform designed to implement GreenOps at scale. 
Built on Apache Spark and leveraging open data and models, SPRUCE processes large volumes of cloud usage data, enriching provider reports to quantify environmental impact and generate actionable insights through reports and visualisations.\r\n\r\nAttendees will gain a practical understanding of GreenOps, the current data and tooling landscape, and how a big-data\u2013driven approach with Apache Spark can help teams measure and reduce both cloud carbon emissions and costs.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UA3XLB", "name": "Julien Nioche", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UA3XLB_7eLQsxV.webp", "biography": "Julien runs [DigitalPebble](https://digitalpebble.com), a consultancy specialised in **Green Software**, **GreenOps** and **Digital Sustainability**. With 20+ years experience as a software developer, he has been involved in many open source projects, mainly at the [Apache Software Foundation](https://apache.org/). Combining a personal passion for sustainability and environmental issues with his technical skills, Julien helps organisations reach their decarbonisation targets.\r\nJulien is a certified FinOps practitioner from the FinOps foundation and is a member of Boavizta, the Green Software Foundation and the Apache Software Foundation.\r\nHe lives in Bristol and outside of work, enjoys music, furniture making, cycling and rewilding. 
Julien was at the very first BerlinBuzzwords and has given several talks there over the years.", "public_name": "Julien Nioche", "guid": "603a6c4a-a171-5e99-8cd0-5b8a0805f800", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/UA3XLB/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/BGDMFD/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/BGDMFD/", "attachments": []}, {"guid": "3307cd9c-ebb2-5d8a-b7ec-b31f46c3370b", "code": "QEXDKB", "id": 91270, "logo": null, "date": "2026-06-08T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-91270-observability-s-sixth-sense-detecting-anomalies-in-metrics", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/QEXDKB/", "title": "Observability\u2019s Sixth Sense: Detecting Anomalies in Metrics", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this talk, we look at anomaly detection as a complementary way of working with metrics. Instead of relying on predefined limits, anomaly detection focuses on identifying behavior that deviates from what is normally observed over time. The focus is on how developers can interpret these signals, where anomaly detection is useful, and where it is not.", "description": "Modern systems produce more metrics than any single person can reason about. 
As systems grow and change, defining fixed thresholds becomes harder and unexpected behavior often appears without clearly crossing an alert boundary.\r\nUsing a short, live walkthrough with real metric data, the talk shows how anomalies can surface gradual changes, unusual patterns and subtle shifts that are easy to miss in dashboards.\r\nThe session is exploratory and practical, aimed at developers who work with metrics and want additional ways to understand system behavior without introducing complex models or heavy tooling.", "recording_license": "", "do_not_record": false, "persons": [{"code": "QHANR8", "name": "Diana Todea", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QHANR8_H6ehRUm.webp", "biography": "Diana is a Developer Experience Engineer at VictoriaMetrics. She has worked as a Senior Site Reliability Engineer focused on Observability. She is an active member of the OpenTelemetry CNCF open source project, co-organizer of Cloud Native Days Romania, co-lead of neurodiversity working group (part of CNCF initiative merge-forward) and supports underrepresented groups in tech.", "public_name": "Diana Todea", "guid": "440264f4-1d28-555f-ae7d-97dee667f3a2", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/QHANR8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/QEXDKB/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/QEXDKB/", "attachments": []}]}}, {"index": 3, "date": "2026-06-09", "day_start": "2026-06-09T04:00:00+02:00", "day_end": "2026-06-10T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "7a9d07cb-21f8-51f1-8852-4a9a180b0752", "code": "7GUTXM", "id": 91314, "logo": null, "date": "2026-06-09T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz26-91314-reviving-phonetic-algorithms-for-better-search-relevance", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7GUTXM/", "title": "Reviving phonetic algorithms for better search 
relevance", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Fuzzy search is a double-edged sword: it fixes typos but drowns users in noise on large corpora. At INA, we revived ancient phonetic algorithms to improve relevance. This session compares fuzzy vs. phonetic search on a massive archive, showing how \"sounding right\" beats \"spelling close.\"", "description": "When users are unsure of a spelling, fuzzy search is the standard engineering solution. However, at the scale of the French National Audio-visual Institute, we found that standard fuzziness hits a wall. On a massive corpus, \"approximate\" matching retrieves a paralyzing amount of noise, degrading the user experience.\r\n\r\nTo solve this, we looked back to move forward. We revived and re-implemented \"ancient\" phonetic algorithms, some dating back decades, to test if matching by sound could outperform matching by character distance.\r\n\r\nIn this talk, we share our journey in tuning relevance for the French language, which is notoriously difficult due to silent letters and homophones. We will cover:\r\n\r\n- The Fuzziness Trap: Why increasing edit distance failed to solve our precision/recall trade-off.\r\n- Algorithm Showdown: A comparative analysis of standard Fuzzy Querying vs. 
Phonetic Analysis (e.g., Soundex, Beider-Morse, Metaphone) within our search pipeline.\r\n- Implementation: How we integrated these phonetic tokens into our indexing strategy to filter noise without losing relevant results.\r\n\r\nYou will leave with a clear understanding of when to abandon standard fuzziness and how to leverage phonetic search to clean up your own noisy results.", "recording_license": "", "do_not_record": false, "persons": [{"code": "CQLPDE", "name": "Pietro Mele", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CQLPDE_dwRmDMi.webp", "biography": "Italian, adopted by France not long ago, I am a constant learner, dedicated to computer science and discovery, whether uncovering solutions or gaining insights.", "public_name": "Pietro Mele", "guid": "26efca5d-0c58-5034-804b-9a0312f01178", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/CQLPDE/"}, {"code": "JNFKMQ", "name": "Radu Pop", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JNFKMQ_BxhO8Hb.webp", "biography": "Radu provides Consulting Services as Solutions Architect at Adelean. He handles projects around Elasticsearch and Adelean\u2019s A2 search technology. He oversees the integration and evolution of search engines within large e-commerce platforms, marketplaces or organizations' data lakes. Prior to joining Adelean, Radu acquired a solid experience in Web archiving, operating large scale crawling systems in the context of several European research projects. 
He holds a PhD in Computer Science and an MSc in Distributed Systems.", "public_name": "Radu Pop", "guid": "adbcba05-e957-5634-87ae-766e99e33c4c", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/JNFKMQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7GUTXM/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7GUTXM/", "attachments": []}, {"guid": "e19bc1a8-90e5-5b99-ad7d-8d63308c921d", "code": "333TRT", "id": 91536, "logo": null, "date": "2026-06-09T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91536-from-inverted-index-to-columnar-vectorized-execution-search", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/333TRT/", "title": "From Inverted Index to Columnar Vectorized Execution Search", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Search engines are converging with analytical data systems. This talk explores how columnar data layouts, SIMD-accelerated execution, and bulk-oriented processing are reshaping search internals. We examine where traditional models fall short and how hardware-aware techniques from analytics engines are defining the next search infrastructure.", "description": "Modern search workloads increasingly blend text retrieval with aggregations, vector search, and real-time analytics, pushing traditional inverted-index architectures beyond their original design. 
This session examines how techniques from columnar databases and high-performance analytics engines are being adopted to meet these demands.\r\n\r\nWe explore three key shifts: how columnar storage improves cache locality for efficient aggregation and filtering; how SIMD and vectorized computation accelerate scoring, filtering, and similarity operations on modern CPUs; and how bulk ingestion and execution pipelines reduce coordination overhead while maximizing hardware utilization.\r\n\r\nDrawing from evolving open-source search ecosystems and real-world engineering efforts, we analyze where row-oriented execution falls short, discuss hybrid models combining inverted indexes with columnar processing, and explore treating search queries as vectorized data pipelines.\r\n\r\nTargeting developers and researchers interested in search internals, distributed systems performance, and the retrieval-analytics intersection, attendees will gain practical understanding of how hardware-aware design influences search architecture today, the trade-offs of integrating columnar and vectorized execution into retrieval systems, and where search infrastructure is heading next.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EJMU7A", "name": "Ankit Jain", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EJMU7A_UThCmzc.webp", "biography": "Ankit Jain is a Software Engineer on the Amazon OpenSearch Service team, leading performance and scalability initiatives for search infrastructure. 
He is an active maintainer and committer for the Apache Lucene and OpenSearch projects, with hands-on experience operating large-scale OpenSearch deployments and solving complex production performance challenges.", "public_name": "Ankit Jain", "guid": "50ba67bb-4698-5f06-8688-ff34fadb00fd", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/EJMU7A/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/333TRT/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/333TRT/", "attachments": []}, {"guid": "2877fbb4-8149-572e-a0f9-1ed6a783bf2c", "code": "M8DR9V", "id": 91423, "logo": null, "date": "2026-06-09T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91423-when-better-retrieval-makes-agents-worse", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/M8DR9V/", "title": "When better retrieval makes agents worse", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Agentic systems can break not because information is missing, but because persuasively wrong context gets promoted into action. We examine a recurring pattern: retrieval metrics improve while agent behavior degrades as distractors enter multi-step loops. We show why relevance, reliability, and security are tightly connected in agentic retrieval.", "description": "In agentic workflows, retrieval is no longer just ranking for a human reader; it is context injection into reasoning and tool use. That shift changes the failure mode. Plausible but incorrect evidence can degrade outcomes disproportionately, and in noisy settings, longer reasoning can make answers worse rather than better. This is inverse scaling under noise: more capable reasoning produces more confident mistakes. 
In iterative agent loops, those mistakes are recycled and amplified, turning small retrieval defects into workflow-level failures.\r\n\r\nIn this talk we'll break down the main failure modes, including plausible distractors, error compounding across steps, and the gap between traditional retrieval metrics and real task utility. We'll present design patterns for robust agentic retrieval: stricter evidence selection, sufficiency checks before acting, and explicit pause/retry/escalate behavior when confidence is not warranted. We'll also connect these patterns to challenges in open agent tooling ecosystems, where untrusted context has shown that retrieval is a threat surface as well as a ranking problem.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EJLRPN", "name": "Lester Solbakken", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EJLRPN_O4r96yT.webp", "biography": "Lester Solbakken is a Founding Engineer at HORNET.dev, where he builds production-grade retrieval infrastructure for AI agents. He previously pursued a PhD in Artificial Intelligence and Machine Learning, with research centered on neural networks, exploratory data analysis and self-organizing systems. 
He speaks about building reliable, high-performance AI systems that bridge research and real-world deployment.", "public_name": "Lester Solbakken", "guid": "1661eca1-6cf2-5f54-bf89-155bd1eff797", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/EJLRPN/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/M8DR9V/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/M8DR9V/", "attachments": []}, {"guid": "3a62c7e9-52a4-5dd6-9518-887488316d90", "code": "MHSTAZ", "id": 85920, "logo": null, "date": "2026-06-09T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-85920-keeping-data-private-in-real-time-pipelines", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MHSTAZ/", "title": "Keeping data private in real-time pipelines", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Real-time data is awesome\u2026 until you realize it\u2019s leaking names, emails, and locations. In this talk, you\u2019ll learn how to keep streaming data private, from simple masking to tricks that beat re-identification. All with live demos and some juicy real-world stories.", "description": "We all love real-time data \u2014 clicks, payments, rides, messages \u2014 but most of it comes with a catch: it contains personal information we\u2019re not supposed to leak, such as names, emails, locations, or even small clues that can identify someone. The challenge: how do we keep streaming data useful and safe at the same time?\r\n\r\nIn this talk, we\u2019ll explore practical ways to protect privacy in streaming systems using Apache Kafka, Apache Flink, and Apache Iceberg. 
We\u2019ll cover:\r\n- simple tricks like masking and tokenizing PII;\r\n- why \u201canonymous\u201d data often isn\u2019t anonymous (the re-identification problem);\r\n- techniques like bucketing, k-anonymity, and adding noise;\r\n- how to balance privacy with data utility (too much hiding makes data useless).\r\n\r\nAlong the way, we\u2019ll look at real-world stories: from public data leaks to surprising deanonymization attacks, and show live demos of pipelines that anonymize data before it\u2019s written to storage.\r\nIf you\u2019ve ever wondered how to build privacy-aware pipelines, this talk will give you practical patterns you can use right away.", "recording_license": "", "do_not_record": false, "persons": [{"code": "LLBXBT", "name": "Olena Kutsenko", "avatar": "https://program.berlinbuzzwords.de/media/avatars/LLBXBT_Q3Gez1v.webp", "biography": "Olena is a Staff Developer Advocate at Confluent and a recognized expert in data streaming and analytics. With two decades of experience in software engineering, she has built mission-critical applications, led high-performing teams, and driven large-scale technology adoption at industry leaders like Nokia, HERE Technologies, AWS, and Aiven.\r\n\r\nA passionate advocate for real-time data processing and AI-driven applications, Olena empowers developers and organizations to use the power of streaming data. She is an AWS Community Builder, a dedicated mentor, and a volunteer instructor at a nonprofit tech school, helping to shape the next generation of engineers.\r\n\r\nAs an international speaker and thought leader, Olena regularly presents at top global conferences, sharing deep technical insights and hands-on expertise. 
Whether through her talks, workshops, or content, she is committed to making complex technologies accessible and inspiring innovation in the developer community.", "public_name": "Olena Kutsenko", "guid": "317c2014-3ceb-566c-80f7-b9f6e41f062d", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/LLBXBT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MHSTAZ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MHSTAZ/", "attachments": []}, {"guid": "f11b178d-5de8-5205-bb3d-960886393f22", "code": "QJBKUU", "id": 91671, "logo": null, "date": "2026-06-09T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91671-the-three-body-problem-of-inverse-hybrid-search", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/QJBKUU/", "title": "The Three-Body Problem of Inverse Hybrid Search", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "When users expect alerts for new products matching an uploaded image, the problem becomes inverse hybrid search. Unlike top-K search, alerting must guarantee fetch-all semantics: zero missed matches across all saved searches, combining vector similarity, boolean filters, and lexical signals. We show why this breaks traditional scaling intuition.", "description": "Saved searches and alerts are common across e-commerce and marketplaces: price drops, availability notifications, and increasingly, visual alerts driven by images captured on mobile devices. 
While the user experience feels simple, the underlying system represents one of the most demanding forms of search.\r\n\r\nThis talk reframes alerting as a distinct retrieval discipline:\r\n\r\n- **Inverse**: documents trigger queries, not the other way around\r\n- **Hybrid**: vector similarity, boolean filters, and lexical constraints must all apply\r\n- **Fetch-All**: every true match must be returned - no truncation, no approximation\r\n\r\nWe examine why traditional search assumptions fail under these constraints. In particular, we show how cost and instability are driven not by throughput (QPS), but by match cardinality - the number of alerts matched per incoming item - and how this interacts with scatter/gather execution, merge costs, and bursty ingestion patterns.\r\n\r\nThe talk focuses on:\r\n- where inverse hybrid systems break silently\r\n- why scaling infrastructure buys stability rather than throughput\r\n- how correctness becomes an operational and economic concern\r\n- why AI-driven recall often increases system pressure rather than reducing it\r\n\r\nAttendees will leave with a concrete framework for reasoning about inverse hybrid search systems at scale.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WSF7G8", "name": "Ravindra Harige", "avatar": "https://program.berlinbuzzwords.de/media/avatars/WSF7G8_UYfRAu0.webp", "biography": "Ravindra Harige is the founder of Searchplex, a firm focused on designing scalable AI-native search and discovery systems across multiple industry verticals.", "public_name": "Ravindra Harige", "guid": "e6b7aefa-953b-5f0a-a57b-0145386301b1", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/WSF7G8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/QJBKUU/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/QJBKUU/", "attachments": []}, {"guid": "e4d9123b-53bf-52cd-b12f-4e756851b50f", "code": "ZY3Y9U", "id": 91738, "logo": null, "date": 
"2026-06-09T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91738-beyond-grep-search-for-reliable-coding-agents", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZY3Y9U/", "title": "Beyond Grep: Search for Reliable Coding Agents", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Coding agents succeed in verifiable loops (compiler + tests), but large repos still expose retrieval weaknesses. \r\nThis session explores how lexical, structural, and semantic search can provide cleaner context for LLMs. We compare tradeoffs and evaluation approaches to improve reliability without inflating token cost.", "description": "Coding agents work well partly because software is a verifiable domain: compilers, tests, and static checks create tight feedback loops that support iterative improvement. \r\nYet even with better tooling, MCP integrations, and skills-based workflows, many agents still degrade in large codebases where retrieval quality becomes the limiting factor.\r\n\r\nThis talk explores a working hypothesis: improving search is one of the highest-leverage ways to improve coding-agent outcomes before changing model size.\r\nWe will examine retrieval patterns across keyword, structural, and hybrid lexical-semantic pipelines, and discuss where each approach may help or fail.\r\n\r\nAttendees will see how indexing, relevance tuning, and retrieval evaluation reduce token waste, improve answer quality, and provide stable foundations for agentic systems. 
A live demo shows search in action, highlighting how it complements AI rather than being replaced by it.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WXFDZU", "name": "Amine GANI", "avatar": "https://program.berlinbuzzwords.de/media/avatars/WXFDZU_nbX62Jl.webp", "biography": "Amine Gani is a Software Engineer and Search Consultant at Adelean, where he specializes in building high-performance search solutions with Elasticsearch and OpenSearch. With expertise in data indexing, search relevancy, and analytics, he helps clients optimize their e-commerce search engines and also A2, Adelean\u2019s search solution for e-commerce. He works at the intersection of software engineering and information retrieval, ensuring integrations tailored to business needs.", "public_name": "Amine GANI", "guid": "4f51613e-56a1-59ea-8a0c-2d308c6c3270", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/WXFDZU/"}, {"code": "UR3N3L", "name": "Roudy Khoury", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JFNM7Q_XIlkp4H.webp", "biography": "Roudy is a software engineer at Adelean, where he specializes in designing and building advanced search solutions across diverse platforms. His work covers modern information retrieval, including classical search techniques, AI-driven retrieval, relevance reranking, and vector-based search for semantic understanding. 
With a strong focus on leveraging AI to enhance search quality, Roudy develops search engines that deliver more accurate, efficient, and personalized results.", "public_name": "Roudy Khoury", "guid": "26568cfc-646a-5082-b7ff-0ec95ebe18cf", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/UR3N3L/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZY3Y9U/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZY3Y9U/", "attachments": []}, {"guid": "ddf14333-f8fe-5fe3-afb1-d1c9db06fc33", "code": "NBBST7", "id": 91377, "logo": null, "date": "2026-06-09T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Kesselhaus", "slug": "bbuzz26-91377-correctness-too-cheap-to-meter-formal-verification-and-llms", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/NBBST7/", "title": "Correctness Too Cheap To Meter: Formal Verification and LLMs", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Formal methods are powerful tools to verify software systems' correctness and reliability. However, manually writing system specs is time-consuming and hard to maintain. LLMs can help with this burden.\r\nWe'll share new research into tools to automate formal methods workflows and learnings from how LLMs currently perform.", "description": "Formal methods can mathematically prove certain properties of software: for example, we can guarantee a database is deadlock free or avoids crashes. Major infrastructure providers like AWS and Azure all leverage verification, but it's currently too expensive and time-consuming to deploy for most use-cases. However, LLMs can automate much of this toil.\r\n\r\nThis talk demonstrates how we can scale formal methods from an academic luxury to a tractable tool. We share novel research on applying LLMs to formalize real-world systems, including popular DBs and libraries. 
We present benchmark results, our automated formal spec generation framework, and current model shortcomings.\r\n\r\nIn particular, we'll touch on:\r\n- What formal verification is, why it's key for critical systems, and how it's typically done\r\n- SysMoBench: an LLM benchmark grounded in practical formal verification metrics instead of toy tasks\r\n- Specula: an automated framework to synthesize formal specifications directly from source code, eliminating tedious dev work\r\n- New, unpublished research on connecting specs to real source code more efficiently\r\n\r\nOur approach decreases the implementation cost of formal methods, enabling industry to more efficiently avoid outages and bugs. Audience members will take away knowledge of what formal methods are and how to effectively deploy them by taking advantage of automation opportunities.", "recording_license": "", "do_not_record": false, "persons": [{"code": "NXGVAY", "name": "Emilie Ma", "avatar": "https://program.berlinbuzzwords.de/media/avatars/NXGVAY_rMuYGto.webp", "biography": "Researcher at the University of British Columbia. Previously at OpenAI, Stripe, and the University of Cambridge, working on distributed systems infrastructure and security. 
More at https://emilie.ma.", "public_name": "Emilie Ma", "guid": "84c3a16f-c16b-5423-92f4-9553839a1b08", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/NXGVAY/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/NBBST7/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/NBBST7/", "attachments": []}, {"guid": "b52765a6-31ab-593d-b51c-9c7497a5b743", "code": "FXSUSJ", "id": 90194, "logo": null, "date": "2026-06-09T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Kesselhaus", "slug": "bbuzz26-90194-from-legacy-search-to-vespa-what-a-real-poc-taught-us", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/FXSUSJ/", "title": "From Legacy Search to Vespa: What a Real PoC Taught Us", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "For years, Germany\u2019s largest classifieds website relied on a search-first relevance approach because structured data was sparse. This talk shares how we introduced Vespa in the Motors category, enriched signals with embeddings and extracted attributes, and migrated step by step; what worked, what failed, and which lessons only a real PoC reveals.", "description": "For a long time, our homepage recommendations were driven by a search-first relevance approach. It was fast to iterate on and easy to reason about, but it limited personalization and proved fragile as soon as listings lacked structure or consistency.\r\nIn this talk, we describe how we transitioned to a Vespa-based recommendation stack, starting with the Motors category, where structured attributes are comparatively rich, and gradually expanding to less-structured categories. 
Rather than a big-bang rewrite, we incrementally replaced the legacy system.\r\nWe\u2019ll share what the PoC taught us in practice: how we ran old and new systems in parallel, defined guardrails for quality and stability, and progressively improved signals by introducing text embeddings for listings and searches, extracting attributes from free text, and incorporating signals derived from images. We\u2019ll also cover what didn\u2019t work as expected, which assumptions broke under real traffic, and how evaluation and rollout influenced the final architecture.\r\nAttendees will leave with concrete lessons on migrating relevance systems in production, running PoCs that expose real constraints, and introducing modern retrieval and ranking approaches when your data foundations are anything but perfect.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8C93V9", "name": "Andr\u00e9 Charton", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8C93V9_s499VuK.webp", "biography": "Andr\u00e9 has worked with software for ages (Robotron KC 87), and he found his passion in building scalable search apps. He studied computer science at TU Berlin and has more than 15 years of experience in classifieds and a deep search footprint across SQL, Solr, Elasticsearch, and Vespa.", "public_name": "Andr\u00e9 Charton", "guid": "c990f9e6-1dd8-53e8-ba7c-0696198a9d80", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/8C93V9/"}, {"code": "EWRSFJ", "name": "Valeriia Platonova", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EWRSFJ_Tnc7eku.webp", "biography": "Valeriia is a backend engineer at Kleinanzeigen, with a focus on recommendation systems and search. She is originally from Russia and is currently based in Berlin. 
She holds a degree in computer science, where she discovered her passion for building backend services.\r\n\r\nOver the course of her career, she has worked across fintech, banking, and healthcare, primarily using Java and the Spring ecosystem. Her recent work reflects a strong interest in search technologies, particularly relevance, vector-based retrieval, and improving overall search quality.", "public_name": "Valeriia Platonova", "guid": "79eb649a-507c-5f45-9e4b-be1054e76495", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/EWRSFJ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/FXSUSJ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/FXSUSJ/", "attachments": []}, {"guid": "f1b99356-70a2-539b-a745-d75c52a8b1ce", "code": "KW73Y8", "id": 97095, "logo": null, "date": "2026-06-09T17:10:00+02:00", "start": "17:10", "duration": "00:10", "room": "Kesselhaus", "slug": "bbuzz26-97095-closing-session", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/KW73Y8/", "title": "Closing Session", "subtitle": "", "track": null, "type": "#BBuzz", "language": "en", "abstract": "Join us as we wrap up Berlin Buzzwords.", "description": "-", "recording_license": "", "do_not_record": false, "persons": [{"code": "SKTAV7", "name": "Paul Berschick", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MN8NMT_i2ySCF9.webp", "biography": "Paul first became involved in the organization of Berlin Buzzwords as an intern in 2015 and has been a part of the team ever since. He's now the managing director of Plain Schwarz and together with his team also organizes events like FOSS Backstage or Scala Days. 
\r\nPaul describes himself as a Free and Open Source Software enthusiast and in his spare time you will find him listening to cricket on the radio or deeply immersed in a good book \u2013 sometimes even both.", "public_name": "Paul Berschick", "guid": "007b1a1d-5bb1-5af8-a5e7-067a25a47035", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/SKTAV7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/KW73Y8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/KW73Y8/", "attachments": []}], "Maschinenhaus": [{"guid": "5ae35b12-c641-513f-9718-96cd2c33172f", "code": "HWPQ7L", "id": 86953, "logo": null, "date": "2026-06-09T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz26-86953-circular-dependency-fixes-when-bootstrapping-a-golden-set", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/HWPQ7L/", "title": "Circular Dependency Fixes when Bootstrapping a Golden Set", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "For a golden set, you need queries. Even if you have them, you can\u2019t judge all docs for each query. Only the top N. How do we rank the top N? See the circular dependency? We\u2019ll talk about ways to untangle it: lexical search, significant terms, training an embedder from scratch, etc. By iteratively refining data and queries, we'll get there.", "description": "If you\u2019re not satisfied with your golden set or don\u2019t have it at all, this session is for you. You may have queries (e.g., from query logs) or you need to generate them. We\u2019ll start by looking at how to create synthetic queries from individual documents, as well as from facets and facet combinations, that might match N documents.\r\n\r\nWe\u2019ll move on to relevance judgements. Even with LLM-as-a-judge, it\u2019s not feasible to, say, rate a 1M doc corpus for 1K queries. We need the top N. How do we know the \"correct\" top N? 
We\u2019ll need to explore the dataset for any query that is ambiguous (i.e., doesn't clearly match a single doc). There are different methods for exploring data: visualizations, analysis tweaks (e.g., stemming, synonyms)... Vector similarity also helps, but choosing an embedder is tricky because transfer learning can introduce bias that may be misleading for our dataset.\r\n\r\nWe can\u2019t get a perfect golden set on the first try, but we\u2019ll explore techniques to iterate until we\u2019re happy. Which is important for any new search application, whether it\u2019s central to the business (i.e., larger teams, bigger budget) or not.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3CMEKA", "name": "Radu Gheorghe", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3CMEKA_DwlKQxc.webp", "biography": "Radu has been in the search space for many years, mainly on Elasticsearch, Solr, OpenSearch, and, more recently, Vespa.ai. Helps users with both the relevance and the operations side of retrieval. Enjoys education in all its forms (training, blog posts, books, conferences...) and got the chance to be involved in all of them.", "public_name": "Radu Gheorghe", "guid": "e0bb8b22-5b87-5930-bd1b-c992f726ce16", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/3CMEKA/"}, {"code": "ADKESR", "name": "Rafa\u0142 Ku\u0107", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ADKESR_gdoaLxJ.webp", "biography": "Author, software engineer, trainer and consultant focused on information retrieval. In his work helping companies throughout the whole software lifecycle - from requirements gathering and architecture, through implementation and deployment ending with scaling and tuning. 
In his free time a novice carpenter and ultra runner with varying degree of success.", "public_name": "Rafa\u0142 Ku\u0107", "guid": "1eb7cc2c-b6ba-5277-9561-a98d6395be51", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/ADKESR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/HWPQ7L/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/HWPQ7L/", "attachments": []}, {"guid": "ba74de10-e1d5-5edd-9d40-ee523a6daaf2", "code": "GWVGQP", "id": 91191, "logo": null, "date": "2026-06-09T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-91191-text-to-struct-fine-tuning-slms-for-query-intent", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GWVGQP/", "title": "Text-to-Struct: Fine-tuning SLMs for Query Intent", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Hybrid search fails on complex intent: vector search misses constraints, keywords miss nuance. This talk explores fine-tuning SLMs for 'Query Understanding'\u2014transforming vague inputs into structured requests. Learn to extract metadata, expand terms, and route intent to build a search engine that does the hard work for your users.", "description": "Building a search experience that feels \"intelligent\" requires more than just embedding user input or matching keywords. Real-world financial queries\u2014whether from an analyst or a \"lazy\" Agentic LLM\u2014are rarely optimized for your index. They are a messy mix of semantic intent (\"tech stocks sensitive to rate hikes\") and rigid constraints that simple hybrid search often ignores.\r\n\r\nWe typically see three \"Intent Killers\" in production:\r\n\r\n* **Time:** \"European bank guidance *last quarter*\" (Vector search ignores recency; Keywords miss the fiscal calendar).  \r\n* **Entities & Content Types:** \"CEO remarks on AI in *10-K risk factors*\" (Often conflated with general news or 10-Q tables).  
\r\n* **Ambiguity:** Generic LLMs often spam search APIs with broad, unrefined queries like \"crypto regulation risks\" that return noise instead of specific regulatory filings.\r\n\r\nIn this session, we present a robust approach: **Fine-tuning a Small Language Model (SLM) to act as a dedicated \"Query Understanding\" layer.**\r\n\r\nWe will move beyond simple RAG architectures and demonstrate how to train a small, deterministic model to parse raw text and output a valid **Structured Semantic Query**. The training dataset for this is created/prepared by combining real user queries with synthetic data, and we used an LLM to assist in the initial annotation (a form of knowledge distillation) which was then meticulously reviewed to ensure the model captures the necessary constraints and financial nuance. This shifts the burden of \"knowing how to search\" from the user to the system.\r\n\r\n**We will cover:**\r\n\r\n* **The \"Hybrid Gap\":** Why combining Semantic \\+ Lexical search is not enough. We will analyze failure cases involving strict fiscal periods, specific tickers (e.g., distinguishing \"META\" the company from \"meta\" the prefix), and document sub-types.  \r\n* **The \"LLM as User\" Problem:** How to handle the influx of queries from generic LLM Agents. We show how to translate their broad requests (e.g., \"Give me macro trends\") into the specific, optimized queries your engine actually needs.  \r\n* **Why Not Just Prompt a Giant Model?** We demonstrate why \"Prompt Engineering\" generic LLMs is a dead end for high-performance finance search. 
We show how generalist models lack the necessary domain expertise to ensure schema adherence, and compare the latency/cost against specialized SLMs that offer 99% schema adherence.  \r\n* **Query Expansion & Intent Routing** is a process where a fine-tuned Small Language Model (SLM) intercepts the user's initial search phrase and automatically enriches it with specific, structured search terms before sending it to the index. Instead of just matching keywords, the SLM *translates* the user's semantic intent into precise, optimized queries. For instance, a vague term like \"greenwashing\" is expanded and routed as multiple concepts, such as `regulatory_risk` OR `esg_controversy`.  \r\n* **Impact on Relevance:** Real-world comparisons showing how \"translating\" intent upstream drastically improves retrieval quality for complex financial instruments compared to standard Hybrid Search.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZGXVRQ", "name": "Hugo Jimenez", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ZGXVRQ_b1uY4eW.webp", "biography": "Hugo Jim\u00e9nez Mu\u00f1oz is a Machine Learning Engineer at RavenPack, where he builds high-performance NLP models for financial analytics. Specializing in RAG optimization and Transformer fine-tuning, he bridges the gap between vector search and structured query requirements. His background includes Knowledge Graph engineering and building scalable AI architectures on AWS.", "public_name": "Hugo Jimenez", "guid": "5e07c500-b3a0-5213-bfff-8e839b96e3de", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/ZGXVRQ/"}, {"code": "ZNAFX7", "name": "Sandra Bull\u00f3n", "avatar": "https://program.berlinbuzzwords.de/media/avatars/Z8LK9Q_x71C7i4.webp", "biography": "Sandra Bull\u00f3n is a Senior Product Manager experienced in building data, analytics, and search products.\r\n\r\nMajor advocate for rigorous evaluation and human-in-the-loop approaches to building reliable AI. 
\r\n\r\n_You cannot improve what you don't understand._", "public_name": "Sandra Bull\u00f3n", "guid": "3d0856ba-d0a7-51dd-89ab-a6b54802d650", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/ZNAFX7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GWVGQP/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GWVGQP/", "attachments": []}, {"guid": "14569e2c-3219-5d98-aa18-a67e8ed1efc3", "code": "WPX33K", "id": 88592, "logo": null, "date": "2026-06-09T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-88592-context-aware-segments-solving-the-scatter-read-problem", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/WPX33K/", "title": "Context-Aware Segments: Solving the \"Scatter-Read\" Problem", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Traditional OpenSearch segments are context-blind, scattering data across multiple segments. We introduce Context-Aware Segments (CAS), an architecture that brings \"sharding\" logic to the segment level. By enforcing document locality during indexing, we slashed query latency and minimized data footprint through superior pruning and compression.", "description": "#### The Friction: The \"Everything, Everywhere\" Problem\r\n\r\nIn distributed search engines like OpenSearch, the Shard is the unit of scale, but the Segment is the unit of storage. Traditionally, documents are written to segments based purely on arrival time. For multi-tenant SaaS platforms or high-velocity observability clusters, this means data for a specific tenant or time-range is scattered across every single segment within a shard. 
A simple filter query becomes an expensive fan-out operation, thrashing the file system cache and wasting CPU cycles checking documents that will never match.\r\n\r\n#### The Solution: Context-Aware Segments (CAS)\r\n\r\nThis session dissects the design and implementation of CAS (OpenSearch RFC #18576) and its foundation in Lucene (Issue #13387). This architectural shift introduces a logical \"context\" dimension to segment creation. Instead of a temporal log, we treat segments as optimized containers for specific data subsets. \r\n\r\nIn this technical deep-dive, we will cover:\r\n\r\n**Granular Segment Pruning**: How the query coordinator leverages new segment-level metadata to perform \"pre-search\" filtering\u2014effectively skipping files on disk before the engine even opens them.\r\n\r\n\u200b**Vector Segment Pruning**: How we use segment-level metadata to skip entire HNSW graphs during a k-NN search. If a segment doesn't contain \"Tenant A,\" we don't even load its vector blobs into memory.\r\n\r\n**Supercharged Compression**: We demonstrate how grouping similar data by context significantly increases compression ratios. When the storage engine sees repetitive data patterns in a single segment, the bit-packing and dictionary compression become far more efficient, slashing the data footprint.", "recording_license": "", "do_not_record": false, "persons": [{"code": "7GZANJ", "name": "Rishav Sagar", "avatar": "https://program.berlinbuzzwords.de/media/avatars/7GZANJ_DQ2yMvU.webp", "biography": "Experienced backend developer with 8+ years specializing in distributed systems and cloud technologies. 
Currently working as SDE 2 at AWS OpenSearch, contributing to OpenSearch's core functionalities.", "public_name": "Rishav Sagar", "guid": "d081d7ac-42f8-5e3d-8f69-bc9ee13b3dbe", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/7GZANJ/"}, {"code": "393DHX", "name": "Tejas Shah", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8EEJVV_TFwzwbQ.webp", "biography": "Sr Software Engineer interested in distributed systems and vector databases", "public_name": "Tejas Shah", "guid": "baf4401c-d4b0-5b2c-8bdc-219f83d35c46", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/393DHX/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/WPX33K/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/WPX33K/", "attachments": []}, {"guid": "a20b5f59-8d08-5ffe-bc6f-211310c83540", "code": "9NH7VB", "id": 91401, "logo": null, "date": "2026-06-09T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-91401-c-search-for-database-kernels-built-in-not-bolted-on", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9NH7VB/", "title": "C++ Search for Database Kernels: Built In, Not Bolted On", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "IResearch is an Apache 2.0 C++ search engine built to live inside databases. We'll benchmark it against leading open-source search engines, show why vectorized scoring is the next frontier for information retrieval engines, share the mistakes we made over a decade of development and explore how database-native search fits modern query execution.", "description": "There's a certain irony in building a search engine no one can find. IResearch is an open-source Apache 2.0 C++ search library that has been quietly powering search inside databases since 2015: first behind ArangoSearch, now as the foundation of SereneDB. 
Instead of becoming yet another standalone search server, it evolved into a library designed to be embedded directly into database kernels. That journey defined most of the architectural decisions in the project. Some  of them were good, some painful. This talk tells the honest story of it.\r\nWe'll start with how IResearch ended up inside databases and what that means in practice: WAL integration, transactional consistency of search indexes, synchronization with the main storage.\r\nFrom there, we'll compare IResearch functionally and architecturally to Lucene and Tantivy. All three Lucene-inspired engines diverge in different aspects: functionality, index layout and scoring. Using the search-benchmark-game suite, we'll put them head to head - not to declare a winner, but to dissect why the numbers look the way they do and trace performance differences back to architectural roots.\r\nThen we will talk about how different search engines handle scoring and a very noticeable difference of IResearch in that regard. When documents are scored one at a time, significant CPU throughput is left on the table. While recent Lucene versions have begun moving toward block-based evaluation, scoring remains far from fully vectorized. However, some newer approaches treat relevance computation as a SIMD-friendly evaluation pipeline similar to query execution engines. We'll walk through how this could work in practice and show the concrete throughput gains it delivers.\r\nAlong the way, we'll be honest about the mistakes we made: architectural bets that didn't pay off, abstractions that hurt performance in production, integration patterns we had to rip out and rebuild.\r\nFinally, we'll zoom out to the architectural questions that emerge when search lives inside a database. How do you scale a search index when compute and storage are separated? 
How does search fit into OLAP query execution: late materialization, joins between search indexes and analytical data, unified query planning? We'll share what we've learned solving these in practice.", "recording_license": "", "do_not_record": false, "persons": [{"code": "B7ZYXE", "name": "Andrey Abramov", "avatar": "https://program.berlinbuzzwords.de/media/avatars/B7ZYXE_5kBcVjx.webp", "biography": "Andrey Abramov is the Founder and CTO of SereneDB, where he is building a real-time analytical search and OLAP database. With over 15 years of experience in C++, Andrey specializes in production-grade search engines and database kernel internals.\r\nPrior to SereneDB, Andrey was the mastermind behind ArangoSearch, a search engine natively integrated into ArangoDB's distributed multi-model database core.\r\nEarlier in his career, Andrey held senior engineering roles at EMC and Quest Software, where he managed teams and led the development of enterprise-scale systems.", "public_name": "Andrey Abramov", "guid": "9d12cd23-47cf-5b15-be92-929f005e0205", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/B7ZYXE/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9NH7VB/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/9NH7VB/", "attachments": []}, {"guid": "9bd9a0fa-a68a-5682-90b4-de8c47b7debd", "code": "GPKCWA", "id": 91321, "logo": null, "date": "2026-06-09T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-91321-one-gpu-four-retrieval-modes-multi-model-search-serving", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GPKCWA/", "title": "One GPU, Four Retrieval Modes: Multi-Model Search Serving", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Competitive search now needs dense embeddings, sparse vectors, ColBERT, and cross-encoder reranking. Most teams run four separate containers. 
This talk shows how to serve all four from one process, walks through building a hybrid retrieval pipeline with real benchmark data, and covers where each retrieval mode wins and where it wastes compute.", "description": "Every production search system in 2026 runs multiple models. A dense embedder handles semantic search. A sparse model provides keyword recall. A multi-vector model like ColBERT enables token-level matching. A cross-encoder reranker improves final precision. These four stages have become table stakes for competitive retrieval quality.\r\n\r\nThe infrastructure story is less elegant. The industry default is one container per model, typically using HuggingFace TEI, Triton, or a custom Flask wrapper. Four models means four separate deployments, four sets of scaling rules, and four GPU allocations where each model uses a fraction of what it reserves.\r\n\r\nWhen building SIE, an open-source search inference engine, we took a different approach: one server process that handles all four retrieval modes through a unified API with three primitives (encode, score, extract). Models like BGE-M3 return dense, sparse, and multi-vector outputs from a single encode call. Cross-encoder reranking uses the score primitive. Same server, same GPU, same API.\r\n\r\n  The talk covers four areas.\r\n\r\nFirst, why hybrid retrieval requires multiple model types. We will walk through a real retrieval pipeline: sparse for keyword recall, dense for semantic matching, ColBERT for token-level precision, and a cross-encoder for final reranking. For each stage we will show what it adds to retrieval quality using BEIR benchmark data, and when the added complexity is not worth it.\r\n\r\nSecond, the adapter architecture that makes multi-model serving possible. SIE wraps PyTorch, FlashAttention, SentenceTransformers, and SGLang behind a common interface. We will walk through the lifecycle of a request: API call, tokenization on CPU, batching, GPU inference, and postprocessing. 
Different model architectures need different compute backends, and we will explain why a single unified runtime was not the right choice.\r\n\r\nThird, building the pipeline end to end. A practical walkthrough of dense + sparse + ColBERT + reranking from a single server instance, including how to combine scores from different retrieval modes and how to tune the balance between recall and precision.\r\n\r\nFourth, tradeoffs and lessons. When does multi-model serving on one GPU work well, and when should a model get its own dedicated container? What happens under concurrent load when multiple models compete for memory? We will share real data from running these workloads on L4 GPUs.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8TWA3E", "name": "Filip Makraduli", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8TWA3E_AUjMuXq.webp", "biography": "Filip Makraduli is a London\u2011based machine learning engineer and developer relations professional with a background in data science and AI, particularly in recommendation systems and language technologies. 
He has a masters in Biomedical Data Science from Imperial College London and is an experienced speaker.", "public_name": "Filip Makraduli", "guid": "9fbd3847-aefc-5d4b-87db-2a58314fcc7b", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/8TWA3E/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GPKCWA/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GPKCWA/", "attachments": []}, {"guid": "b9a1e8a4-e0d8-5900-a8d5-d0638f4faf21", "code": "TUSDT8", "id": 90250, "logo": null, "date": "2026-06-09T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-90250-zero-downtime-index-upgrade-in-apache-solr", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TUSDT8/", "title": "Zero downtime index upgrade in Apache Solr", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this talk we'll explore how Apache Solr introduced the capability to upgrade an index in-place with zero downtime. This upgrade path helps prepare the index for a future Solr major version upgrade without needing to recreate the index from source as is the case with Lucene based search engines today.", "description": "Starting Lucene/Solr 7.x, if you have an index created in a certain version it is only usable until the next major version upgrade. Beyond that you are required to recreate the index from source data. This can be a practical challenge in case of large clusters since this can imply potential downtime and/or significant infrastructure & operational costs, or worse - a dead-end if the true source of data no longer exists. \r\n\r\nStarting Apache Solr 9.11 [yet to be released as of this draft], users have the ability to upgrade an index in-place with zero downtime, subject to certain constraints. This prepares the index for a future Solr major version upgrade, eliminating the need to recreate the index from source. 
The implication is that an index originally created in Solr 8.x now has a pathway to future upgrades without needing the source data.\r\n\r\nIn the talk, I\u2019ll discuss the mechanisms and APIs that Solr exposes to support this capability, and the constraints involved. The audience will also learn about implementation details across the Lucene and Solr layers, and how the underlying changes made during this effort pave the way for a similar capability in other Lucene-based search engines.", "recording_license": "", "do_not_record": false, "persons": [{"code": "MJVUJV", "name": "Rahul Goswami", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MJVUJV_sMLRQBc.webp", "biography": "Rahul is an Apache Solr committer and a Principal Software engineer on the search infrastructure team at Commvault. He has spent over a decade working in the search field, and is passionate about the domain.\r\n\r\nHe loves to get into the guts of how things work and is an active member of the Apache Solr and Lucene community.", "public_name": "Rahul Goswami", "guid": "0e524dee-b292-5bb9-b747-2d8f199af98e", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/MJVUJV/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TUSDT8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/TUSDT8/", "attachments": []}, {"guid": "d5f28a55-1751-558e-80fa-249438ab11be", "code": "EDXRLN", "id": 91366, "logo": null, "date": "2026-06-09T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "bbuzz26-91366-building-schema-free-applications-with-rdf", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/EDXRLN/", "title": "Building Schema-Free Applications with RDF", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "RDF was designed for the semantic web, but it turns out to be a perfect fit for systems where structure emerges from user interaction, not upfront design. 
This talk covers how to build applications entirely on RDF triples, translate natural language to SPARQL with small, open source language models, and discover implicit knowledge in user input.", "description": "Most applications assume their data model is known before the first user interacts with the system. But there are cases where this assumption doesn't hold, and the structure of the data needs to emerge from how people use the system rather than being designed upfront.\r\n\r\nThis talk explores why common database paradigms fall short for this use case and how that search led us to Resource Description Framework (RDF). Originally designed for the semantic web, RDF stores knowledge as subject-predicate-object triples, a surprisingly natural fit for application data when the schema isn't fixed.\r\n\r\nWe cover the practical side: using fine-tuned open source models to translate natural language into SPARQL queries, drawing on research like FIRESPARQL, storing data with tools like Oxigraph, and self-hosting models with our open source model serving platform Paddler (https://github.com/intentee/paddler).\r\n\r\nFinally, we show how LLMs can derive not just what users explicitly say but also implicit relationships, opening new possibilities for analytics and knowledge discovery.", "recording_license": "", "do_not_record": false, "persons": [{"code": "LTZJ9B", "name": "Gosia Zagajewska", "avatar": "https://program.berlinbuzzwords.de/media/avatars/LTZJ9B_mNBeuKd.webp", "biography": "Ex-head of product at Packhelp, where she led internal tools and applications. Built products at organizations of different sizes, giving her insights into why traditional software development practices should no longer be maintained. 
Co-founder at Intentee.", "public_name": "Gosia Zagajewska", "guid": "93c3833e-fe11-5382-9f0e-8b900201c516", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/LTZJ9B/"}, {"code": "8H9ADR", "name": "Mateusz Charytoniuk", "avatar": null, "biography": null, "public_name": "Mateusz Charytoniuk", "guid": "3c0ae604-a8b7-53bb-a38c-6290dc4062ba", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/8H9ADR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/EDXRLN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/EDXRLN/", "attachments": []}, {"guid": "c00dc007-17ce-5761-920b-8b0140a5a71e", "code": "7ND3UE", "id": 91223, "logo": null, "date": "2026-06-09T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Maschinenhaus", "slug": "bbuzz26-91223-how-to-survive-the-vortex-of-llm-change", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7ND3UE/", "title": "How to Survive the Vortex of LLM Change", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "The LLM ecosystem changes faster than most teams can adapt. This talk shares our experience and the practical lessons we\u2019ve learned while building an intelligent search product in a world where models, tools, and best practices constantly evolve.", "description": "Working with LLMs today means operating in an environment where models, APIs, capabilities, and costs change constantly. What works today may become obsolete in months, creating technical and organizational pressure on teams.\r\n\r\nIn this talk, we share our experience working in an intelligent search company in this environment. We will share the good, the bad, and the ugly: the rollercoaster of realizing that something which took hours of code can suddenly be achieved with a simple prompt. 
We discuss how we evaluate new models without destabilizing production, stay updated without losing our minds, and separate the wheat from the chaff in the constant stream of LLM news.\r\n\r\nBeyond technical architecture, we reflect on the human side of constant change. The goal is not to predict where LLMs will go next, but to share strategies for building systems and teams that adapt without losing sanity.", "recording_license": "", "do_not_record": false, "persons": [{"code": "MDSU3R", "name": "Carmen Iniesta", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MDSU3R_KQ38kDV.webp", "biography": "I\u2019m a computer scientist working in ML and NLP, with a soft spot for fairness, linguistics, and trying to make the world a bit better, or at least not worse. \r\nI\u2019m also passionate about making the tech world more inclusive and thoughtful, one system (or conference talk) at a time.", "public_name": "Carmen Iniesta", "guid": "88d0fad1-6f5b-562f-bf93-77a2b60d5b83", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/MDSU3R/"}, {"code": "QN79F7", "name": "Carles Onielfa", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QN79F7_S0PZDI4.webp", "biography": "Machine Learning Engineer at Progress", "public_name": "Carles Onielfa", "guid": "bbbc20b7-5553-5e76-8a9c-4769a90b2cde", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/QN79F7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7ND3UE/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7ND3UE/", "attachments": []}], "Palais Atelier": [{"guid": "477fb049-5fe7-5696-88e2-db819c65864b", "code": "AJK8EK", "id": 91522, "logo": null, "date": "2026-06-09T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-91522-kafi-streams-complex-stream-processing-made-simple", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/AJK8EK/", "title": "Kafi Streams: Complex Stream Processing Made 
Simple", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "You can finally stop caring about co-partitioning, state stores and eventual consistency. Kafi Streams, built on (Py)DBSP, treats streaming like batch \u2014 strongly consistent, no special concepts. An Open Source Python library for the 80% of use cases that don't need extreme scale. Fully incremental stream processing for everyone, from day one.", "description": "I will unveil Kafi Streams, an Open Source library for complex stream processing inspired by Kafka Streams but built on top of PyDBSP, a pure Python implementation of Feldera's novel \"Database Stream Processing\" theory.\r\n\r\nWhy would we need yet another stream processing library? One whose name sounds so strikingly similar to the most popular stream processing library on the planet?\r\n\r\nBecause existing stream processing libraries are too complex, even Kafka Streams. Their engines have been prematurely optimized for maximum scale, not simplicity. You cannot easily do stream processing without understanding concepts like streams vs. tables, co-partitioning, windowing (hopping, tumbling, sessions...), state stores etc. - all these \"leaky abstractions\" still prevailing in the stream processing world. It is them that keep stream processing in a niche.\r\n\r\nOn the contrary, Kafi Streams aims at making stream processing simple. It does not (yet) aim for extreme scale and performance. But to enable complex stream processing with full support for joins, aggregations et al. for the less performance-heavy 80% of use cases in non-tech companies like mine, Migros, a $30B+ revenue retailer.\r\n\r\nWith Kafi Streams, anyone can do complex stream processing, even those who have never done it before. Right from the start. Because with DBSP as our basis, streaming is no different from batch any longer. Simple. Deterministic. Not just eventually but strongly consistent. 
Just like anybody coming from outside the streaming world would always have hoped.", "recording_license": "", "do_not_record": false, "persons": [{"code": "N3TCBS", "name": "Ralph Matthias Debusmann", "avatar": "https://program.berlinbuzzwords.de/media/avatars/N3TCBS_Gl0KIhY.webp", "biography": "Ralph is a former AI/NLP researcher turned software engineer, solution architect and technologist, now acting as the Lead Enterprise Kafka Engineer at Migros-Genossenschafts-Bund in Zuerich, Switzerland. He has received his PhD in computer science focusing on Natural Language Processing and Artificial Intelligence in 2006 (Saarbruecken University and University of Edinburgh) and has spent 15 years at SAP, Bosch and Forecasty.AI/BASF SE before joining Migros-Genossenschafts-Bund in 2023.", "public_name": "Ralph Matthias Debusmann", "guid": "de0508c2-5626-50d5-bc22-b1f291ea4666", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/N3TCBS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/AJK8EK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/AJK8EK/", "attachments": []}, {"guid": "7610ed8f-55c4-515c-881b-38b7e148dd11", "code": "7EVE78", "id": 91118, "logo": null, "date": "2026-06-09T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-91118-duckdb-beyond-the-notebook", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7EVE78/", "title": "DuckDB beyond the notebook", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Most people know DuckDB as a fast analytics tool for notebooks and scripts. But embedded OLAP enables much more: browser-based analytics via WebAssembly, serverless data processing, and lightweight data apps \u2014 without heavy infrastructure. 
This talk shows how DuckDB changes the way we build data-driven applications.", "description": "SQLite as an embedded database is known by everyone \u2014 easy to integrate into any application, it's the most widely used database in the world. DuckDB is also an embedded database, but with a focus on analytical queries rather than transactional workloads.\r\n\r\nToday, DuckDB has evolved into a blazingly fast query engine that runs almost everywhere. This opens up new architectural possibilities for building data-driven applications \u2014 especially when analytics need to be delivered directly to end users through interactive reports, dashboards, or exploratory tools.\r\n\r\nIn this talk, I'll use minimal slides and plenty of live demos to show how developers can build fast and lean data applications with DuckDB. We'll explore scenarios including browser-based analytics powered by WebAssembly, serverless functions processing data from cloud storage, and embedded analytics in traditional applications.\r\n\r\nWe'll also examine the architectural implications: how embedded OLAP changes data architectures by bringing compute closer to the data, enabling 1.5-tier and cache-layer patterns that eliminate the need for separate analytics infrastructure.\r\n\r\nAttendees will learn what DuckDB is and how it works, how it differs from other embedded databases, and how to use it to build data-driven applications that go well beyond the notebook.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8QDF3Q", "name": "Matthias Niehoff", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8QDF3Q_RCl9OTA.webp", "biography": "Matthias Niehoff works as Head of Data and Data Architect for codecentric AG and supports customers in the design and implementation of data architectures. 
His focus is on the necessary infrastructure and organization to help data and ML projects succeed.", "public_name": "Matthias Niehoff", "guid": "6aa639a8-3661-5755-9bfb-f0c4cb3576d5", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/8QDF3Q/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7EVE78/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/7EVE78/", "attachments": []}, {"guid": "64cc2b32-5f4c-527d-ab04-796a893eabec", "code": "MYQSXK", "id": 88837, "logo": null, "date": "2026-06-09T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-88837-otel-apache-iceberg-the-new-standard-for-observability", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MYQSXK/", "title": "OTel + Apache Iceberg: The New Standard for Observability", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Observability is moving from vendor stacks to open standards. This talk presents a design where OpenTelemetry provides collection and semantic context, and Apache Iceberg is the data layer for logs, metrics, and traces. We cover portability, governance, agent investigation, and write-path pitfalls: drift, small files, compaction.", "description": "Observability is shifting from vendor-specific stacks to an open, composable architecture. This talk presents a reference design where OpenTelemetry provides collection, context propagation, and semantic normalization, and Apache Iceberg becomes the open data layer for logs, metrics, and traces. We will explain why this pairing is emerging as a practical standard for portability and governance, and why it fits agent-driven investigation workflows. 
The focus is on production write-path realities: schema drift, high cardinality, small-file control, commit and compaction strategy, and streaming aggregation patterns that keep latency and cost predictable.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BH7KTW", "name": "Yingjun Wu", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BH7KTW_GSX6xdx.webp", "biography": "Yingjun Wu is the founder of RisingWave Labs (https://www.risingwave.com/), a database company developing RisingWave, an event streaming platform for agents, apps, and analytics. Before running the company, Yingjun was a software engineer at the Redshift team, Amazon Web Services, and a researcher at the Database group, IBM Almaden Research Center. Yingjun received his PhD degree from National University of Singapore, and was a visiting PhD at Carnegie Mellon University. He has been working in the field of stream processing and database systems for over a decade.", "public_name": "Yingjun Wu", "guid": "a705555f-19ac-55cd-843a-74d73eab0373", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/BH7KTW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MYQSXK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/MYQSXK/", "attachments": []}, {"guid": "393be1d9-2479-595e-a67c-17124a227e2b", "code": "A3JKMH", "id": 91632, "logo": null, "date": "2026-06-09T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-91632-what-if-we-ve-been-scaling-stream-processing-wrong-all-along", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/A3JKMH/", "title": "What If We've Been Scaling Stream Processing Wrong All Along", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "We\u2019ve normalised extraordinary inefficiency in stream processing. Thousands of events/sec don't justify repartition storms, serialization overhead, state migration. 
This talk explores a different path: Kafka Streams DSL, adopt Flink-like exactly-once semantics, Project Loom, and challenging the assumption that stream processing must be distributed.", "description": "Your Kafka Streams application just rebalanced. Again. Your Flink checkpoint is timing out. Again.\r\n\r\nHere's an uncomfortable truth: most stream processing applications don't operate at Uber scale. They handle thousands of events per second\u2014complex joins, stateful aggregations, valid use cases - but nowhere near the volumes that justify the operational complexity we've accepted as normal.\r\n\r\nYet we pay the full distributed systems tax anyway. Repartition topics doubling network I/O and storage. Repeated serialization burning CPU cycles, often accounting for a significant amount of the total compute of an application. Standby replicas sitting idle. State migration or restoration during deployments. And the human cost: specialized expertise that takes years to develop, expert teams that are expensive to build and painful to lose.\r\n\r\nWe've normalized extraordinary inefficiency in the name of horizontal scalability that many applications will never need.\r\n\r\nBut rethinking stream processing in 2026 doesn't mean \"just use Postgres.\" \r\nIn this talk, I'll share an early-stage exploration of a different approach. 
A framework that preserves the Kafka Streams DSL, borrows Flink's approach to exactly-once semantics, leverages Project Loom for high concurrency\u2014and challenges a fundamental assumption that both frameworks share.\r\n\r\nThis is an invitation to question conventional wisdom and explore what stream processing could look like when we stop distributing by default.", "recording_license": "", "do_not_record": false, "persons": [{"code": "F3BEM7", "name": "Hartmut Armbruster", "avatar": "https://program.berlinbuzzwords.de/media/avatars/F3BEM7_D2dlYf6.webp", "biography": "Hartmut is a software engineer and tech lead with a strong passion for architecture, data streaming, and distributed systems. He has designed and delivered solutions for mission-critical platforms, working with clients including HSBC, NEX Group plc, Raiffeisen Switzerland, GoodLabs Inc., Deutsche Bahn, and eu-LISA. Hartmut is driven by a desire to see the bigger picture and excels at aligning engineering teams through clear, compelling architectural designs.", "public_name": "Hartmut Armbruster", "guid": "3f381aac-046b-5256-9420-1fd55c21063b", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/F3BEM7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/A3JKMH/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/A3JKMH/", "attachments": []}, {"guid": "d91cdaea-81f9-5d5f-a59a-1da04f61f376", "code": "GH8HEH", "id": 90431, "logo": null, "date": "2026-06-09T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-90431-detecting-hidden-bias-in-datasets-before-models-fail", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GH8HEH/", "title": "Detecting Hidden Bias in Datasets Before Models Fail", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Hidden bias in datasets silently breaks machine learning systems in production. 
This talk shows how to detect data imbalance, leakage, and coverage gaps early using practical metrics, visualizations, and open-source tools\u2014before misleading offline metrics turn into costly real-world failures.", "description": "Machine learning models rarely fail because of algorithms \u2014 they fail because of data. This talk focuses on practical techniques for detecting hidden bias in datasets before models reach production. Drawing from real-world ML systems, it covers how regional, temporal, and behavioral imbalances distort model behavior while remaining invisible to standard metrics. Attendees will learn how to identify distribution drift, uncover feature leakage, and detect coverage gaps across segments and time windows. The session demonstrates concrete workflows, diagnostics, and visualizations that can be applied using open-source tools to improve data quality, model reliability, and long-term trust in ML-driven products.", "recording_license": "", "do_not_record": false, "persons": [{"code": "9RKSWN", "name": "Stas Don", "avatar": "https://program.berlinbuzzwords.de/media/avatars/9RKSWN_VgViV4f.webp", "biography": "Stanislav Don is a Data Scientist at eBay, working on production machine learning systems and model reliability. His work focuses on data quality, bias detection, and monitoring ML models in real-world environments. 
He regularly shares practical lessons from deploying ML at scale through conference talks and applied research.", "public_name": "Stas Don", "guid": "0f5869ec-2f85-56e3-9000-89c329236142", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/9RKSWN/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GH8HEH/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/GH8HEH/", "attachments": []}, {"guid": "86e1a3a9-51f2-5b00-8acd-2f5f83b617bb", "code": "UW9W9C", "id": 88059, "logo": null, "date": "2026-06-09T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Palais Atelier", "slug": "bbuzz26-88059-what-you-should-know-about-constraints-in-postgresql-18", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/UW9W9C/", "title": "What you should know about constraints in PostgreSQL 18", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk explains how constraints work in Postgres by exploring the pg_constraint catalog and core concepts like table vs. column constraints, constraint triggers, domains and constraint deferrability through SQL queries. It then covers what\u2019s new in Postgres 18 including temporal keys, NOT NULL as a first-class constraint, NOT ENFORCED and more.", "description": "PostgreSQL 18 introduces significant enhancements to constraints, the first line of defense for maintaining data integrity. This talk focuses on new capabilities in version 18, including non-overlapping PRIMARY KEY, UNIQUE, and foreign key constraints; NOT NULL constraints becoming first-class citizens; the introduction of NOT ENFORCED constraints and improved support for partitioned tables. 
We\u2019ll look at what\u2019s new, why it matters and how to apply these features in real-world systems.\r\n\r\nWe\u2019ll begin with a detailed walkthrough of the pg_constraint catalog, covering less commonly discussed concepts such as constraint deferrability, constraint triggers, domains and related internals. From there, we\u2019ll move on to what\u2019s new in PostgreSQL 18. A major addition is temporal keys, bringing PostgreSQL a step closer to supporting temporal data models. Another key change is NOT NULL becoming a standard constraint along with the implications of that promotion. We\u2019ll also explore NOT ENFORCED constraints and other recent additions and briefly look ahead to what\u2019s coming in PostgreSQL 19.", "recording_license": "", "do_not_record": false, "persons": [{"code": "B8AKHE", "name": "G\u00fcl\u00e7in Y\u0131ld\u0131r\u0131m Jelinek", "avatar": "https://program.berlinbuzzwords.de/media/avatars/B8AKHE_c1O8xAl.webp", "biography": "G\u00fcl\u00e7in started working with PostgreSQL at a startup in 2012 and was immediately struck by how powerful it is. Since then, she has been an active member of the PostgreSQL community, organizing conferences, giving talks and contributing in various ways. In recognition of her commitment, she was elected to the PostgreSQL Europe Board in 2017 and recognized as a PostgreSQL contributor in 2024.\r\n\r\nDriven by her interest in PostgreSQL automation and cloud technologies, G\u00fcl\u00e7in joined 2ndQuadrant where she led cloud development efforts until the company was acquired by EDB in 2020. She is also an active member of Postgres Women, advocating for greater diversity and inclusion in technical communities.\r\n\r\nG\u00fcl\u00e7in currently works at Xata, where she continues to focus on PostgreSQL engineering. Beyond her professional work, she is a co-founder of Kadin Yazilimci (Women Developers of Turkey) and has led its core team for more than 11 years. 
In 2023, she launched the Diva: Dive into AI conference as a Kadin Yazilimci initiative and has been part of the organizing team since.\r\n\r\nShe lives in Prague where she has been the co-founder and organizer of the monthly Prague PostgreSQL Meetup for over eight years. G\u00fcl\u00e7in remains deeply involved in the PostgreSQL community and is committed to contributing to the long-term health and sustainability of the project.", "public_name": "G\u00fcl\u00e7in Y\u0131ld\u0131r\u0131m Jelinek", "guid": "1f84d441-7ade-5564-a7b9-8d27386e5dc3", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/B8AKHE/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/UW9W9C/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/UW9W9C/", "attachments": []}, {"guid": "1ec27629-9947-539a-99e5-043d8a167964", "code": "DLYAP8", "id": 91052, "logo": null, "date": "2026-06-09T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Palais Atelier", "slug": "bbuzz26-91052-ai-in-the-physical-world-from-observation-to-discovery", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/DLYAP8/", "title": "AI in the physical world: from observation to discovery", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "In 2026, AI is moving beyond digital tasks into the physical world. It increasingly interacts with instruments, experiments, and real-world data. Physicists stand at this frontier, using deep learning, LLMs, and agents to analyze nature itself. What have we learned about AI when it meets reality?", "description": "Artificial intelligence is moving beyond text generation and digital optimization into domains where uncertainty, scale, and scientific rigor dominate. 
Modern physics provides a uniquely demanding testbed for this shift: deep learning is used to reconstruct complex events, identify rare phenomena, and search for anomalies in high-dimensional datasets characterized by sparse signals and strict statistical constraints. At the same time, AI is taking on more structured roles in scientific workflows \u2014 from code generation and literature synthesis to emerging agent-based approaches \u2014 raising fundamental questions about how far AI can support scientific reasoning in practice.\r\n\r\nA central challenge is operational integration. AI methods are increasingly explored for decision support in complex research facilities: tuning accelerator parameters, assisting telescope operations, and adapting to evolving environmental and hardware conditions. Yet claims of autonomous discovery or fully AI-driven infrastructure have often proven difficult to reproduce outside controlled settings. A balanced, engineering-focused assessment of both successes and limitations is therefore essential.\r\n\r\nIn this talk, I will survey real-world applications of AI across modern physics, from collider experiments to large-scale astronomy systems. I will highlight measurable gains alongside negative results, sources of bias, and stability issues that matter for production environments. The presentation concludes with a concrete case study from gamma-ray astrophysics, illustrating both the opportunities and the practical limits of integrating AI into data analysis pipelines and next-generation observatory infrastructure.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TYY9M8", "name": "Dmitriy Kostunin", "avatar": "https://program.berlinbuzzwords.de/media/avatars/TYY9M8_fb6GC27.webp", "biography": "An astrophysicist and computer scientist based in the Berlin metropolitan area, earned a PhD from the Karlsruhe Institute of Technology (KIT) in 2015. 
Currently a researcher at the German Electron Synchrotron (DESY), contributing to the development of the next-generation Cherenkov Telescope Array Observatory (CTAO).", "public_name": "Dmitriy Kostunin", "guid": "f45f50fa-5dbb-567a-a91f-ec400be836e2", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/TYY9M8/"}, {"code": "A9PFAR", "name": "Julian von Hoerschelmann-Schliwinski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/A9PFAR_g033oUZ.webp", "biography": "Julian von Hoerschelmann-Schliwinski is an astrophysicist and PhD candidate at DESY and based in Berlin. Working on the ULTRASAT space mission, he specializes in the intersection of space instrumentation and data analysis pipelines. He combines deep technical expertise with a focus on applying scientific rigor and AI-supported operations to real-world industrial challenges.", "public_name": "Julian von Hoerschelmann-Schliwinski", "guid": "4b075950-a6e1-5ee9-9068-26fc313b7cd4", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/A9PFAR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/DLYAP8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/DLYAP8/", "attachments": []}], "Frannz Salon": [{"guid": "90987bd4-c230-5297-8f2b-d54a9b2f72ea", "code": "ZWUDWR", "id": 86547, "logo": null, "date": "2026-06-09T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Frannz Salon", "slug": "bbuzz26-86547-writes-3-ways-postgres-apache-kafka-and-apache-iceberg", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZWUDWR/", "title": "Writes, 3 ways: Postgres, Apache Kafka\u00ae and Apache Iceberg\u2122", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Learning new things is hard, but a useful way to think about new things is by comparing them to things you already know. In this talk, we'll compare writes between 3 different popular data services: Postgres, Apache Kafka and Apache Iceberg. 
In doing so, we'll learn a bit about the evolution of how we've thought of data storage as developers.", "description": "The world of data services is evolving rapidly, with adoption of open table formats like Apache Iceberg\u2122 picking up steam quickly. But \u201cdata services\u201d is a pretty broad category, and none of these services is quite like the other.\r\n\r\nIn this talk we\u2019ll take a step back to look at three data services: Postgres, Apache Kafka and Apache Iceberg, and how they each handle writes. In doing so, we\u2019ll trace a history through how data services have evolved in the world of distributed systems and big data. We\u2019ll understand the key differences and similarities between these services. Finally, we\u2019ll take a look at what\u2019s coming next in the world of open source data, from Postgres and beyond. \r\n\r\nThis session is meant as a refresher for existing data engineers as well as a primer for junior engineers: Most developers know a bit about Postgres but they might not fully understand the internals, and many engineers are getting heavily involved in Iceberg, but might not understand why it's relevant.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KVW9PZ", "name": "Celeste Horgan", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KVW9PZ_xJNiKyO.webp", "biography": "Celeste Horgan is a Sr. OSS Developer Advocate and OSPO Lead at Snowflake. Previous roles include work at Aiven, The Linux Foundation, Stripe and commercetools. She has worked in open source since 2020, is a former contributor to the Kubernetes project, and currently immersed in the Postgres open source ecosystem. 
Her work has been featured in the New York Times and she regularly speaks internationally at technical conferences.", "public_name": "Celeste Horgan", "guid": "10bff601-53ec-5708-bde9-89c09049598a", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/KVW9PZ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZWUDWR/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/ZWUDWR/", "attachments": []}, {"guid": "1e14a980-228a-5118-bed9-43201ce63f9a", "code": "XQHAM9", "id": 86920, "logo": null, "date": "2026-06-09T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-86920-gitops-for-n8n-treating-workflows-as-code", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XQHAM9/", "title": "GitOps for n8n: Treating Workflows as Code", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "n8n-gitops is an open-source CLI that applies GitOps principles to n8n workflows. This talk shows how workflows can be exported, reviewed, versioned, and deployed from Git instead of manually promoted via the UI. Through a live demo, we explore safer deployments, rollbacks, and lessons learned operating automation as code.", "description": "Automation workflows frequently run critical business logic, yet they are often excluded from the same operational discipline applied to application code. In many teams, n8n workflows are created visually, copied between environments, and modified directly in production, making changes hard to audit, review, or roll back.\r\n\r\nThis talk presents n8n-gitops, an open-source project that explores how GitOps principles can be applied to n8n without changing how workflows are authored.\r\n\r\nThe session starts by framing the problem: why manual promotion of workflows, UI-driven deployments, and inline code create operational risk as systems grow. 
From there, we introduce the core ideas behind n8n-gitops: treating Git as the single source of truth, exporting workflows in mirror mode, externalizing code for proper review, and deploying deterministically from Git references.\r\n\r\nA live demonstration will show:\r\n\r\n- Exporting workflows from n8n into a Git repository in mirror mode\r\n\r\n- Externalizing Python and JavaScript code into first-class files\r\n\r\n- Reviewing workflow changes using normal Git diffs and pull requests\r\n\r\n- Deploying workflows from a specific Git tag or commit\r\n\r\n- Rolling back safely by redeploying a previous Git reference\r\n\r\nThe second half of the talk focuses on lessons learned:\r\n\r\n- What GitOps brings to automation platforms\r\n\r\n- Why credentials are intentionally excluded from full automation\r\n\r\n- Trade-offs compared to UI-driven or enterprise Git integrations\r\n\r\nWhen this approach improves safety\u2014and when it adds unnecessary friction\r\n\r\nThis is not a product presentation, but an experience report on extending GitOps beyond infrastructure into workflow engines, aimed at engineers working with operations, automation, and platform tooling who want stronger guarantees around change, traceability, and deployment.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BNZBWR", "name": "Joao Gilberto Magalhaes", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BNZBWR_Tmph8m6.webp", "biography": "Jo\u00e3o Gilberto Magalh\u00e3es (JG) is a seasoned software developer and DevOps/Platform Engineer with a strong background in building and operating distributed systems. He has hands-on experience designing applications and the infrastructure that runs them, working across backend development, cloud platforms, and automation. 
Throughout his career, JG has helped organizations evolve from monolithic, manually operated systems into scalable, resilient, and observable platforms using AWS, Kubernetes, CI/CD pipelines, and Infrastructure as Code. His developer background allows him to approach DevOps pragmatically, focusing on developer experience, system reliability, and real production constraints. An active open-source contributor and maintainer, JG shares tools and patterns that emphasize simplicity, maintainability, and operational clarity. He brings a practical, engineering-first perspective, connecting deep technical work with measurable business outcomes.", "public_name": "Joao Gilberto Magalhaes", "guid": "98635c0f-8e69-5982-82ce-36fe02c85b04", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/BNZBWR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XQHAM9/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/XQHAM9/", "attachments": []}, {"guid": "c003c5bd-d4bd-5a02-b9eb-e39e5f91b366", "code": "CJRUW3", "id": 88007, "logo": null, "date": "2026-06-09T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-88007-real-time-ml-pipelines-feature-chaining-with-chronon", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/CJRUW3/", "title": "Real-Time ML Pipelines: Feature Chaining with Chronon", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Modern ML applications demand features computed in near real-time with sub-100ms latencies. This talk dives into Chronon, an OSS feature platform bridging streaming data infrastructure and production ML. 
Using a two-tower search pipeline example, we'll show how we can chain embeddings with tabular features while minimizing hot-path computation.", "description": "Traditional feature engineering pipelines force teams to choose between freshness and latency, leading to complex dual architectures that are expensive to maintain and prone to training-serving skew. For search and recommendation systems, this trade-off is particularly painful: you need a blend of fresh signals (user, query, and item features) and their corresponding embeddings for retrieval and ranking, but can't sacrifice the sub-100ms latencies these systems need to meet.\r\n\r\nThis talk explores how [Chronon](https://chronon.ai) solves this challenge through a unified abstraction over batch and streaming computation, allowing teams to define features once and serve them with minimal latency while keeping them updated in near real-time. Chronon has been battle-tested in production at companies like Stripe, Airbnb, Netflix, and OpenAI, serving billions of predictions daily.\r\n\r\nWe'll use a two-tower search retrieval and ranking pipeline as our primary case study, walking through:\r\n* Computing real-time user and item embeddings for candidate retrieval\r\n* Chaining embedding computation with tabular features to power ranking models\r\n* Minimizing computation in the serving hot-path reducing infrastructure costs by orders of magnitude\r\n\r\nAudience takeaways:\r\n* How Chronon unifies batch and streaming feature computation\r\n* Chronon's pluggable architecture with respect to table formats, streaming buses, KV stores and model platforms\r\n* Chronon's approach to minimize serving latency while maximizing feature freshness in production ML systems\r\n* How one can build ML pipelines that chain feature computation with model inference / embedding\r\n* Real-world lessons from companies serving billions of predictions daily\r\n\r\nThis talk sits at the intersection of search, data streaming, and AI in 
production\u2014ideal for ML engineers, search platform teams, and anyone building real-time intelligent applications at scale.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HSQQGW", "name": "Varant Zanoyan", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EUZAP8_Epe70ar.webp", "biography": "Bio: Varant spent the last 13 years building data infrastructure for AI and ML at Airbnb and Palantir. During this time, he became one of the original authors of Chronon, the recently open sourced feature and embedding platform. Currently, he is Co-Founder of Zipline AI, which is building an enterprise platform around the project.", "public_name": "Varant Zanoyan", "guid": "0a7e1788-ce1e-58aa-a5c0-6dca391882eb", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/HSQQGW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/CJRUW3/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/CJRUW3/", "attachments": []}, {"guid": "6ab2b6a4-5dfa-5468-869b-ad74e793abe3", "code": "HUWSBR", "id": 90923, "logo": null, "date": "2026-06-09T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-90923-the-failures-that-don-t-crash-mlops-for-ai-agents", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/HUWSBR/", "title": "The Failures That Don't Crash: MLOps for AI Agents", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk takes four reliability patterns from distributed systems and shows what they look like inside an agent architecture. How to shadow-test an agent. Why your circuit breakers need confidence thresholds. What an eval harness looks like when your system is non-deterministic. And why human oversight degrades faster than anyone admits.", "description": "AI agents are shipping to production without the reliability patterns we spent decades building for distributed systems. 
Only 37% of teams run online evaluations on their agents (LangChain State of Agent Engineering 2026). The rest have no systematic way to detect when an agent produces a confident, plausible, wrong answer.\r\n\r\nThis talk bridges that gap. Drawing from 15 years of building systems at scale (50 billion requests/month at Start.io, shadow deployment pipelines at Riskmethods, and the core MLOps platform at Qwak) I'll present four reliability patterns adapted for agent architectures:\r\n\r\n1. Shadow testing agents against a baseline before promoting them to production\r\n2. Circuit breakers with confidence thresholds instead of simple error rates\r\n3. Evaluation harnesses designed for non-deterministic outputs\r\n4. Structured human oversight that accounts for automation bias decay\r\n\r\nEach pattern comes with implementation details: what to measure, where to hook into the agent lifecycle, and what failure modes to watch for. The examples are framework-agnostic and based on real production systems, not toy demos.\r\n\r\nThe audience will walk away with concrete patterns they can apply to their own agent deployments whether they're building with LangChain, LlamaIndex, custom frameworks, or bare API calls.", "recording_license": "", "do_not_record": false, "persons": [{"code": "FBBMGW", "name": "Bartosz Mikulski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/FBBMGW_isrE4QY.webp", "biography": "Bartosz Mikulski is a Senior Data Engineer at Start.io, where he works on ML systems handling 50 billion requests per month. He co-created the MLOps platform at Qwak (acquired for $230M) and contributed to the book \"97 Things Every Data Engineer Should Know\" (O'Reilly). He trains engineering teams on Python, MLOps, and AI. Over 500 technical articles published (mikulskibartosz.name). 
Previous speaker at Berlin Buzzwords, Infoshare, DataNatives, and DevOpsDays.", "public_name": "Bartosz Mikulski", "guid": "a50e0544-0128-58ca-b114-3287bc6e5e2d", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/FBBMGW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/HUWSBR/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/HUWSBR/", "attachments": []}, {"guid": "d655e61d-777e-5c4c-a63a-6763edcb9d45", "code": "H9UL7Y", "id": 91185, "logo": null, "date": "2026-06-09T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-91185-how-to-tell-if-your-agent-used-the-right-stuff", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/H9UL7Y/", "title": "How to Tell If Your Agent Used the Right Stuff", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Many so-called \u201cagent failures\u201d are actually context failures in disguise. In this session, we\u2019ll explore how to tell whether your agent truly saw and used the right context, using techniques like tracing and attribution, golden datasets for context-aware evaluation, and targeted probes to test retrieval quality.", "description": "Your agent answered confidently, did it use the right evidence? We\u2019ll walk through a repeatable debugging workflow for RAG + tool-using agents: instrument traces, inspect retrieved chunks, run attribution and citation checks, and isolate failure modes (missing recall, bad ranking, distractors, stale docs). 
You\u2019ll learn how to create a lightweight golden set, write probe questions, and track retrieval + answer metrics so improvements are measurable, not vibes.", "recording_license": "", "do_not_record": false, "persons": [{"code": "H798FW", "name": "Apurva Misra", "avatar": "https://program.berlinbuzzwords.de/media/avatars/H798FW_8wZ5nd6.webp", "biography": "Apurva Misra is a machine learning engineer, speaker, and founder of Sentick, where she helps growing teams unlock practical, ROI-driven AI solutions across automations, predictive analytics, and copilots.\r\nAs a consultant, she builds end-to-end AI systems from discovery to production. In one client engagement, she delivered an AI customer support system that reduced support emails by more than 30%.\r\n\r\nHer academic work includes a University of Waterloo Master\u2019s focused on driver cognitive distraction detection, publications in IEEE Access with 100+ citations so far.\r\n\r\nShe especially enjoys the education side of AI getting founders and teams up to speed on what\u2019s genuinely useful and how to apply it through over 40 publicly listed talks, workshops, webinars, panels, and podcast appearances.", "public_name": "Apurva Misra", "guid": "1a9fd091-d7df-5b16-8263-c198cd39846d", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/H798FW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/H9UL7Y/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/H9UL7Y/", "attachments": []}, {"guid": "935f5537-793e-5981-aadc-fb47e7857ae0", "code": "G3RFDB", "id": 91409, "logo": null, "date": "2026-06-09T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-91409-sunset-for-the-wild-west-making-ml-disciplined-by-default", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/G3RFDB/", "title": "Sunset for the Wild West: Making ML disciplined by default", "subtitle": "", "track": null, "type": "Talk", 
"language": "en", "abstract": "Many novel machine learning techniques started as clever hacks that just happened to work, but the demands of building real systems can be at odds with this creative culture. Learn about our open-source stack to improve quality-of-life for ML researchers and infrastructure teams alike \u2014 and how their concerns aren't as different as you might think.", "description": "At first glance, MLOps teams have an unenviable challenge, since they exist to bridge the gap between machine learning practitioners and infrastructure engineers, who work at opposite ends of the application stack and have distinct vocabularies, skills, and goals. Practitioners often adopt an anything-goes creative approach and figure out _why_ a technique works after it's already getting results; this culture has led to many advances in applied machine learning but can be in tension with building reliable systems. However, there's a surprising commonality between ML practitioners and infrastructure teams, and their concerns may not be as different as they appear.\r\n\r\nInfrastructure engineers care about security, observability, and predictable utilization while ML practitioners care about reproducibility, understandability, and performance. This session will argue that the diverse concerns of these groups are often manifestations of the same underlying systems challenges, and that the same open-source tools can help both audiences address their pain points. We'll draw on our experience helping researchers get experiments into production at scale and helping infrastructure teams deploy and manage enormous clusters. 
Most importantly, we did this while meeting practitioners where they are: without requiring researchers to become release engineers or demanding that SRE teams start caring about gradients or manifolds.\r\n\r\nYou'll come away from this talk with concrete tools and playbooks to make machine learning systems safer and more predictable, to eliminate the error-prone manual work of getting code from an experimental environment ready for collaboration or production, to help researchers achieve reproducible results, to better understand the software your team wants to run and the infrastructure that supports it, to balance overhead and observability for demanding workloads, and to ensure that you know at a glance what's actually running on your compute clusters \u2014 from project-specific Kubernetes configurations all the way down to device drivers and everything in between.", "recording_license": "", "do_not_record": false, "persons": [{"code": "V7EF3R", "name": "William Benton", "avatar": "https://program.berlinbuzzwords.de/media/avatars/V7EF3R_GZC4FA0.webp", "biography": "William Benton is a software engineer at NVIDIA, where he builds tools to help make machine learning systems easier to develop, more understandable, and more reliable. 
In previous roles, Will's responsibilities included establishing and improving the expected value of accelerated data science frameworks for everyday practitioners, leading teams of data scientists and engineers, contributing to many open-source communities, and developing novel static analyses for real-world software.", "public_name": "William Benton", "guid": "d0cb7874-0585-5c70-aab5-d818c70cc5ed", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/V7EF3R/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/G3RFDB/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/G3RFDB/", "attachments": []}, {"guid": "78829ef6-1f43-5277-9e4f-aa141c8d16bc", "code": "R37LPK", "id": 91664, "logo": null, "date": "2026-06-09T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Frannz Salon", "slug": "bbuzz26-91664-escaping-the-cloud-high-performance-ai-in-your-browser", "url": "https://program.berlinbuzzwords.de/bbuzz26/talk/R37LPK/", "title": "Escaping the Cloud: High-Performance AI in your Browser", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Server-side inference is the bottleneck of modern AI, creating costs and privacy hurdles. But what if the solution is scaling down to the browser? This session investigates Client-Side AI using WebGPU, ONNX Runtime, and Transformers.js. We\u2019ll explore the reality of hardware access, model size, and the 2026 trade-offs of browser based execution.", "description": "Server-side inference is the bottleneck of modern AI. It introduces network latency, creates massive operational costs, and forces complex privacy compliance. 
But what if we could push the compute entirely to the edge, specifically, the browser tab?\r\n\r\nThis session explores the architecture of **Client-Side AI**, where the strategy is to distribute the workload to the user's own hardware.\r\n\r\nWe will investigate the modern browser-based ML stack:\r\n\r\n- The Runtime: How **ONNX Runtime** provides a near-native execution environment for models trained in PyTorch or TensorFlow.\r\n- The Hardware Access: Leveraging **WebGPU** to unlock direct access to the client\u2019s GPU, bypassing the limitations of legacy WebGL.\r\n- The Pipeline: A technical look at optimizing transformer models (quantization, caching) for delivery over the wire using libraries like **Transformers.js**.\r\n\r\nBut most of all, we will look at actual demos of LLMs, speech and computer vision models all running in the browser. We\u2019ll be honest about the trade-offs: memory limits, model size constraints, and the reality of browser compatibility in 2026. \r\n\r\nJoin us to see if the future of AI scaling is actually... no servers at all.", "recording_license": "", "do_not_record": false, "persons": [{"code": "QFSMUG", "name": "Johannes Kolbe", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QFSMUG_UekUIHx.webp", "biography": "Hey,\r\n\r\nI'm Johannes, a Data Scientist who loves to tell educative stories about Machine Learning methods and AI. 
Preferably I'm doing this in Open Source communities.\r\n\r\nI've been working with Computer Vision for more than 10 years, ranging from designing my own Haar-Cascade face detection, over research on autonomous cars and helping people configure their photobooks automatically, all the way to understanding the needs of small and medium sized enterprises, to create tailored solutions for them.", "public_name": "Johannes Kolbe", "guid": "6bc9aab7-4c1e-57c5-ac96-e67af8999cd1", "url": "https://program.berlinbuzzwords.de/bbuzz26/speaker/QFSMUG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/R37LPK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/bbuzz26/talk/R37LPK/", "attachments": []}]}}]}}}