{"$schema": "https://c3voc.de/schedule/schema.json", "generator": {"name": "pretalx", "version": "2026.1.1"}, "schedule": {"url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/schedule/", "version": "0.18", "base_url": "https://program.berlinbuzzwords.de", "conference": {"acronym": "berlin-buzzwords-2023", "title": "Berlin Buzzwords 2023", "start": "2023-06-18", "end": "2023-06-20", "daysCount": 3, "timeslot_duration": "00:05", "time_zone_name": "Europe/Berlin", "colors": {"primary": "#3D3182"}, "rooms": [{"name": "Kesselhaus", "slug": "2113-kesselhaus", "guid": "0eebb650-d6b8-53cb-9d33-48dd47df08c1", "description": null, "capacity": null}, {"name": "Maschinenhaus", "slug": "2114-maschinenhaus", "guid": "c6fb8c6b-f3de-5975-8424-16bf52eead3a", "description": null, "capacity": null}, {"name": "Palais Atelier", "slug": "2115-palais-atelier", "guid": "2fd45b3f-34a5-5cdc-83ff-c5179ae5fb09", "description": null, "capacity": null}, {"name": "Frannz Salon", "slug": "2116-frannz-salon", "guid": "53bc35d4-3bfa-5d32-b6f3-228e6d6dd639", "description": null, "capacity": null}], "tracks": [], "days": [{"index": 1, "date": "2023-06-18", "day_start": "2023-06-18T04:00:00+02:00", "day_end": "2023-06-19T03:59:00+02:00", "rooms": {"Palais Atelier": [{"guid": "27f888a5-481d-553c-ab46-d63bdb35583c", "code": "K7BZXH", "id": 30538, "logo": null, "date": "2023-06-18T15:00:00+02:00", "start": "15:00", "duration": "03:00", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-30538-barcamp", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/K7BZXH/", "title": "Barcamp", "subtitle": "", "track": null, "type": "Workshop", "language": "en", "abstract": "Barcamps are informal sessions, a kind of \"un-conference\", with a schedule decided on the day.", "description": "Barcamps are informal sessions, a kind of \"un-conference\", with a schedule decided on the day. 
It is all driven by the interests and expertise of those who attend so each one is different, but ours are always great!\r\n\r\nAlthough the barcamp doesn't have a strict schedule, it won't be completely devoid of structure! #bbuzz barcamps are dynamic events, focused on the overall Berlin Buzzwords topics, tackling the same challenges but in a different format. At the barcamp each session runs for 30 minutes giving enough time to get into the meat of a topic, but without a chance of anyone getting bored. These are participatory sessions and more inclusive than regular conference talks, with everyone taking part. You can help by leading the session, by giving some insights, by asking some great questions, or maybe just with your enthusiasm.\r\n\r\nThe barcamp will be coordinated and moderated by Nick Burch.\r\n\r\nRegistration starts from 2:30pm", "recording_license": "", "do_not_record": false, "persons": [{"code": "97HYST", "name": "Nick Burch", "avatar": "https://program.berlinbuzzwords.de/media/avatars/97HYST_759PqjE.webp", "biography": "Nick is heavily involved in a number of Apache projects, such as Tika and POI, while having the fortune to know many of the people involved in the Apache Big Data and Search space! When not helping out with Apache things, Nick works as the Director of Engineering at FLEC, where he leads a team making heavy use of Open Source technologies. 
When not helping improve the logistics industry, he is often to be found attending or organising BarCamps, Geek Nights, or other such fun events dedicated to sharing what's great and new!", "public_name": "Nick Burch", "guid": "02fe34a8-176c-520f-a723-b897478d00b2", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/97HYST/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/K7BZXH/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/K7BZXH/", "attachments": []}]}}, {"index": 2, "date": "2023-06-19", "day_start": "2023-06-19T04:00:00+02:00", "day_end": "2023-06-20T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "5d3b34dc-6ffb-52b1-afa0-81900c1fdd7e", "code": "3WEQZF", "id": 31134, "logo": null, "date": "2023-06-19T09:15:00+02:00", "start": "09:15", "duration": "00:15", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-31134-opening-session", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/3WEQZF/", "title": "Opening Session", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Join us as we kick off Berlin Buzzwords 2023", "description": "", "recording_license": "", "do_not_record": false, "persons": [], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/3WEQZF/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/3WEQZF/", "attachments": []}, {"guid": "4c1dfd06-f033-565d-94e0-9c091ea4f2d6", "code": "BKUVLR", "id": 29798, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/BKUVLR/Ding-Jennifer_wEkHrVL.png", "date": "2023-06-19T09:35:00+02:00", "start": "09:35", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-29798-what-defines-the-open-in-open-ai", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/BKUVLR/", "title": "What defines the \u201copen\u201d in 
\u201copen AI\u201d?", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk focuses on unpacking this year\u2019s big buzzwords of \u201copen AI\u201d and \u201cresponsible AI\u201d to highlight the range of (sometimes contradictory) activities that exist under these umbrella terms and how", "description": "While the majority of AI production is concentrated within a few companies in even fewer countries, alternative pathways are emerging for more people to participate in the process of building, applying, and governing ML models. Open Artificial Intelligence (open AI) initiatives offer new spaces to reimagine how AI is developed and who can be part of the process. However, over the past year, the intensification of AI model development and hype has made the already nebulous term \u201cAI\u201d even more confusing when extended with terms like \u201copen\u201d, \u201cresponsible\u201d, \u201ctrustworthy\u201d, and \u201cdemocratic\u201d. This talk focuses on unpacking this year\u2019s big buzzwords of \u201copen AI\u201d and \u201cresponsible AI\u201d to highlight the range of (sometimes contradictory) activities that exist under these umbrella terms and how the AI field is expanding the practice of \u201copen\u201d beyond traditional FOSS contexts. Following an overview of the current open and responsible AI landscape, we will end with a discussion on community priorities for focus and intervention to build AI production pipelines that live up to aspirational attributes, like \u201copen.\u201d", "recording_license": "", "do_not_record": false, "persons": [{"code": "QMZFXB", "name": "Jennifer Ding", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QMZFXB_t6bDcHK.webp", "biography": "Jennifer Ding is a Research Application Manager at The Alan Turing Institute, the UK\u2019s national institute for data science and artificial intelligence. 
Previously, she was a startup founder and data scientist at several public interest tech companies, creating data products for industry and government partners. She enjoys massaging data big and small, and is co-leading the first ever London Data Week, which takes place 3 -9 July 2023.", "public_name": "Jennifer Ding", "guid": "4eb5b7e8-e6bd-5114-85b0-48e7a79772e5", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/QMZFXB/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/BKUVLR/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/BKUVLR/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/BKUVLR/resources/Jennifer_Ding_-_What_Defines_the_Open_EzHjw87.pdf", "type": "related"}]}, {"guid": "0233bf17-db89-57ba-8d3e-743f3dee468b", "code": "VUGYME", "id": 27856, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/VUGYME/Arora-Atita_BjKE0QQ.png", "date": "2023-06-19T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-27856-vectorize-your-open-source-search-engine", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VUGYME/", "title": "Vectorize Your Open Source Search Engine", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Fascinated by vector search but don't know where to start?\r\nJoin us to crack the code and leverage the potential of vector search to delight your users.", "description": "Neural search (a.k.a. Vector search) has rewritten the standards of information retrieval in many different domains.\r\nVector search can help you gather a better understanding of the user query intent, drive product recommendations, search across different source data (text, images, audio, video), deliver better results, improve personalization and create a more successful user experience. 
Vector search goes beyond keywords to harvest the potential of graphs and embeddings to match users to the intended document, product, job, picture, song, or video.\r\nAs fascinating as this may sound it's easy to find ourselves lost in the deluge of new information.  \r\nIf you're struggling to get started, understand what vector search can bring to the party, add cool new models such as OpenAI models and want to avoid common pitfalls, this talk is for you.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TRRRA8", "name": "Atita Arora", "avatar": "https://program.berlinbuzzwords.de/media/avatars/TRRRA8_2nqHoJb.webp", "biography": "Atita has been working to develop, customize, and optimize Enterprise & E-commerce search engines for many years. She is an active contributor to many open-source tools. She holds 2 Masters degrees in Computer Applications and Strategic Business Management. She has worked and supported in many different roles in various organizations and even founded a small Search consultancy in India in 2017. 
\r\nShe has a keen interest in personalizing search and influencing customer interaction using NLP, ML, and AI.", "public_name": "Atita Arora", "guid": "51065684-2404-50ee-a9b8-3e222ebbaddd", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/TRRRA8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VUGYME/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VUGYME/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/VUGYME/resources/Atita_Arora_-_Vectorize_your_Opensour_ZHaysB1.pdf", "type": "related"}]}, {"guid": "9c7c6d44-c4d5-5cac-9293-49ddae355412", "code": "KPELMM", "id": 27877, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/KPELMM/Karumuri-Suman_kLs9ElG.png", "date": "2023-06-19T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-27877-kaldb-serverless-lucene-at-petabyte-scale", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KPELMM/", "title": "Kaldb: serverless lucene at petabyte scale", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this talk, we share our experiences, best practices, and lessons learned in designing and operating a serverless Lucene serving system at PB scale.", "description": "Running petabyte-scale columnar stores has become a routine operation in today's data-driven world. However, running a petabyte-scale search system is still a challenging task operationally. Enter Kaldb, an open-source, serverless Lucene serving system designed specifically for petabyte-scale Lucene workloads. 
We've designed Kaldb to automate and reduce operational toil without sacrificing performance or reliability.\r\n\r\nBut designing a serverless Lucene system at this scale poses several unique challenges, such as ensuring durability of data, modifying replication and caching protocols for high availability, high fanout reads, managing ephemeral nodes, and more. \r\n\r\nIn this talk, we'll delve into the details of how our redesigned Kaldb system overcomes these challenges. We've separated durability of the data from storage, separated compute from storage, modified replication algorithms to handle ephemeral nodes, use Kafka as a write ahead log and developed a novel query execution layer to handle high-fanout queries. Our implementation not only reduces operational toil but also adds several self-healing properties to the system. We're proud to say that Kaldb currently runs on Kubernetes at petabyte scale with improved reliability and performance.\r\n\r\nJoin us in this talk to learn more about how Kaldb can help you overcome the challenges of running a petabyte-scale Lucene serving system. We'll share our experiences, best practices, and lessons learned in designing and operating a serverless Lucene serving system at this scale, and provide practical insights and techniques that you can use to optimize your own search systems.", "recording_license": "", "do_not_record": false, "persons": [{"code": "PU3FFY", "name": "Suman Karumuri", "avatar": "https://program.berlinbuzzwords.de/media/avatars/PU3FFY_nQkoHVp.webp", "biography": "Suman Karumuri is a Principal Software Engineer and the tech lead for Observability at Airbnb. As an expert in distributed tracing, Suman has been a tech lead of Zipkin and a co-author of the OpenTracing standard, a Linux Foundation project under the CNCF. 
With extensive experience, Suman has spent years building and operating petabyte-scale log search, distributed tracing, and metrics systems at notable companies like Slack, Pinterest, Twitter, and Amazon. In his leisure time, Suman enjoys engaging in board games, exploring the outdoors through hiking, and spending quality time with his children.", "public_name": "Suman Karumuri", "guid": "42948ce6-3908-5949-8903-a3833c15af5f", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/PU3FFY/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KPELMM/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KPELMM/", "attachments": []}, {"guid": "fa98a059-5699-5ead-a6f4-b1b079de74f4", "code": "YTLX8T", "id": 27943, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/YTLX8T/Bergum-Jo-Kristian_eWlgmPY.png", "date": "2023-06-19T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-27943-boosting-ranking-performance-with-minimal-supervision", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YTLX8T/", "title": "Boosting Ranking Performance with Minimal Supervision", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Using generative Large Language Models (LLMs) to generate synthetic labeled data to train in-domain ranking models. Distilling the knowledge and power of generative LLMs into effective ranking models.", "description": "Transformer language models are highly effective text rankers; however, training Transformer-based neural ranking models requires vast amounts of labeled supervised data, which is costly and time-consuming. What if you could teach a ranking model without behavioral click data or human annotations? Enter generative large language models (LLMs) such as GPT-3. 
\r\n\r\nThis talk showcases a novel approach to generating labeled data with minimal human supervision. First, with just three human-labeled queries and document examples, an open-source LLM generates synthetic questions for all documents in the index. Then, the synthetic data trains a much smaller, cost-efficient Transformer ranking model, which outperforms a strong BM25 baseline by 10 nDCG@10 points on a popular relevance dataset. \r\n\r\nThe innovative method saves on costly annotation efforts and enables faster adaptation to search ranking in new domains, and allows organizations to revolutionize their search capabilities without breaking the bank.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZWB38G", "name": "Jo Kristian Bergum", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ZWB38G_UC6rwVn.webp", "biography": "Jo Kristian is a Distinguished Engineer @Yahoo, where he spends his time working on the open-source Vespa.ai serving engine. Jo Kristian has 20 years of experience with deploying search systems at scale.", "public_name": "Jo Kristian Bergum", "guid": "1093925e-8915-550e-ac6f-6b16b3f53e77", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/ZWB38G/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YTLX8T/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YTLX8T/", "attachments": []}, {"guid": "5213db91-9dcf-5e8e-b6fb-b4e63a1d5c17", "code": "YPUPAA", "id": 27989, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/YPUPAA/Benedetti-Alessandro_Cg3KRSu.png", "date": "2023-06-19T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-27989-introducing-multi-valued-vector-fields-in-apache-lucene", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YPUPAA/", "title": "Introducing Multi-valued Vector Fields in 
Apache Lucene", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Multiple vectors in a field dedicated to K-nearest-neighbors search has been a fundamental problem for Apache Lucene for long. \r\nThis talk describes how this has been finally designed and implemented.", "description": "Since the introduction of native vector-based search in Apache Lucene happened, many features have been developed, but the support for multiple vectors in a dedicated KNN vector field remained to explore. \r\nHaving the possibility of indexing (and searching) multiple values per field unlocks the possibility of working with long textual documents, splitting them in paragraphs and encoding each paragraph as a separate vector: scenario that is often encountered by many businesses. \r\nThis talk explores the challenges, the technical design and the implementation activities happened during the work for this contribution to the Apache Lucene project.\r\nThe audience is expected to get an understanding of how multi-valued fields can work in a vector-based search use-case and how this feature has been implemented.", "recording_license": "", "do_not_record": false, "persons": [{"code": "GJ3PTP", "name": "Alessandro Benedetti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GJ3PTP_7fNBvIJ.webp", "biography": "Following his passion he entered the Apache Lucene and Solr world in 2010 becoming an active member of the community and Apache Lucene/Solr Committer and PMC member. 
\r\nExperience with a great variety of clients has taught him to be a proficient and professional consultant.\r\nRecently Alessandro has contributed Neural Search to Apache Solr and worked on integrating Apache Solr\u2019s Learning To Rank in various company ecosystems with the aim of improving search result relevancy.\r\nPrior to that he designed and developed an enterprise semantic search engine known as Sensefy using approaches such as Named Entity Recognition at indexing time, advanced autocompletion, and document similarity metrics.", "public_name": "Alessandro Benedetti", "guid": "bd8c60c2-a21e-5832-978a-2ca73e1cddd0", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/GJ3PTP/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YPUPAA/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YPUPAA/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/YPUPAA/resources/Alessandro_Benedetti_-_Multi_Valued_V_k7X51YQ.pdf", "type": "related"}]}, {"guid": "934cbe59-3c42-50d3-92b4-9cea5d15b76c", "code": "XEC7W3", "id": 27803, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/XEC7W3/Perinetti-Lara_cITESO7.png", "date": "2023-06-19T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-27803-privacy-preserving-web-search", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/XEC7W3/", "title": "Privacy-Preserving Web Search", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "An ethical overview of how a privacy-focused search engine has to adapt its behavior from crawling to ranking web documents without knowing anything about the user and still be as relevant as possible", "description": "Our ubiquitous connection to the internet triggered awareness and concerns regarding privacy preservation 
issues.\r\nHowever, while privacy is a more and more known subject, a few points remain to be clarified. It is expected from a privacy-preserving web search engine not to track you via your queries and clicks history nor to sell your personal data. Nevertheless, it can use non-personal data to improve search engine relevance.\r\nMoreover, using a privacy-focused web search engine means being ready to adapt the way of querying it to add the information the search engine does not have about you.\r\nThis talk will focus on how we can create a web search engine that preserves its users' privacy while focusing on the relevance of its results and the privacy preservation of its users.", "recording_license": "", "do_not_record": false, "persons": [{"code": "QRHXDT", "name": "Lara Perinetti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QRHXDT_2BAZMdF.webp", "biography": "Machine Learning Engineer focused on NLP and IR @Qwant.", "public_name": "Lara Perinetti", "guid": "964dcbba-59e1-5c4a-8921-eb4aa7e0bf7e", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/QRHXDT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/XEC7W3/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/XEC7W3/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/XEC7W3/resources/Lara_Perinetti_-_Privacy-Preserving_W_wHGjPIY.pdf", "type": "related"}]}, {"guid": "b70901fb-02b3-50a6-ba1b-d3a33aae8bba", "code": "QUQFBY", "id": 28095, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/QUQFBY/Paponaud-Aline_Precup-Lucian_foOAT1B.png", "date": "2023-06-19T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-28095-towards-a-decentralized-and-collaborative-search-engine", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/QUQFBY/", "title": "Towards a 
decentralized and collaborative search engine", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this session we will share our vision towards an alternative, decentralized and collaborative search engine, from social considerations to technical implementation.", "description": "There are many alternatives to traditional search engines, culminating in the recent breakthrough of ChatGPT. Furthermore, the trend is to move towards decentralized, peer-to-peer and community-driven architectures, as the recent movement towards Mastodon implementations testifies.\r\n\r\nWe wanted to bring these concepts from GitHub, Wikipedia and Mastodon to search engines and build the all.site platform - the collaborative search engine. We will show how developer communities can help organize information and build a new relevance model.\r\n\r\nWe will share with you the experience of this adventure: what we tried, what we learned, the limits encountered and the challenges to come. We will present the internal functioning of a search engine with its different modules, the architecture and the infrastructure, the notions of security and ethics, ending with the economic model and the prototypes currently in place.", "recording_license": "", "do_not_record": false, "persons": [{"code": "H8BKZC", "name": "Aline Paponaud", "avatar": "https://program.berlinbuzzwords.de/media/avatars/H8BKZC_FBSJttu.webp", "biography": "CTO of Adelean, working with search and providing consulting services and expertise around Elasticsearch, Lucene and Solr. 
She brings her energy to leveraging search engines, as they become more and more essential in every domain.", "public_name": "Aline Paponaud", "guid": "f2799518-1b4c-58fc-9d75-3b6da69461c7", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/H8BKZC/"}, {"code": "DXAHHB", "name": "Lucian Precup", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DXAHHB_cVQTaMu.webp", "biography": "Lucian Precup is the CTO of [all.site](https://all.site) - the collaborative search engine developed at [Station F](https://stationf.co) in Paris. With his colleagues at [Adelean](https://adelean.com), Lucian develops solutions for indexing, searching and analyzing data. Lucian regularly shares his knowledge in specialized conferences and organizes the [Search & Data Meetup](https://www.meetup.com/fr-FR/search-and-data/).", "public_name": "Lucian Precup", "guid": "15cf6ff2-c885-5e06-af41-4e027691f577", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/DXAHHB/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/QUQFBY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/QUQFBY/", "attachments": []}, {"guid": "08080b5f-c68a-5100-9247-db1329f30041", "code": "JXSJB8", "id": 28170, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/JXSJB8/Albertson-Lars_pvylmac.png", "date": "2023-06-19T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-28170-how-to-not-kill-people", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JXSJB8/", "title": "How to not kill people", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "As AI grows, software manages more risks to humans. Moving fast and breaking things won't do. 
We will look at aviation to learn how successful risk management structures might look in software & AI.", "description": "With the rise of artificial intelligence, we give more control of our lives to software. We thereby introduce new risks, and the fatal Uber crash in 2018 is the first example of AI causing an accidental death. It will be up to us as software engineers to build systems safe and reliable enough to entrust with important decisions.  Our culture, however, includes praising companies that move fast and break things (Facebook), celebrate principled confrontation (Uber), fake self-driving demonstrations (Tesla), and are right, a lot (Amazon). As an industry, we need to radically improve to meet the challenge, or more people will die.\r\n\r\nIn this presentation, we will look at aviation - the industry most successful at continuously improving safety - and attempt to learn. We will look at aviation safety principles, compare with similar practices in software engineering, and see how we can translate safety principles that have worked well in aviation to the software engineering domain.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HUXQYG", "name": "Lars Albertsson", "avatar": "https://program.berlinbuzzwords.de/media/avatars/HUXQYG_PfKikqQ.webp", "biography": "Lars Albertsson is the founder of Scling, a data engineering startup based in Stockholm. Scling provides data-factory-as-a-service - customer tailored data engineering, analytics, and data science. Lars is a frequent conference speaker on data engineering and data strategy. 
Before founding Scling, Lars has worked at Google, Spotify, Schibsted, and as an independent consultant, helping organisations create business value from data processing and machine learning.", "public_name": "Lars Albertsson", "guid": "703b40f4-9485-5763-a724-c7863aefe36f", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/HUXQYG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JXSJB8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JXSJB8/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/JXSJB8/resources/Lars_Albertson_-_How_to_not_kill_peop_mC8D8Qu.pdf", "type": "related"}]}, {"guid": "529f5070-81dc-5d9b-8a6f-3ebef3556cf6", "code": "73UNZD", "id": 24838, "logo": null, "date": "2023-06-19T17:20:00+02:00", "start": "17:20", "duration": "01:00", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-24838-the-debate-returns-with-more-vectors-which-search-engine", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/73UNZD/", "title": "The Debate Returns (with more vectors) Which Search Engine?", "subtitle": "", "track": null, "type": "Panel", "language": "en", "abstract": "It's that old question - which search engine should I choose for my project? Elasticsearch, Solr, Opensearch (all based on Lucene), or Vespa, or maybe one of the new vector search engines?", "description": "What's best for a particular use case? What advantages does one approach have over another? How has vector search changed the picture? 
Does it even matter which one I choose?\r\n\r\nModerator Charlie Hull from OpenSource Connections and expert panellists representing various search engine platforms will offer a lively & balanced debate and Q&A session to help you figure out the big question: Which Search Engine?", "recording_license": "", "do_not_record": false, "persons": [{"code": "N3V3QS", "name": "Charlie Hull", "avatar": "https://program.berlinbuzzwords.de/media/avatars/N3V3QS_O86UR2l.webp", "biography": "With over 20 years in the business of open source search, Charlie Hull helps companies across the world build powerful and accurate search engines as a Managing Consultant at OpenSource Connections, the search relevance people. He is co-author of the book 'Searching the Enterprise', a regular conference keynote speaker, prolific blogger and writer, and hosts and organises the Haystack conference series.", "public_name": "Charlie Hull", "guid": "2d38510e-6906-5117-8c03-b5e76d5a816e", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/N3V3QS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/73UNZD/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/73UNZD/", "attachments": []}], "Maschinenhaus": [{"guid": "4ec21e69-884b-5a51-aab6-62f33df54ebf", "code": "WFSUKH", "id": 28152, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/WFSUKH/Li-Zhibo_hjNf5Bp.png", "date": "2023-06-19T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-28152-declarative-data-collections-for-portable-parallelism", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WFSUKH/", "title": "Declarative Data Collections for Portable Parallelism", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "This talk introduces a novel programming model - the user declares data collections with 
the properties, and these declarations can be transparently ported to multiple platforms including GPUs.", "description": "I would like to introduce Declarative Abstractions for Data Collections, which provides a novel, declarative approach to data collections for convenient, portable, and efficient parallel computation. Modern programming languages provide programmers with rich abstractions for data collections as part of their standard libraries, e.g., containers in the C++ STL, the Java Collections Framework, or the Scala Collections API. Typically, these collections frameworks are organized as hierarchies that provide programmers with common abstract data types (ADTs) like lists, queues, and stacks. While convenient, this approach introduces problems that ultimately affect application performance due to users over-specifying collection data types, limiting implementation flexibility.\r\n\r\nWith the introduced framework, programmers explicitly select properties for their collections, thereby truly decoupling specification from implementation. By making collection properties explicit, immediate benefits materialize in the form of reduced risk of over-specification and increased implementation flexibility. In terms of computational performance, our framework helps shield the application developer from parallel implementation details, where the property-based data collection can be ported to multiple platforms, including GPU and FPGA, without modifying the declaration on the properties.\r\n\r\nThe framework provides a data-centric approach for high performance computation, where the users focus on what properties the container(collection) would have and do not need to work around the implementation details. The framework has been developed based on C++ metaprogramming and provides modern C++ API for the users. 
This framework will benefit the community as a convenient and high-performance programming model for parallel data processing in heterogeneous environments. The audience will get to know a practical programming model for data-centric parallelism, which is useful for their everyday job regarding parallel data analysis, data storage/filtering, etc.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DQHDZT", "name": "Zhibo Li", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DQHDZT_NjLprLM.webp", "biography": "Zhibo Li is a final-year PhD student of Informatics at the Informatics School, University of Edinburgh. His research interests include System & Architecture, especially in Data-Centric Parallelism, Compiler, and Programming Model. He is currently working on a Property-based Collection Skeletons library.", "public_name": "Zhibo Li", "guid": "454ddb5b-2511-5d8e-a110-b3c0e4166697", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/DQHDZT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WFSUKH/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WFSUKH/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/WFSUKH/resources/Zhibo_Li_-_Declarative_Data_Collectio_QSjRBtk.pdf", "type": "related"}]}, {"guid": "d416bad1-3076-5be2-8b03-0b849a9882f9", "code": "3GPYJQ", "id": 27932, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/3GPYJQ/Herreros-Quentin_Vaesey-Tom_LKemSHi.png", "date": "2023-06-19T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27932-how-to-train-your-general-purpose-document-retriever-model", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/3GPYJQ/", "title": "How to train your general purpose document retriever model", "subtitle": "", "track": null, "type": "Talk", 
"language": "en", "abstract": "A practical guide for training learned sparse models to outperform BM25 on zero-shot document retrieval tasks", "description": "Large language models augment traditional information retrieval (IR) approaches with both high quality language parsing skills and knowledge external to the corpus. However, training a state of the art general purpose model for document retrieval is challenging. This talk is motivated by our experiences training a high quality retriever model for use alone or together with BM25 to improve relevance out-of-the-box in Elasticsearch.\r\n\r\nWe chose to focus on the learned sparse model (LSM) architecture. LSMs for information retrieval (IR) were recently popularised by SPLADE [1] and have various attractive properties for our purpose. They enable retrieval via inverted indices for which Elasticsearch has a high quality implementation in Lucene. They provide tuneable parameters which allow one to trade off accuracy with index size and query latency. They enable word level highlighting to explain matches. And they perform well in zero-shot settings.\r\n\r\nIn this talk we survey LSMs and discuss how they fit into the IR landscape. We describe some challenges training language models effectively. We briefly survey some techniques which have been studied previously and found to improve performance both in and out of domain. These include downstream task aware pre-training and knowledge distillation. Finally, we give an overview of the key ingredients of our full training pipeline and useful lessons we learned along the way.\r\n\r\nOur goal was to consistently improve on BM25 relevance in a zero-shot setting. In particular, we set out to beat BM25 across a suite of diverse IR tasks gathered together in the BEIR benchmark [2] without using any in domain supervision. We survey other published results on this benchmark and discuss how we compare. 
\r\n\r\n[1] SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking, Formal et al\r\n\r\n[2] BEIR: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models, Thakur et al", "recording_license": "", "do_not_record": false, "persons": [{"code": "UQZHEK", "name": "Tom Veasey", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UQZHEK_3WkCYD5.webp", "biography": "Tom Veasey has worked at Elastic since September 2016. He is a member of the machine learning team. He started out as a data scientist working on satellite control, phased array radar and drug discovery projects. He then had detours into Electronic Design Automation and FX derivatives pricing. He studied Physics at the University of Cambridge.", "public_name": "Tom Veasey", "guid": "2948f70b-16a4-5efc-ace8-671c9abb1232", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/UQZHEK/"}, {"code": "MX3AWW", "name": "Quentin Herreros", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MX3AWW_TqcoJcj.webp", "biography": "Throughout my career, I have worked on diverse subjects such as medical resonance imaging, infra-red sensor characterization, and predicting carbon footprint in buildings using machine learning. 
I have been working on natural language processing for three years and I joined Elastic nine months ago.", "public_name": "Quentin Herreros", "guid": "b57be92a-7eb6-529d-9ea4-33d76aa36397", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/MX3AWW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/3GPYJQ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/3GPYJQ/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/3GPYJQ/resources/Tom_Veasey__Quentin_Herreros_-_How_to_ctHFNBC.pdf", "type": "related"}]}, {"guid": "3e6b31e6-7e07-5261-81af-bb3dc0ef0e0f", "code": "YPNYCD", "id": 27635, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/YPNYCD/Kutsenko-Olena_fIN8I8F.png", "date": "2023-06-19T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27635-clickhouse-what-is-behind-the-fastest-columnar-database", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YPNYCD/", "title": "ClickHouse: what is behind the fastest columnar database", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Columnar databases seem to be full of mysteries and confusion.  In this introduction for ClickHouse, we'll take apart its building blocks to see how it achieves its remarkable performance.", "description": "An open source columnar database ClickHouse is in many ways exceptional - it is exceptionally fast, exceptionally efficient, but also, at times exceptionally confusing. \r\n\r\nIts approach to handling data goes against many principles and concepts that we use in other databases. 
To give some examples: its primary index doesn't index each row and doesn't guarantee uniqueness; a secondary index is used to skip data and doesn't point to specific rows; JOINS is a complex topic and transactions are supported partially, not to mention that its SQL dialect holds a couple of surprises up its sleeve. \r\n\r\nBut, all that said, if used correctly, ClickHouse is a superb solution for online analytical processing (OLAP).\r\n\r\nThe goal of this talk is to help you get the most out of ClickHouse and avoid the pitfalls. We'll talk about OLAP and columnar databases. We'll touch on topics of indexing, searching and disk storage. We'll look at the reasons behind the most puzzling concepts of ClickHouse, so that by the end of the talk you find them not only logical, but maybe even fascinating.\r\n\r\nIf your challenge is analysing terabytes of data - this talk is for you. If you're a data scientist looking for tools to work with big data - this talk is for you. And, of course, if you are just curious about what makes ClickHouse crazy fast - this talk is for you as well.", "recording_license": "", "do_not_record": false, "persons": [{"code": "LLBXBT", "name": "Olena Kutsenko", "avatar": "https://program.berlinbuzzwords.de/media/avatars/LLBXBT_Q3Gez1v.webp", "biography": "Olena is a seasoned expert in data, sustainable software development, and teamwork. With a background in software engineering, she's led teams and developed mission-critical applications at Nokia, HERE Technologies, and AWS. Currently, she works at Aiven where she supports developers and customers in using open-source data technologies such as Apache Kafka, ClickHouse, and OpenSearch. She is also an international public speaker and regularly presents at conferences around the world. 
She holds AWS Developer and Solutions Architect certifications, and is also a Confluent Catalyst.", "public_name": "Olena Kutsenko", "guid": "317c2014-3ceb-566c-80f7-b9f6e41f062d", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/LLBXBT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YPNYCD/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YPNYCD/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/YPNYCD/resources/Olena_Kutsenko-clickhouse-slides-to-s_d951kuX.pdf", "type": "related"}]}, {"guid": "cc22919b-9fad-52c9-bb76-0dbfba03d552", "code": "8EQS3K", "id": 27965, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/8EQS3K/Driesprong-Fokko_mU2N8Z4.png", "date": "2023-06-19T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27965-tip-of-the-iceberg", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/8EQS3K/", "title": "Tip of the Iceberg", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Apache Iceberg is an open table format that has wide support among open-source and cloud vendors. After this talk, you'll be comfortable with all the concepts and how to use Iceberg.", "description": "Apache Iceberg is a high-performance format for huge analytic tables. Iceberg brings the reliability and simplicity of SQL tables to big data while making it possible for engines to work with the same tables, at the same time. Iceberg is a layer on top of your traditional Parquet tables with all the best practices from the database world. Using this you can do ACID operations on a table that solely lives in cloud storage.\r\nIn the talk, I'll first introduce Iceberg and its history, and the companies that are using and actively contributing to it. 
We'll take a peek under the hood and I'll explain the different concepts such as metadata, manifest lists, and manifest itself, and how it uses this to help the query engine, and maintain correctness. Next, I'll go through the schema, partition, and sorting evolution and how this is done in a lazy fashion so you don't have to rewrite your multi-petabyte table, and finally I'll do a quick demo using PyIceberg.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JMVXR7", "name": "Fokko Driesprong", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JMVXR7_w1WGDOu.webp", "biography": "Fokko is an open-source enthusiast and member of the Apache Software Foundation. Committer on Apache {Avro, Parquet, Druid, Airflow, Iceberg} and currently working as an open-source developer for Tabular where he focuses on PyIceberg; a non-JVM implementation of Iceberg. In his free time, he spends most of his time with friends and family.", "public_name": "Fokko Driesprong", "guid": "07a9789e-4ca7-552a-998c-e72ab2ac3811", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/JMVXR7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/8EQS3K/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/8EQS3K/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/8EQS3K/resources/Fokko_Driesprong_-_Tip_of_the_Iceberg_wNjgvMC.pdf", "type": "related"}]}, {"guid": "e157f6f5-6e8a-5c77-babc-eaf14c554bc9", "code": "CUTBT7", "id": 27947, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/CUTBT7/Srivastava-Shikhar_030PmUK.png", "date": "2023-06-19T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27947-no-mean-feat-upgrading-a-customized-solr-to-upstream-solr", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/CUTBT7/", "title": "No 
Mean Feat: Upgrading a Customized Solr to Upstream Solr", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Learn how the News Search Infrastructure Team at Bloomberg migrated from a customized implementation of Apache Solr to the upstream Apache Solr", "description": "Technology upgrades are a key pillar of software infrastructure. However, upgrading search and information retrieval systems is a complex task. At Bloomberg, we had extended the open source Apache Solr implementation with in-house patches to customize it for our use cases. This made upgrading to a newer version quite challenging. But, when you have close to a billion documents that are used by major financial institutions across the world, you cannot afford any mistakes.\r\n\r\nLearn how the News Search Infrastructure Team at Bloomberg migrated from a highly customized implementation of Apache Solr to the upstream Apache Solr, while also making sure that the quality, correctness, and performance of the system was not affected. You will learn about the different strategies we used before, during, and after the migration to make the upgrade transparent to our internal users, all while serving millions of requests everyday.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TA7ZNP", "name": "Shikhar Srivastava", "avatar": "https://program.berlinbuzzwords.de/media/avatars/TA7ZNP_lTu0v4o.webp", "biography": "Shikhar is a Software Engineer on the News Search Infrastructure Engineering team at Bloomberg in London. He has worn multiple hats in his professional career, from developing ETA prediction machine learning models for startups in India to developing low latency, financial market data systems at Bloomberg. 
He recently started dabbling with Apache Solr and has fallen in love with it", "public_name": "Shikhar Srivastava", "guid": "c6096174-3a9f-5130-ae22-a247d7089fc2", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/TA7ZNP/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/CUTBT7/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/CUTBT7/", "attachments": []}, {"guid": "9af68c2b-03b7-5f69-8aa6-1a53f5d6a5e5", "code": "DDRGJG", "id": 27129, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/DDRGJG/Wang-Bo_Werk-Maximilian_2Mp4WyM.png", "date": "2023-06-19T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27129-model-fine-tuning-for-search-from-algorithms-to-infra", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DDRGJG/", "title": "Model Fine-tuning For Search: From Algorithms to Infra", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Deep learning for search has become a hot topic, while pre-trained neural nets do not function well as expected. We will discuss the algorithms behind model fine-tuning, and how to scale it up.", "description": "Deep learning for search has become a hot topic in recent years, it enables users to search based on semantics, search based on visual similarity, and conduct cross-multi/modality searches.\r\n\r\nThough promising, it is non-trivial to use deep neural nets inside your system and expect it works out of the box. In fact, in most cases, it doesn't work. The reason can be summarised into three pillars: task shift, domain shift, and knowledge shift. \r\n\r\nFirstly, most of the deep learning models are trained to minimize classification/regression/segmentation loss, rather than search loss.  
Secondly, the dataset on which the model was trained could be quite different from the data you're working on. Last but not least, we observed a notable knowledge gap between search engineers and machine learning engineers.\r\n\r\nIn this talk, we would like to gently guide the audience into the neural search world. We'll first discuss the motivation behind model tuning. Then, we'll discuss the algorithm frameworks behind model fine-tuning, such as deep metric learning, contrastive learning and self-supervised learning. Last but not least, we'll talk about the infrastructure behind a mature training service and how we could scale it up.\r\n\r\nWe believe the topic could be interesting for the Berlin Buzzwords audience since it covers several aspects of the tags: search, data science, and scale. After the 40-minute talk, the audience is expected to understand:\r\n1. What is neural search and why it is important.\r\n2. The algorithms to improve pre-trained neural nets for single-modality search/cross-modality search.\r\n3. Our tech stack to scale the training platform up.", "recording_license": "", "do_not_record": false, "persons": [{"code": "MBZKVT", "name": "Maximilian Werk", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MBZKVT_OMLdqG0.webp", "biography": "I enjoy bringing machine learning into production at Jina.ai as Head of Engineering. The combination of high quality engineering, digging into data and the real-world problem at hand thrills me.", "public_name": "Maximilian Werk", "guid": "8a27a56d-1ce0-53ae-a5d2-f1bf646cfd5c", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/MBZKVT/"}, {"code": "SZLMS3", "name": "Bo Wang", "avatar": "https://program.berlinbuzzwords.de/media/avatars/SZLMS3_81RNODI.webp", "biography": "Bo Wang is a senior Machine Learning engineer who's leading the development of Finetuner. 
He got his BSc from Lanzhou University, China, and MSc from TU Delft, the Netherlands with a background in multimedia information retrieval. He is the core developer of first wave semantic search framework MatchZoo, and also the developer of Jina Core & Docarray.", "public_name": "Bo Wang", "guid": "979ee2dd-6c76-5a5b-be9b-50a38c5749db", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/SZLMS3/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DDRGJG/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DDRGJG/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/DDRGJG/resources/Bo_Wang_-_Model_fine-tuning_for_searc_JILo8oI.pdf", "type": "related"}]}, {"guid": "33bc9cb1-f730-5381-b9ed-e9f90516d594", "code": "8WUWFL", "id": 27997, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/8WUWFL/Ramirez-Javier_W9rQMdJ.png", "date": "2023-06-19T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27997-ingesting-over-4-million-rows-a-second-on-a-single-instance", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/8WUWFL/", "title": "Ingesting over 4 million rows a second on a single instance", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "When we set up to write an open source fast time series database, we realised we would need every trick in the book to make it as performant as possible. 
This talk will show what's inside.", "description": "How would you build a database to support sustained ingestion of several hundreds of thousands of rows per second while running near real-time queries on top?\r\n\r\nIn this session I will go over some of the technical decisions and trade-offs we applied when building QuestDB, an open source time-series database developed mainly in Java, and how we can achieve over four million row writes per second on a single instance without blocking or slowing down the reads. There will be code and demos, of course.\r\n\r\nWe will also review a history of some of the changes we have gone through over the past two years to deal with late and unordered data, non-blocking writes, read-replicas, or faster batch ingestion.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KZHVVS", "name": "Javier Ramirez", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KZHVVS_PB9R9mX.webp", "biography": "As a Developer Advocate at QuestDB, I help developers make the most of their (fast) data, I make sure the core team behind QuestDB listens to absolutely every piece of feedback I get, and I facilitate collaboration in our open source repository.\r\n\r\nI love data storage, big and small. I have extensive experience with SQL, NoSQL, graph, in-memory databases, Big Data, and Machine Learning. 
I like distributed, scalable, always-on systems.", "public_name": "Javier Ramirez", "guid": "12d90169-ae2e-5f44-8a99-b8c48dc6323b", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/KZHVVS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/8WUWFL/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/8WUWFL/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/8WUWFL/resources/Javier_Ramirez_-_Ingesting-over-four-_tFj4GVs.pdf", "type": "related"}]}], "Palais Atelier": [{"guid": "dbb33177-5550-5f34-8a64-8ea1e61cdd09", "code": "ZNJLXG", "id": 33356, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/ZNJLXG/Rende-Aydan_sWjaY9s.png", "date": "2023-06-19T10:40:00+02:00", "start": "10:40", "duration": "00:20", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-33356-migrate-data-mesh-in-mind", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ZNJLXG/", "title": "Migrate Data, <Mesh> in mind", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "For quite some time, Hadoop served as the data warehouse for Kleinanzeigen. In this presentation, our objective is to provide an overview of our approach, which involves implementing a cloud-based data pipeline with the help of dbt and Airflow.", "description": "For quite some time, Hadoop served as the data warehouse for Kleinanzeigen. However, the central teams eventually decided to say goodbye to this old friend due to its outdated nature and high costs. This migration presented us with a valuable opportunity to embrace the Data Mesh strategy and establish a new data pipeline. In this presentation, our objective is to provide an overview of our approach, which involves implementing a cloud-based data pipeline with the help of dbt and Airflow. 
Furthermore, we will delve into the challenges we faced during the process, including the debugging of legacy data flows, the complexities of copying data to s3, and dealing with the domain ownership issues. By sharing these experiences, we aim to provide valuable insights into our journey.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UXD3KU", "name": "Aydan Rende", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UXD3KU_8TTuaOR.webp", "biography": "Aydan Rende is a Senior Data Engineer within the platform team at Kleinanzeigen. In this role, she develops data pipelines and assists teams in facilitating their data requirements. Aydan began her professional journey at Kleinanzeigen as a Software Engineer, working with commercial products. Alongside her professional career, she is a Formula 1 fan who has an on & off relationship with Ferrari.", "public_name": "Aydan Rende", "guid": "6292c0f4-31a1-5ff3-bfe7-3b39f8bb0c24", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/UXD3KU/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ZNJLXG/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ZNJLXG/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/ZNJLXG/resources/Aydan_Rende_-_Migrate_Data_Mesh_in_mi_0jyeP5d.pdf", "type": "related"}]}, {"guid": "a6e6224d-5b95-5a1e-913f-39b21b9d74d3", "code": "ZQ9CPX", "id": 33803, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/ZQ9CPX/Shyani-Milind_9d4jqVj.png", "date": "2023-06-19T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-33803-supercharging-your-transformers-with-synthetic-query-generation-and-lexical-search", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ZQ9CPX/", "title": "Supercharging your transformers with synthetic query 
generation and lexical search", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk will explore dramatic gains in ranking performance from small transformer models, fine-tuned with synthetic query generation and combined with lexical search, and will equip the audience to pursue the same approach using open-source tools.", "description": "Pre-trained transformers have revolutionized search. However, off-the-shelf transformers, at a fixed model size, perform poorly on out-of-domain data. Larger models have better generalization capabilities but strict latency and cost requirements limit the size of production models.\r\n\r\nIn this talk, we will demonstrate how small transformer models can be fine-tuned on specific domains, even in the absence of labelled data, using the technique of synthetic query generation. Our process involves releasing a fine-tuned 1.5B parameter query generation model that, given a document, generates multiple questions that are answered by the document. These query-document combinations are then used to train a fine-tuned model. We combine the fine-tuned model with OpenSearch lexical search tools and benchmark them. Using these tools, we demonstrate a state-of-the-art, zero-shot nDCG@10 boost of 14.30% over BM25 on a benchmark of 10 public test datasets.\r\n\r\n \r\n\r\nWe elaborate upon lessons learned from training and using large language models for query generation. We also discuss some open questions around representation anisotropy, keyword filtering and index sizes of dense models. 
Ultimately, audiences will take away from the presentation an understanding of the processes used to fine-tune small transformer models and combine them with lexical search, along with step-by-step guidance with which to pursue their own improvements in search accuracy using open-source tools.", "recording_license": "", "do_not_record": false, "persons": [{"code": "NWEKDN", "name": "Milind Shyani", "avatar": "https://program.berlinbuzzwords.de/media/avatars/NWEKDN_F9VOjMw.webp", "biography": "Milind Shyani is an applied scientist at Amazon Web Services working on language models and machine learning algorithms. He is a theoretical physicist by training and received his Ph.D. from Stanford University.", "public_name": "Milind Shyani", "guid": "69c7871a-658f-5d1d-946b-e82db1a75967", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/NWEKDN/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ZQ9CPX/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ZQ9CPX/", "attachments": []}, {"guid": "c890fc57-529d-5646-ada4-2091c5961f28", "code": "TDWCGF", "id": 27590, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/TDWCGF/Ravi-Bhavani_Zjdsflt.png", "date": "2023-06-19T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-27590-apache-airflow-in-production-bad-vs-best-practices", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TDWCGF/", "title": "Apache Airflow in Production - Bad vs Best Practices", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk will explore the bad and best practices when deploying Apache Airflow in a production environment. 
From common pitfalls such as misconfigured tasks and lack of scalability to best practices such as robust monitoring and proper security measures, it offers practical advice for production deployments.", "description": "Apache Airflow has become a popular open-source platform for managing and orchestrating data pipelines. However, as with any technology, there are good and bad ways to use it. This talk will explore the bad and best practices when deploying Apache Airflow in a production environment. From common pitfalls, such as misconfigured tasks and lack of scalability, to best practices, such as robust monitoring and proper security measures, this talk will provide practical advice for anyone looking to implement Apache Airflow in their production environment.", "recording_license": "", "do_not_record": false, "persons": [{"code": "9SG3KG", "name": "Bhavani Ravi", "avatar": "https://program.berlinbuzzwords.de/media/avatars/9SG3KG_3OE7DzL.webp", "biography": "Bhavani Ravi is an independent DataOps consultant who helps you set up scalable data infrastructures. She is also an avid technical blogger, open-source enthusiast and LinkedIn Learning Instructor.", "public_name": "Bhavani Ravi", "guid": "1e9e2796-7d3b-5e0d-909e-77307c64c3e4", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/9SG3KG/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TDWCGF/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TDWCGF/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/TDWCGF/resources/Bhavani_Ravi_-_Apache_Airflow_Bad_Vs__PmL5Ums.pdf", "type": "related"}]}, {"guid": "31a974bb-eea1-5503-85e6-28484f57bca8", "code": "XY8JRJ", "id": 26844, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/XY8JRJ/Fine-Danica_GzqoRYg.png", "date": "2023-06-19T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-26844-a-kafka-client-s-request-there-and-back-again", "url": 
"https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/XY8JRJ/", "title": "A Kafka Client\u2019s Request: There and Back Again", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Understand how data moves into and out of Apache Kafka\u00ae by taking a look at the producer and consumer request life cycle. Follow a request from an initial call to send() or poll(), all the way to disk", "description": "Do you know how your data moves into and out of your Apache Kafka\u00ae instance? From the programmer\u2019s point of view, it\u2019s relatively simple. But under the hood, writing to and reading from Kafka is a complex process with a fascinating life cycle that\u2019s worth understanding.\r\n\r\nWhen you call producer.send() or consumer.poll(), those calls are translated into low-level requests which are sent along to the brokers for processing. In this session, we\u2019ll dive into the world of Kafka producers and consumers to follow a request from an initial call to send() or poll(), all the way to disk, and back to the client via the broker\u2019s final response. Along the way, we\u2019ll explore a number of client and broker configurations that affect how these requests are handled and discuss the metrics that you can monitor to help you to keep track of every stage of the request life cycle.\r\n\r\nBy the end of this session, you\u2019ll know the ins and outs of the read and write requests that your Kafka clients make, making your next debugging or performance analysis session a breeze.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UJJAJF", "name": "Danica Fine", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UJJAJF_2ujS2Os.webp", "biography": "Danica Fine is a Senior Developer Advocate at Confluent where she helps others get the most out of their event-driven pipelines. 
In her previous role as a software engineer on a streaming infrastructure team, she predominantly worked on Kafka Streams- and Kafka Connect-based projects. She can be found on Twitter, tweeting about tech, plants, and baking @TheDanicaFine.", "public_name": "Danica Fine", "guid": "7659b6cc-e4cc-5c4a-a7e3-ba45500dc0e2", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/UJJAJF/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/XY8JRJ/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/XY8JRJ/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/XY8JRJ/resources/Danica_Fine_-_A_Kafka_Clients_Request_c0zWlGI.pdf", "type": "related"}]}, {"guid": "daa2e518-7f47-51f5-845f-3c2d6c872d92", "code": "UNYQ83", "id": 31382, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/UNYQ83/Wu-Yingjun_upiPChu.png", "date": "2023-06-19T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-31382-joining-dozens-of-data-streams-in-distributed-stream-processing-systems", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/UNYQ83/", "title": "Joining Dozens of Data Streams in Distributed Stream Processing Systems", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "This talk will explore the techniques and best practices for joining dozens of data streams, focusing on different joining mechanisms, such as binary joins and delta joins, as well as pros and cons.", "description": "As real-time data processing becomes increasingly essential, organizations face the challenge of efficiently joining and correlating data from multiple streams to gain valuable insights using distributed stream processing systems. 
This talk will explore the techniques and best practices for joining dozens of data streams, focusing on different joining mechanisms, such as binary joins and delta joins, as well as their pros and cons. Attendees will gain an understanding of various stream join techniques, learn how to optimize performance in distributed environments and apply lessons from industry experiences. Furthermore, the talk will discuss leveraging decoupled compute-storage architecture to reduce join costs. This knowledge will enable participants to harness the full potential of their data, creating efficient and powerful distributed stream processing solutions for their organizations.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BH7KTW", "name": "Yingjun Wu", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BH7KTW_GSX6xdx.webp", "biography": "Yingjun Wu is the founder of RisingWave Labs, the company developing RisingWave, a distributed SQL database for stream processing. Before running the company, Yingjun was a software engineer at the Redshift team, Amazon Web Services, and a researcher at the Database group, IBM Almaden Research Center. Yingjun received his PhD degree from National University of Singapore, and was a visiting PhD at Carnegie Mellon University. 
He has been working in the field of stream processing and database systems for over a decade.", "public_name": "Yingjun Wu", "guid": "a705555f-19ac-55cd-843a-74d73eab0373", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/BH7KTW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/UNYQ83/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/UNYQ83/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/UNYQ83/resources/Yingjun_Wu_-_joining_dozens_of_data_s_8bMIHY7.pdf", "type": "related"}]}, {"guid": "5f2391d3-0c14-5bab-b42d-c88690336e6f", "code": "JTD7GY", "id": 28166, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/JTD7GY/Burch-Nick_YqT4Xnn.png", "date": "2023-06-19T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-28166-laptop-sized-ml-for-text-with-open-source", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JTD7GY/", "title": "Laptop-sized ML for Text, with Open Source", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Advanced ML models for text may need hundreds of machines, but with open source tools and pre-trained models, you can do a lot just on your laptop or docker container. Discover what and how!", "description": "AI text models like GPT3, ChatGPT, Bing AI and Github Co-Pilot are getting a lot of buzz right now, both good and bad. Much of the training techniques are public, but the computational and data requirements mean most of us can't build our own. Using these big models typically involves cost or sharing your data. What if that's not an option?\r\n\r\nLuckily, there are a number of open source language models out there, with pre-trained versions available to download! 
They won't let you compete with Google or OpenAI, but they're good enough for a number of real world problems.\r\n\r\nWe'll start with a quick introduction to the main open ML-for-text systems like Word2vec, GloVe, ELMo and BERT, along with how they differ from traditional text relevancy like TF-IDF. Then, we'll discover how open source ML frameworks let us easily work with those techniques, and how pre-trained models let\r\nus quickly get up and running.\r\n\r\nWith our ML-for-text model running on our laptop (or hefty docker container!), next it's time to see what kinds of problems we can solve! We'll look at embeddings for search, inference, semantic reasoning, prediction and more, all with (fairly) minimal coding. Finally, we'll see how we can improve the pre-trained models for specific use-cases with our own text.\r\n\r\nIt may not run on your phone and it probably won't hallucinate incorrect answers, but there's still a lot of text problems we can solve just with open source on our laptops. And we'll share the code you need to do so!", "recording_license": "", "do_not_record": false, "persons": [{"code": "97HYST", "name": "Nick Burch", "avatar": "https://program.berlinbuzzwords.de/media/avatars/97HYST_759PqjE.webp", "biography": "Nick is heavily involved in a number of Apache projects, such as Tika and POI, while having the fortune to know many of the people involved in the Apache Big Data and Search space! When not helping out with Apache things, Nick works as the Director of Engineering at FLEC, where he leads a team making heavy use of Open Source technologies. 
When not helping improve the logistics industry, he is often to be found attending or organising BarCamps, Geek Nights, or other such fun events dedicated to sharing what's great and new!", "public_name": "Nick Burch", "guid": "02fe34a8-176c-520f-a723-b897478d00b2", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/97HYST/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JTD7GY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JTD7GY/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/JTD7GY/resources/Nick_Burch_-_Laptop-sized_ML_for_Text_NugssWi.pdf", "type": "related"}]}, {"guid": "82782ed4-fd80-501e-b66d-eb89b6245de8", "code": "DPBSAR", "id": 26702, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/DPBSAR/Wu-Qi_YtFtyWd.png", "date": "2023-06-19T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-26702-ml-with-domain-specific-ontology-for-it-security-industry", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DPBSAR/", "title": "ML with Domain-Specific Ontology for IT Security Industry", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "The BSI provides actual data on acute IT threat situations. We developed a system for detecting threats: crawling, automatic analysis with NER, NEL, provision and use of dedicated tools for evaluating", "description": "The BSI monitors and assesses the current IT security situation and its long-term changes. This includes, for example, hacker groups or newly discovered security vulnerabilities. 
For this purpose, various news sources are monitored and important information is extracted to identify current trends and gain an overview.\r\n\r\nTo optimize this process, we are working with the BSI to develop a system that supports the work by subjecting documents to automatic analysis using methods such as Named Entity Recognition (NER) and Named Entity Linking (NEL). While NER refers to the mapping of text passages to given classes through machine learning (e.g., \"browser\" to software), NEL aims at mapping to concrete entities of an ontology (e.g., \"DOS\" to \"Disk Operating System\"). We explain how we deal with the particular challenge of conceptual ambiguities (\"DOS\" stands not only for \"Disk Operating System\" but also for \"Denial of Service\"). The talk gives an insight into our entity recognition system and how we create a powerful tool for analyzing IT security documents by combining ontology and machine learning.", "recording_license": "", "do_not_record": false, "persons": [{"code": "QHXHYE", "name": "Qi Wu", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QHXHYE_z7SPQcV.webp", "biography": "Qi Wu, Machine Learning Engineer at ontolux, a brand of Neofonie GmbH, works on topics such as training and optimizing models, with a focus on finetuning and distillation, and translates current research results into usable applications for customers. During her master studies in statistics, she has already worked with Prof. Dr. Alan Akbik on the NLP framework FLAIR and worked on ML in the area of natural language processing.", "public_name": "Qi Wu", "guid": "f59621c9-a3f5-5949-9cd9-2cfb75d39d1f", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/QHXHYE/"}, {"code": "MCHQVF", "name": "Bertram S\u00e4ndig", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MCHQVF_kMqwQPQ.webp", "biography": "Bertram S\u00e4ndig leads the Machine Learning team at ontolux, a brand of Neofonie GmbH. 
He works on the adaptation, optimization, and integration of large language models for ontolux's text analysis toolkit, translating current research results into usable applications for customers.", "public_name": "Bertram S\u00e4ndig", "guid": "b7c467ca-2af9-5a01-8fc9-f51e51dd2634", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/MCHQVF/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DPBSAR/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DPBSAR/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/DPBSAR/resources/Qi_Wu__Bertram_Saendig_-_IT-Sec_NEL_wCdYsth.pdf", "type": "related"}]}, {"guid": "dc0a716c-3dca-54a3-9f65-8528c0f8dd96", "code": "JGCR9K", "id": 26832, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/JGCR9K/Neubauer-Tomas_wcLQjFL.png", "date": "2023-06-19T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-26832-building-real-time-applications-cyclist-crash-detection", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JGCR9K/", "title": "Building Real-Time Applications: Cyclist Crash Detection", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this talk, we will explore common problems faced when building real-time applications at scale, with a focus on a specific use case: detecting and responding to cyclist crashes.", "description": "As the demand for real-time data processing continues to grow, so too do the challenges associated with building production-ready applications that can handle large volumes of data and handle it quickly. 
In this talk, we will explore common problems faced when building real-time applications at scale, with a focus on a specific use case: detecting and responding to cyclist crashes.\u00a0\r\n\r\nUsing telemetry data collected from a fitness app, we\u2019ll demonstrate how we used a combination of Apache Kafka and Python-based microservices running on Kubernetes to build a pipeline for processing and analyzing this data in real-time. We'll also discuss how we used machine learning techniques to build a model for detecting collisions and how we implemented notifications to alert family members of a crash.\r\n\r\nOur ultimate goal is to help you navigate the challenges that come with building data-intensive, real-time\u00a0 applications that use ML models. By showcasing a real-world example, we aim to provide practical solutions and insights that you can apply to your own projects.\r\nKey takeaways:\r\n\u2022 An understanding of the common challenges faced when building real-time applications at scale\r\n\u2022 Strategies for using Apache Kafka and Python-based microservices to process and analyze data in real-time\r\n\u2022 Tips for implementing machine learning models in a real-time application\r\n\u2022 Best practices for responding to and handling critical events in a real-time application", "recording_license": "", "do_not_record": false, "persons": [{"code": "GSE7FN", "name": "Tom\u00e1\u0161 Neubauer", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GSE7FN_JQriUEQ.webp", "biography": "Tomas Neubauer is a co-founder and the CTO at Quix, works as a technical authority for the engineering team and is responsible for the direction of the company across the full technical stack. He was previously technical lead at McLaren, where he led architecture uplift for Formula 1 racing real-time telemetry acquisition. 
He later led platform development outside motorsport, reusing the know-how he gained from racing.", "public_name": "Tom\u00e1\u0161 Neubauer", "guid": "2b3f57e8-dbaa-52fd-ab68-ab98c24a26aa", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/GSE7FN/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JGCR9K/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JGCR9K/", "attachments": []}], "Frannz Salon": [{"guid": "f0fa6de6-c869-5bfc-8b95-fedbef42c12e", "code": "79AVEA", "id": 26706, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/79AVEA/Gheorghe-Radu_Kuc-Rafal_GLnqTiW.png", "date": "2023-06-19T11:10:00+02:00", "start": "11:10", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-26706-using-tensorflow-in-a-solr-query-parser", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/79AVEA/", "title": "Using TensorFlow in a Solr Query Parser", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Tutorial for writing Solr Query Parser that use TensorFlow for Java to augment queries.", "description": "Typically, when you need to expand a query through a model - for example, to do entity recognition or query tagging - you'd use a separate service. While this architecture is perfectly valid, the extra network hops to the \"query expansion microservices\" will impact query latency.\r\n\r\nFor autocomplete and other low-latency use-cases, you might want to trade some complexity for speed by implementing a custom query parser. 
In this talk, we'll show a working example:\r\n- we'll build a model using TensorFlow in Python that does query expansion\r\n- we'll load it with TensorFlow for Java in a Solr Query parser\r\n- now we can run queries and get them expanded directly in Solr\r\n\r\nOne can use this talk and the resources we'll share in order to implement a query parser for their own use-case. We'll also expand on the architecture trade-offs. For example, as you add more nodes and replicas to handle more query throughput, you'll expand the capacity for query expansion. Should you need to scale these separately, you can use coordinator nodes.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3CMEKA", "name": "Radu Gheorghe", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3CMEKA_DwlKQxc.webp", "biography": "Radu Gheorghe works mainly as a [search consultant](https://sematext.com/consulting) at Sematext, working with clients of all sizes on their Elasticsearch, OpenSearch and Solr projects. He is also a [trainer](https://sematext.com/training/) and does [production support](https://sematext.com/support/) for both these search engines.\r\n\r\nSometimes he helps out with the development of Sematext Cloud (an observability SaaS), mostly when it comes to Elasticsearch and log shippers (e.g. Logstash, rsyslog...). 
He also writes on the [Sematext blog](https://sematext.com/blog/author/radu7gheorghe/) or helps other publish new articles.\r\n\r\nHe co-authored a book (Elasticsearch in Action, Manning), recorded a video tutorial (Working with Elasticsearch, O'Reilly) and was a speaker at a [number of conferences](https://www.youtube.com/watch?v=ONGqk3xXRTw&list=PLjwv6_Ik6hnLEmz-rcII0cGyAIToRF4Q6), such as Berlin Buzzwords, LuceneSolrRevolution (later Activate) and Kubecon.", "public_name": "Radu Gheorghe", "guid": "e0bb8b22-5b87-5930-bd1b-c992f726ce16", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/3CMEKA/"}, {"code": "ADKESR", "name": "Rafa\u0142 Ku\u0107", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ADKESR_gdoaLxJ.webp", "biography": "Software engineer, trainer, consultant and author from time to time - some would say that he is an all in one battle weapon concentrated mostly on Lucene, Solr and Elasticsearch. Currently an Engineering Lead in Archipelo. However he also likes all the other cool stuff that is happening in the IT world. 
Likes to share his knowledge by giving talks at various meet ups and conferences.", "public_name": "Rafa\u0142 Ku\u0107", "guid": "1eb7cc2c-b6ba-5277-9561-a98d6395be51", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/ADKESR/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/79AVEA/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/79AVEA/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/79AVEA/resources/Radu_Gheorghe__Rafal_Kuc_-_Using_Tens_lUNhggh.pdf", "type": "related"}]}, {"guid": "8f4a91a7-3535-5158-90d9-712c7f153b77", "code": "UFSE8P", "id": 27866, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/UFSE8P/Norem-Savannah_rqbx521.png", "date": "2023-06-19T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-27866-when-probably-is-good-enough", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/UFSE8P/", "title": "When Probably is Good Enough", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Probabilistic data structures give developers room to massively cut down on space requirements while sacrificing a bit of accuracy, so when is probably good enough?", "description": "Examining the probabilistic data structures that come built into Redis Stack will allow us to fully understand how, why and when they work best. We'll examine each of: count min sketch, top k, and bloom and cuckoo filters. Each of these has a distinct structure that we'll start with so we can see how they work. We'll then look at why each one is probabilistic and what the consequences are for that. Then we'll look at use cases for each to see when they would best be used in the wild. 
We'll wrap up with a demonstration of the space saving capabilities, for example the size difference between a bloom filter and a set with the same items added to each.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UTY3W8", "name": "Savannah Norem", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UTY3W8_3DHCpoW.webp", "biography": "Currently a Developer Advocate at Redis, Savannah has a love for talking about all that technology can (and can't) do for people. When she's not live stream coding, or working on examples to help others get answers faster, she's either crafting, gardening, or hanging out with her husband and their cats.", "public_name": "Savannah Norem", "guid": "71731e5c-19d7-5331-af4a-a8bf3e77369f", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/UTY3W8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/UFSE8P/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/UFSE8P/", "attachments": []}, {"guid": "a7c16bc9-dce2-543c-a4d2-2fd5df6c0628", "code": "PN9VJK", "id": 28158, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/PN9VJK/Williams-Matt_LjYEqVF.png", "date": "2023-06-19T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-28158-cooking-up-a-new-search-system-recipe-search-at-cookpad", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/PN9VJK/", "title": "Cooking up a new search system: Recipe search at Cookpad", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "How we successfully transitioned the search system for the world's largest recipe sharing platform to a modern stack \u2013 including the successes, failures, team structures, and processes along the way.", "description": "Cookpad is the largest recipe sharing platform in the world. 
Our mission is to make everyday cooking fun, and central to that is our search product. Our search engine helps cooks everywhere find tasty dishes to cook from our ever-growing catalogue of six million recipes created by everyday cooks. As a global recipe search \u2013 available in 70+ countries, 30+ languages, and to over 50 million monthly users \u2013 delivering this is no mean feat.\r\n\r\nIn order to prepare for a substantial new iteration of our search product, we realised that our existing search system was not suited to our goals. Over the course of two years, we embarked on a technological and cultural transition, with the aim of giving product teams and engineers (including search engineers, data scientists, and ML engineers) greater ownership over the search experience. This included a shift to a Python-based stack and a data-driven approach to search relevance improvement.\r\n\r\nWe embarked on a transition to a new system, along with new team structures and team composition, and new strategies for improving the search experience. Over two years we delivered a new system, without halting product development along the way, and without disruption to the user experience.\r\n\r\nOur starting point was a team and system with capacity limited to maintenance and bug fixes, where relevance enhancement was delivered through incremental knowledge base tuning by SMEs (non-engineer subject matter experts). Our end point was multiple search teams who have greater ownership over the search experience and relevance improvement, assisted by SMEs, and following a process for rigorously tested hypothesis-driven experimentation.\r\n\r\nThis change involved transitioning to a new event-driven architecture, along with technologies that were new to Cookpad search, such as Kubernetes, Kafka, Python, and machine learning. 
In addition \u2013 and just as importantly \u2013 it also involved a transformation in team structures and team composition, for which we borrowed many concepts and practices learned from the search community, and also ideas from the Team Topologies movement.\r\n\r\nThis talk will cover our journey, why we did it, as well as the trials, tribulations, and successes along the way. Hopefully, it will give others who are in a similar position new ideas on how to reinvent their own search system and search function, while minimising disruption to product delivery, in order to deliver proven improvements at pace.", "recording_license": "", "do_not_record": false, "persons": [{"code": "GDXDG8", "name": "Matt Williams", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GDXDG8_gK72h4n.webp", "biography": "Matt Williams is a Principal Engineer at Cookpad, the world's largest recipe sharing platform, where he specialises in search and discovery. He has over six years of experience in building discovery experiences, with a particular interest in NLP, ML, scaleable search and recommendation, and the team structures and processes that underpin effective relevance improvement.\r\n\r\nPrior to joining Cookpad in 2019, Matt worked as a Data Scientist and ML Engineer in industry, with applications to social media analysis and real-time news analysis. Before entering the software industry, Matt was a research scientist in academia, focusing on network science and predictive user modelling. 
He holds a PhD in Computer Science from Cardiff University, where he was also a lecturer.", "public_name": "Matt Williams", "guid": "e4243626-9be1-58a9-a895-cb056331170d", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/GDXDG8/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/PN9VJK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/PN9VJK/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/PN9VJK/resources/Matt_J_Williams_-_Cooking_up_a_new_Se_psIl09j.pdf", "type": "related"}]}, {"guid": "c6a05577-3bae-5ac6-aad1-a0a91b8d7da8", "code": "DFQK8E", "id": 27991, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/DFQK8E/Pop-Radu_Dauvissat-Benjamin_423ULmT.png", "date": "2023-06-19T14:50:00+02:00", "start": "14:50", "duration": "00:20", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-27991-big-data-in-the-service-of-reliable-news", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DFQK8E/", "title": "Big data in the service of reliable news", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Data vs. Fake news : using available data to offer a critical view of the world", "description": "Brandolini's law states that debunking misinformation consumes more energy than spreading it.  \r\nWith tools that organize, transform and present data we can reduce this amount of energy and provide the tools to apprehend the world in a skeptical way as an alternative to short messages on social-media or re-interpreted news headlines.\r\nThese tools need to enquire a large base of data and to select heterogeneous sources of information.  
\r\nThe main challenge leans in the ability to harvest, aggregate and synthetically present the emerging facts.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JNFKMQ", "name": "Radu Pop", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JNFKMQ_BxhO8Hb.webp", "biography": "Radu provides Consulting Services as Solutions Architect at Adelean. He handles projects around Elasticsearch and Adelean\u2019s A2 search technology. He oversees the integration and evolution of search engines within large e-commerce platforms, marketplaces or organizations' data lakes. Prior to joining Adelean, Radu acquired a solid experience in Web archiving, operating large scale crawling systems in the context of several European research projects. He holds a PhD in Computer Science and a MSc in Distributed Systems.", "public_name": "Radu Pop", "guid": "adbcba05-e957-5634-87ae-766e99e33c4c", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/JNFKMQ/"}, {"code": "T3LZJK", "name": "Benjamin Dauvissat", "avatar": "https://program.berlinbuzzwords.de/media/avatars/T3LZJK_7qi7FTl.webp", "biography": "Java developper for almost 20 years, I also look after various fields like DevOps or big data.  \r\nI work at Adelean, a french company specialized in search engines.  
\r\nAnd when I'm not working, I like coding and testing new stuff.", "public_name": "Benjamin Dauvissat", "guid": "92f9279c-f881-5e13-948b-0d95d66a68b8", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/T3LZJK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DFQK8E/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/DFQK8E/", "attachments": []}, {"guid": "51aa2d2e-e86b-5505-8a92-19fab7386c2b", "code": "VEQHVW", "id": 28154, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/VEQHVW/Vlatko-Natali_Horgan-Celeste_RCygfpy.png", "date": "2023-06-19T15:20:00+02:00", "start": "15:20", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-28154-building-on-ramps-for-non-code-contributors-in-open-source", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VEQHVW/", "title": "Building On-Ramps for Non-Code Contributors in Open Source", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Open source software is so much more than code \u2013 docs, community and infra need maintaining. How do you attract and keep non-code contributors? Let two experienced practitioners show you the way!", "description": "\u201cContributions welcome!\u201d We\u2019ve all seen this standard disclosure across open source projects, whether they\u2019re actively looking for contributors or not. We build working groups, backlogs and enhancement proposal processes designed to bring in developers to help us work on our project, and with any luck it succeeds. But supporting open source means supporting the entire project: documentation, community meetings, events and infrastructure. One of the most common questions smaller projects have is how to attract these non-code contributors, but there aren\u2019t any easy answers out there. 
\r\n\r\nIn this talk, experienced non-technical contributors in the Kubernetes and cloud native ecosystems go through some of the ways Kubernetes has built contributor on-ramps for non-code contributions, and how you can adapt them to your projects. The talk features practical examples of what to do to support the non-code aspects of your open source project and how to attract \u2013 and retain \u2013 contributors.\r\n\r\nThis talk will cover the most common question we continually face in the cloud native community from non-k8s projects: how to attract non-technical contributors and get things done. This is clearly seen as a scalability issue for many OSS projects. We'll also share a bit about our individual stories: how we, two non-code contributors, got involved with the Kubernetes project and the challenges we faced.\r\n\r\nNext, what we know has worked: mentorship programs, pairing/shadowing, clear role documentation, easy to understand backlogs (good first issues), and other approaches we\u2019ve seen that we haven\u2019t applied personally. Finally, we'll cover why non-code contribution is both exciting and important, especially regarding governance and policy, plus how open source maintainers and companies with OSPOs can help non-code contributors get involved.", "recording_license": "", "do_not_record": false, "persons": [{"code": "GUQG77", "name": "Natali Vlatko", "avatar": "https://program.berlinbuzzwords.de/media/avatars/GUQG77_SpfltO3.webp", "biography": "Natali Vlatko (she/her) is the SIG Docs Co-Chair for the Kubernetes project and plays on the fun computer in her spare time. Her academic background is in Egyptology and Archaeology; specifically, burial customs across the various kingdoms of Ancient Egypt. 
Ask her about dead stuff.", "public_name": "Natali Vlatko", "guid": "4299145d-8bd4-50cf-a2aa-054854fc2743", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/GUQG77/"}, {"code": "QZUUVK", "name": "Celeste Horgan", "avatar": "https://program.berlinbuzzwords.de/media/avatars/QZUUVK_Nhk2Ehk.webp", "biography": "", "public_name": "Celeste Horgan", "guid": "757e99e0-d042-5c00-b1d0-1b29fb1e2d89", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/QZUUVK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VEQHVW/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VEQHVW/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/VEQHVW/resources/Natali-Vlatko_Celeste-Horgan-Building_wewfGLh.pdf", "type": "related"}]}, {"guid": "2ef89655-81f1-555d-80d0-2156c0b5ea99", "code": "VFWKWM", "id": 27533, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/VFWKWM/chukka-ram_0UmILew.png", "date": "2023-06-19T16:30:00+02:00", "start": "16:30", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-27533-who-broke-the-build-using-kuttl-to-test-and-release-faster", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VFWKWM/", "title": "Who broke the build? -Using Kuttl to test and Release faster", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "No one wants to be responsible for breaking the build. But what can you do as a developer to avoid being the bad guy? How can project leads enable their teams to reduce the occurrence of broken builds?", "description": "Description:\r\n\r\nNo one wants to be responsible for breaking the build. But what can you do as a developer to avoid being the bad guy? 
How can project leads enable their teams to reduce the occurrence of broken builds?\r\nIn talking within our own teams, we discovered that many developers weren\u2019t running sufficient integration and End to End tests in their local environments because it\u2019s too difficult to set up and administer test environments in an efficient way.\r\nThat\u2019s why we decided to rethink our entire local testing process in hopes of cutting down on the headaches, heartaches, and valuable time wasted. Enter Kuttl. Connecting Kuttl to CI builds has empowered our developers to easily configure a development environment locally that accurately matches the final test environment \u2014 without needing to become an expert CI admin themselves.\r\nThese days, we hear, \u201cWho broke the build?\u201d far less often \u2014 and you can too!\r\n\r\nSession Outline:\r\n\r\nIn this session, we\u2019ll discuss how we use kuttl to achieve more streamlined testing and fewer broken builds. We\u2019ll cover:\r\n\u25cf A quick history of our testing challenges and what led us to Kuttl\r\n\u25cf The benefits of our new testing approach \u2014 easy to configure and minimal investment\r\n\u25cf How we combine Kuttl and CI pipelines for more streamlined testing and fewer broken builds\r\n\r\nSession Key Takeaways:\r\n\r\n1. When and why we decided to rethink our e2e testing practices and our subsequent discovery of Kuttl.\r\n2. Why Kuttl has been the perfect tool for our developers to perform better local integration/e2e testing without the burden of becoming their own CI administrators.\r\n3. 
A detailed account of how we utilize Kuttl to set up development environments locally that match our final test environment in order to reduce unnecessary commits and minimize CI build breaks.", "recording_license": "", "do_not_record": false, "persons": [{"code": "P3DADS", "name": "Ram Mohan Rao Chukka", "avatar": "https://program.berlinbuzzwords.de/media/avatars/P3DADS_hahSRyW.webp", "biography": "Ram, Software Developer@JFrog. Previously worked for startup companies like CallidusCloud (SAP Company), Konylabs. Loves Automation, Linux, openSource", "public_name": "Ram Mohan Rao Chukka", "guid": "c9706a1a-9776-5e1a-95ce-fad23b5fba16", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/P3DADS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VFWKWM/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/VFWKWM/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/VFWKWM/resources/Ram_Chukka_-_Who_broke_the_build_fina_5wjkVo8.pdf", "type": "related"}]}, {"guid": "cf8adf2e-3c5b-5f63-8d55-216d4537f9bd", "code": "9YHQK8", "id": 27845, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/9YHQK8/Saidel-Keesing-Maish_1Wrlxdb.png", "date": "2023-06-19T17:20:00+02:00", "start": "17:20", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-27845-creating-chaos-in-containers", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/9YHQK8/", "title": "Creating chaos in containers", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Chaos engineering is hard, in containers it is even harder.\r\nThis session will show attendees the considerations and get them started on their way to making more resilient applications in the cloud", "description": "Chaos engineering is not a new concept, it has been around since 2011. 
The benefit of knowing the weak spots of your application before it actually breaks is extremely valuable.\r\nBut with containers, this becomes a bit more complicated. There are many layers of possible failure running under your application.\r\n\r\nIn this session you will learn more about the different layers you should be releasing your chaos experiments on, the considerations you need to take into account while testing a shared platform, and also learn about the tooling available to accomplish this.", "recording_license": "", "do_not_record": false, "persons": [{"code": "YUKTS7", "name": "Maish Saidel-Keesing", "avatar": "https://program.berlinbuzzwords.de/media/avatars/YUKTS7_0SPe17f.webp", "biography": "A public speaker, a creator of things, a writer of books, a contributor to community, and yes, also an ambulance driver. Senior Developer Advocate @AWS\r\nMaish Saidel-Keesing is a Senior Enterprise Developer Advocate @AWS working on containers and has been working in IT for the past 20 years and with a stronger focus on cloud and automation for the past 7.\r\n\r\nHe has extensive experience with AWS Cloud technologies, DevOps and Agile practices and implementations, containers, Kubernetes, virtualization, and modern applications.\r\n\r\nHe is constantly trying to bridge the gap between Developers and Operators to allow all of us to provide a better service for our customers (and not wake up from pages in the middle of the night). 
He is an avid practitioner of dissolving silos - educating Ops how to code and explaining to Devs what the hell is Operations.\r\n\r\nAutomation is the way things should be done - and he is constantly looking for ways to make life easier wherever he can.", "public_name": "Maish Saidel-Keesing", "guid": "9fb4d797-4cfa-5cbf-b0bb-3542bafe316e", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/YUKTS7/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/9YHQK8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/9YHQK8/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/9YHQK8/resources/Maish_Saidel-Keesing_-_Creating_chaos_jOGYFzb.pdf", "type": "related"}]}]}}, {"index": 3, "date": "2023-06-20", "day_start": "2023-06-20T04:00:00+02:00", "day_end": "2023-06-21T03:59:00+02:00", "rooms": {"Kesselhaus": [{"guid": "50d50f17-bb11-5b11-8009-472c460532b0", "code": "NNNZ8W", "id": 28105, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/NNNZ8W/Schindler-Uwe_Gzxeoma.png", "date": "2023-06-20T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-28105-what-s-coming-next-with-apache-lucene", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/NNNZ8W/", "title": "What's coming next with Apache Lucene?", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "This talk will discuss the ways Apache Lucene might go in the next years. From the perspective of a full-text search engine, it looks like it is feature-complete. So what comes next?", "description": "Around Lucene 8 most people thought \"There's not much that can be done anymore\". 
In contrast to that, if you look into Apache Lucene's list of new features after each release, you will see mostly 2 new areas of improvements: Vector search and performance improvements.\r\nIs this the end of development? For sure: No! This talk will check how ongoing optimizations in the Java ecosystem might be implemented in Apache Lucene. As example, this will present the new vector incubation module in recent JDKs and how it helps to make indexing and searching much faster starting with Java 20.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HRJC87", "name": "Uwe Schindler", "avatar": "https://program.berlinbuzzwords.de/media/avatars/HRJC87_iOiv7er.webp", "biography": "Uwe is committer and PMC member of Apache Lucene and Apache Solr. His main focus is on development of Lucene Core. He implemented fast numerical search and is maintaining the new attribute-based text analysis API. He studied Physics at the University of Erlangen-Nuremberg and works as managing director for SD DataSolutions GmbH in Bremen, Germany, a company that provides consulting and support for Apache Lucene, Elasticsearch, and Apache Solr. He also works for \u201cPANGAEA \u2013 Publishing Network for Geoscientific & Environmental Data\u201d where he implemented the portal's geo-spatial retrieval functions with Lucene Java. 
Uwe had talks about Lucene at various international conferences like the previous Berlin Buzzwords, ApacheCon EU/US, Lucene Revolution, Lucene Eurocon, and various local meetups.", "public_name": "Uwe Schindler", "guid": "66a8fa69-bb21-5e95-bc1f-c8a092640daf", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/HRJC87/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/NNNZ8W/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/NNNZ8W/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/NNNZ8W/resources/WhatsComingNextWithApacheLucene2023_a2J1CDa.pdf", "type": "related"}]}, {"guid": "b1c4a591-f0fb-5cd4-b518-72565a01dfe9", "code": "YLCZP8", "id": 27962, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/YLCZP8/Ginstrom-Ryan_Narboneta-Zosa-Teo_4htELvp.png", "date": "2023-06-20T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-27962-building-mlops-infrastructure-at-japan-s-largest-c2c-e-commerce-site", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YLCZP8/", "title": "Building MLOps Infrastructure at Japan's Largest C2C E-Commerce Site", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "The MLOps infrastructure we built to support ML in search at Mercari, Japan\u2019s largest C2C e-commerce platform.", "description": "We describe the system we built to support ML in search at Mercari, Japan\u2019s largest C2C e-commerce platform. We start by describing the journey to enable the use of ML in a \u201ctraditional\u201d term-based search infrastructure with high throughput and strict latency requirements. 
We also discuss the mixed blessing of rushing a successful proof of concept into production and the technical challenges this posed on the infrastructure side.\r\n\r\nNext, we discuss the nuts and bolts of data engineering, ETLs, training pipelines, and serving/monitoring our ML model in production. We also discuss some of the weaknesses of our initial homegrown system, including A/B testing and model monitoring. Finally, we discuss our efforts to evolve our homegrown system into a more modern MLOps infrastructure using an A/B testing framework and Seldon for traffic routing and model serving.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WBCXSF", "name": "Ryan Ginstrom", "avatar": "https://program.berlinbuzzwords.de/media/avatars/WBCXSF_AZd1THN.webp", "biography": "I am a machine learning engineer at Mercari. I live and work in Japan. My professional interest these days is using machine learning in production at scale, and the special challenges this poses.", "public_name": "Ryan Ginstrom", "guid": "0b23dd48-c772-5c24-b388-a29e973df60e", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/WBCXSF/"}, {"code": "WS9ADJ", "name": "Teo Narboneta Zosa", "avatar": "https://program.berlinbuzzwords.de/media/avatars/WS9ADJ_nCDvgRQ.webp", "biography": "Teo is a machine learning engineer in the AI & Search division of Mercari, Japan\u2019s largest C2C marketplace. 
He is currently working across various business-critical projects and helping establish foundational MLOps processes and best practices across the org.", "public_name": "Teo Narboneta Zosa", "guid": "8eaf8568-7bb3-5b8c-9d75-4d8bdeabb10d", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/WS9ADJ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YLCZP8/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YLCZP8/", "attachments": []}, {"guid": "16cf1689-ebfa-502d-805d-0ba0cb5fdcfa", "code": "TVXR9Q", "id": 28088, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/TVXR9Q/Benton-William_JsoGt5G.png", "date": "2023-06-20T11:00:00+02:00", "start": "11:00", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-28088-synthetic-data-when-why-and-how", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TVXR9Q/", "title": "Synthetic data:  when, why, and how", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk will cover several use cases in which generating synthetic data is useful (or even essential) and introduce a toolbox of practical techniques for synthesizing data in these situations.", "description": "Data is essential to today's most interesting applications and systems, which learn from data, act autonomously in response to data, and make data digestible via search. Somewhat counterintuitively, as the importance of real data has increased, the importance of synthetic data has increased as well. In this talk, you'll learn when it's appropriate to use synthetic data (and when it isn't likely to help).  You'll also learn about several circumstances in which synthetic data is especially useful, including dealing with personally-identifying information, load testing, and simulating system response to unlikely scenarios.  
The talk will conclude by providing brief, actionable introductions to several practical approaches to generating synthetic tabular data, each of which is appropriate for particular kinds of synthetic data use cases:  we'll cover a simple way to simulate data-generating processes from first principles, basic and more sophisticated statistical techniques, and approaches based on machine learning models.  You'll leave with a better understanding of the role of synthetic data in today's systems and a concrete toolbox of ways to exploit it in your own programs.", "recording_license": "", "do_not_record": false, "persons": [{"code": "V7EF3R", "name": "William Benton", "avatar": "https://program.berlinbuzzwords.de/media/avatars/V7EF3R_GZC4FA0.webp", "biography": "William Benton is passionate about making it easier for machine learning practitioners to benefit from advanced infrastructure and making it possible for organizations to manage machine learning systems. His recent roles have included defining product strategy and professional services offerings related to data science and machine learning, leading teams of data scientists and engineers, and contributing to many open source communities related to data, ML, and distributed systems. Will was an early advocate of building machine learning systems on Kubernetes and developed and popularized the \u201cintelligent applications\u201d idiom for machine learning systems in the cloud. 
He has also conducted research and development related to static program analysis, language runtimes, cluster configuration management, and music technology.", "public_name": "William Benton", "guid": "d0cb7874-0585-5c70-aab5-d818c70cc5ed", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/V7EF3R/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TVXR9Q/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TVXR9Q/", "attachments": []}, {"guid": "9f412721-a447-5047-b5d6-6ab723309191", "code": "ASWYUN", "id": 28003, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/ASWYUN/Hutchinson-Chris_MZrHbaK.png", "date": "2023-06-20T11:50:00+02:00", "start": "11:50", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-28003-search-saves-lives-solving-healthcare-problems-with-search", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ASWYUN/", "title": "Search saves lives: solving healthcare problems with search", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "During covid the pressure was on for search. I\u2019ll discuss the challenges of building a search engine matching people to covid test facilities and how the lessons learned can solve healthcare issues.", "description": "At its peak, the UK\u2019s National Health Service (NHS) was covid testing almost half a million people per week. When demand for these appointments began to outstrip supply, search result relevance suffered. In some extreme cases, people were recommended to cross a body of water to get a test. This was a risk to public health as the NHS wanted to avoid anyone that had covid using public transport.  \r\n\r\nTo solve this the NHS needed to switch the way they filtered search results. Instead of using straight line (euclidean) distance, they wanted to filter results based on travel times. 
They also needed a way to tailor results based on whether the searcher had access to a car and ensure public transport was avoided. \r\n\r\nThere were many technical challenges to delivering this kind of search. \r\n- High user demand - needed to be able to handle 100,000 users searching concurrently\r\n- Response times - deliver test centre locations in under 50 milliseconds\r\n- User data privacy - ensuring no customer data will ever be at risk\r\n- Security - ensuring no tampering with data\r\n\r\nThere was no room for months-long stress tests. It needed to deliver on performance instantly. In my presentation I\u2019ll walk through how we built this search under a super tight deadline. \r\n\r\nI\u2019ll also walk through many other applications of search in healthcare. Including: \r\n- Managing Europe\u2019s nursing shortages\r\n- Improving the efficiency of emergency services \r\n- Matching mobile doctors to patients", "recording_license": "", "do_not_record": false, "persons": [{"code": "VGHTHS", "name": "Chris Hutchinson", "avatar": "https://program.berlinbuzzwords.de/media/avatars/VGHTHS_K63Q3wt.webp", "biography": "Chris Hutchinson is Chief Customer Officer at TravelTime, a UK-based company that builds high performance mobility APIs that enable users to search location data using time instead of distance. 
Chris is responsible for ensuring that users get maximum value from the API at all stages of the customer journey, from testing to integration to production use.", "public_name": "Chris Hutchinson", "guid": "6b695d5b-5b22-5bc1-836e-a4909d29d3e6", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/VGHTHS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ASWYUN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ASWYUN/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/ASWYUN/resources/Chris_Hutchinson_-_Search_saves_lives_mF8UArv.pdf", "type": "related"}]}, {"guid": "a426f2e9-a299-5acb-9d08-94fae1f61ed8", "code": "W7YXCM", "id": 27972, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/W7YXCM/Lukawski-Kacper_ocVG9uU.png", "date": "2023-06-20T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-27972-chatgpt-is-lying-how-can-we-fix-it", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/W7YXCM/", "title": "ChatGPT is lying, how can we fix it?", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Large Language Models are great in grammar but tend to confabulate. Building a reliable knowledge base might be a way to solve it. Here is how.", "description": "ChatGPT was a revolution nobody was ready for. All the social channels have been flooded with prompts and answers which look ok at first glance but turn out to be counterfeit. Factuality is the biggest concern about Large Language Models, not only the OpenAI product. If you build an app with LLMs, you need to be aware of this.\r\n\r\nRetrieval Augmented Language Models seem to be the solution to overcome that issue. They combine LLMs' language capabilities and the knowledge base's accuracy. 
The talk will review possible ways to implement it with humans in the loop.", "recording_license": "", "do_not_record": false, "persons": [{"code": "H3RSTE", "name": "Kacper \u0141ukawski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/H3RSTE_s2ucWWb.webp", "biography": "Kacper \u0141ukawski is a Developer Advocate at Qdrant - an open-source neural search engine. Recently he\u2019s been exploring the world of similarity learning and vector search.", "public_name": "Kacper \u0141ukawski", "guid": "235d4c1b-f02c-53a2-9a37-d126b9976e0e", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/H3RSTE/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/W7YXCM/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/W7YXCM/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/W7YXCM/resources/Kacper_Lukawski_-_ChatGPT_is_lying_ho_DgMwFKH.pdf", "type": "related"}]}, {"guid": "e3e2bd13-7627-5177-bb2c-540886ee0739", "code": "JYNNE9", "id": 28002, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/JYNNE9/Gupta-Anshum_gtsjLct.png", "date": "2023-06-20T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-28002-cross-data-center-replication-in-solr-a-new-approach", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JYNNE9/", "title": "Cross Data Center Replication in Solr - A new approach", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Learn about the motivation that led to the development of the new Cross Data Center (XDC) Replication module in Apache Solr and discover the capabilities it offers making it disaster ready.", "description": "Apache Solr is a critical piece of infrastructure for most companies dealing with data. 
The systems that Solr powers are critical, requiring high availability, low latency, and disaster recovery.\r\n\r\nThis talk introduces a new approach to cross data-center replication in Solr that allows for the feature to scale and ensure disaster readiness as well as lower latency at a scale that Solr is expected to support. \r\n\r\nThe audience will be provided a design overview including the challenges and approaches we tried. We will also introduce the current capabilities and our plan for this newly added Solr XDC module.\r\n\r\nAt the end of this talk, attendees would have a better understanding of how and when to use the new module to ensure disaster readiness for the Solr cluster as well as the avenues for them to participate in enhancing the solution.", "recording_license": "", "do_not_record": false, "persons": [{"code": "88SDJF", "name": "Anshum Gupta", "avatar": "https://program.berlinbuzzwords.de/media/avatars/88SDJF_qK31J86.webp", "biography": "Anshum is an Apache Lucene and Solr committer and Project Management Committee member. He started dabbling with Lucene over 15 years ago, and since then has worked at various organizations building both internal\u00a0and consumer facing search platforms on top of Lucene\u00a0and Solr. 
He is currently a part of the ACS Open Source Solr team helping groups across Apple with their search infrastructure.", "public_name": "Anshum Gupta", "guid": "66f6d8e3-8cd2-57a9-9a5d-c2a937360ed2", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/88SDJF/"}, {"code": "BWR9XK", "name": "Mark Miller", "avatar": null, "biography": null, "public_name": "Mark Miller", "guid": "9323112c-c8c0-54d3-abce-ddf032eea2ec", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/BWR9XK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JYNNE9/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/JYNNE9/", "attachments": []}, {"guid": "5ee20920-0cb7-5b7b-a2fd-5c6b33ef320d", "code": "NPKZHP", "id": 26157, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/NPKZHP/Leszczynski-Pawel_Obuchowski-Maciei_YpTxHsG.png", "date": "2023-06-20T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-26157-column-level-lineage-is-coming-to-the-rescue", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/NPKZHP/", "title": "Column-level lineage is coming to the rescue", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "How are the columns containing sensitive data used across the data ecosystem?  What input columns were used to produce a given report field? OpenLineage can answer those questions automatically.", "description": "OpenLineage is a standard for metadata and lineage collection that is growing rapidly. Column-level lineage is one of its most anticipated features of the community that has been developed recently. 
In this talk, we:\r\n * show foundations for column lineage within OpenLineage standard,\r\n * provide real-life demo on how it is automatically extracted from Spark jobs,\r\n * describe and demo column lineage extraction from SQL queries,\r\n * show how the lineage can be consumed on Marquez backend. \r\n\r\nWe aim to provide demos to focus on practical aspects of the column-level lineage which are interesting to data practitioners all over the world.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JTSW7C", "name": "Pawe\u0142 Leszczy\u0144ski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JTSW7C_84yUbIt.webp", "biography": "Pawel (@pawel-big-lebowski on github) is OpenLineage contributor. As a data practitioner with decade long experience, he focuses on converting data processing logs and metrics into meaningful observability insights.", "public_name": "Pawe\u0142 Leszczy\u0144ski", "guid": "f6dc631a-8732-5f3a-b532-c3cdd82a3e13", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/JTSW7C/"}, {"code": "3M39HP", "name": "Maciej Obuchowski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3M39HP_MBurLqU.webp", "biography": "Maciej is a software engineer at GetInData and OpenLineage committer. 
He loves contributing to open source projects and playing with cats.", "public_name": "Maciej Obuchowski", "guid": "5501a6b8-7589-5aaa-8eed-bea45c6a538a", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/3M39HP/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/NPKZHP/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/NPKZHP/", "attachments": []}, {"guid": "0ea9e5f3-1b29-5342-a891-49b3def101ea", "code": "BWNJZN", "id": 28111, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/BWNJZN/Metzger-Robert_Z9djZCG.png", "date": "2023-06-20T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-28111-tiny-flink-minimizing-the-memory-footprint-of-apache-flink", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/BWNJZN/", "title": "Tiny Flink \u2014 Minimizing the memory footprint of Apache Flink", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "We will explore options to run Apache Flink with a very low resource footprint, allowing users to run full streaming SQL queries or custom streaming applications on JVMs with less than 500mb", "description": "Apache Flink has been designed for, and is mostly used with large-scale real-time data processing use-cases. Companies report about TBs of data being processed per second, or TBs of state in huge clusters.\r\n\r\nBut what if you need to process low-throughput streams? Running a full, distributed Flink cluster might be an overkill, as there\u2019s quite a bit of overhead for distributed coordination.\r\n\r\nIn this talk, we\u2019ll explore options to reduce your resource footprint. We\u2019ll dive deeper into Flink\u2019s MiniCluster, allowing you to run Flink in-JVM for integration tests, as a micro service or just a small processing your data in Kubernetes. 
We will also discuss lessons learned from running MiniCluster in production for a service offering Flink SQL in the cloud.\r\n\r\nAttend this talk if you want to learn about Apache Flink and its various options to deploy and configure it.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UCTFVQ", "name": "Robert Metzger", "avatar": "https://program.berlinbuzzwords.de/media/avatars/UCTFVQ_1acFHMK.webp", "biography": "Robert Metzger is a committer and PMC member at Apache Flink and a Staff Engineer at decodable. He previously co-founded and successfully exited data Artisans (now Ververica), the company originally creating and commercializing Flink. He is a frequent speaker at conferences such as the QCon, ApacheCon and meetups around the world.", "public_name": "Robert Metzger", "guid": "be70a582-0624-5b04-a7d2-61496b0c4034", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/UCTFVQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/BWNJZN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/BWNJZN/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/BWNJZN/resources/Robert_Metzger_-_Tiny_Flink_VThaNrK.pdf", "type": "related"}]}, {"guid": "5a63b0a9-46a8-545b-9598-5f0c3d0e5bbb", "code": "YCHTJK", "id": 31135, "logo": null, "date": "2023-06-20T17:15:00+02:00", "start": "17:15", "duration": "00:15", "room": "Kesselhaus", "slug": "berlin-buzzwords-2023-31135-closing-session", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YCHTJK/", "title": "Closing Session", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "", "description": "", "recording_license": "", "do_not_record": false, "persons": [], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YCHTJK/feedback/", "origin_url": 
"https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/YCHTJK/", "attachments": []}], "Maschinenhaus": [{"guid": "154c79a1-6aa6-517b-a912-959b772e538c", "code": "WS7LUL", "id": 28114, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/WS7LUL/Watson-Sophie_k2Qy4yU.png", "date": "2023-06-20T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-28114-avoiding-anti-patterns-in-technical-communication", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WS7LUL/", "title": "Avoiding Anti-patterns in Technical Communication", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Communicating technical knowledge effectively is a core skill for practitioners, but one which is often neglected. We\u2019ll give practical advice on how to (and not to!) communicate technical ideas.", "description": "Practitioners and researchers alike share technical knowledge across a wide range of mediums; from blogs and conference talks, to internal presentations and slack messages. However, communicating technical information effectively is not an easy skill to learn, and every day we are bombarded with poorly communicated content. In this talk we\u2019ll cover some common, but rarely recognised, anti-patterns in technical communication. We\u2019ll dive into why they are an ineffective way to get a point across, and discuss how to avoid them in your content. 
You will leave the talk with a clear understanding of how to improve your technical communication, making your blogs, talks and day-to-day discussions more effective and impactful.", "recording_license": "", "do_not_record": false, "persons": [{"code": "YYBKHB", "name": "Sophie Watson", "avatar": "https://program.berlinbuzzwords.de/media/avatars/YYBKHB_rELxtC6.webp", "biography": "Sophie is a data scientist at Nvidia where she focuses on tools and techniques for accelerating data science and machine learning workflows and workloads. She has previously worked to help customers build machine learning systems in the hybrid cloud. She\u2019s a frequent public speaker on topics including machine learning workflows on Kubernetes, recommendation engines, and MLOps. Sophie earned her PhD in Bayesian statistics.", "public_name": "Sophie Watson", "guid": "5f18efb2-db2d-5bfb-ada1-6313de792399", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/YYBKHB/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WS7LUL/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WS7LUL/", "attachments": []}, {"guid": "76e51085-5c5f-55bc-92a3-56e10b09f783", "code": "Q9Y9Y3", "id": 27982, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/Q9Y9Y3/Pietsch-Malte_1UVerm3.png", "date": "2023-06-20T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27982-connect-gpt-with-your-data-retrieval-augmented-generation", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/Q9Y9Y3/", "title": "Connect GPT with your data: Retrieval-augmented Generation", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Learn how to build with LLMs, like ChatGPT, and avoid typical pitfalls like hallucination and outdated information. 
Accompanied by practical code examples using the open source framework Haystack.", "description": "Large Language Models (LLMs), like ChatGPT, became the poster child of AI overnight. They changed how people search the web, how they write content, and how they code. These models have billions of parameters they can use to effectively store some of the information they saw during pre-training. This enables them to show deep knowledge of a subject, even if they weren't explicitly trained on it.\r\n\r\nYet, it\u2019s not straightforward to use LLMs in enterprise use cases and embed them successfully in your product.\r\n\r\nThe most common challenges with LLMs are\r\n1) They don't know anything about YOUR data\r\n2) Their knowledge is not up-to-date\r\n3) They hallucinate - it's hard to understand on what sources they based their answers\r\n4) It\u2019s hard to assess their performance\r\n\r\nIn this talk, you will learn how to deal with all of the above challenges. We will demonstrate how to connect LLMs to your data and how to keep them up-to-date using retrieval-augmented generation. We will show how to design prompts that minimize hallucination and how to evaluate the performance of your NLP application by collecting end-user feedback. We share best practices of development workflows and typical traps along the way. \r\n\r\nEach step will be accompanied by practical code examples using the open source framework Haystack. 
By the end of the talk, you will not only know the methods to overcome the above challenges but also have code examples at hand that let you kickstart the development of your own NLP features.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KX8VYD", "name": "Malte Pietsch", "avatar": "https://program.berlinbuzzwords.de/media/avatars/KX8VYD_w0CA1c4.webp", "biography": "Malte is Co-Founder & CTO at deepset, where he builds Haystack - an open source framework that lets you quickly build production-ready NLP services for semantic search, question answering & more. He holds a M.Sc. with honors from TU Munich and conducted research at Carnegie Mellon University. Before founding deepset he worked as a data scientist for multiple startups. He is an open-source lover, likes reading papers before breakfast, and is obsessed with automating the boring parts of our work.", "public_name": "Malte Pietsch", "guid": "e3763cac-ca3f-5e9a-b4c8-9af4f3d97b5a", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/KX8VYD/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/Q9Y9Y3/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/Q9Y9Y3/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/Q9Y9Y3/resources/Malte_Pietsch_-_Connect_GPT_with_your_0PadQlq.pdf", "type": "related"}]}, {"guid": "a19121ad-c62c-50c2-b949-e53f76ee1e00", "code": "HG9XEL", "id": 27885, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/HG9XEL/Gerlowski-Jason_hPuunjX.png", "date": "2023-06-20T11:00:00+02:00", "start": "11:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27885-a-fresh-start-the-path-toward-apache-solr-s-v2-api", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/HG9XEL/", "title": "A Fresh Start? 
The Path Toward Apache Solr's v2 API", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Modernization efforts face particular hurdles in large, established OSS projects.  Come learn about the community and technical challenges encountered on Apache Solr's path towards revamped HTTP APIs.", "description": "Affecting broad changes in large, established open source projects is hard.  Their larger codebases make for more places to update.  Their age makes for more technical debt to overcome.  And their larger user-bases make for more stakeholder opinions to weigh and balance.  This talk will explore some of these ideas through the lens of Apache Solr's ongoing attempt to modernize its HTTP APIs and associated clients.  Some attention will be given to the state of Solr's APIs, but the primary focus will be the technical and community challenges encountered by the Solr community on the path towards its \"v2\" API.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8RMBAS", "name": "Jason Gerlowski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8RMBAS_pAhJVSw.webp", "biography": "A software engineer with 10+ years working on search, located on the East Coast in the U.S.  I'm a longtime committer and PMC member on the Apache Lucene and Solr projects.  
Outside of tech, I enjoy reading and spending time outdoors with my family.", "public_name": "Jason Gerlowski", "guid": "1bd8790f-b6c9-5232-8894-a42a279a091b", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/8RMBAS/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/HG9XEL/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/HG9XEL/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/HG9XEL/resources/Jason_Gerlowski_-_A_fresh_start_Gt4igxh.pdf", "type": "related"}]}, {"guid": "132f3275-7993-5e2d-823f-0a4b3c9a0d75", "code": "ELVNYV", "id": 27890, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/ELVNYV/Putman-Houston_TGuAFN3.png", "date": "2023-06-20T11:50:00+02:00", "start": "11:50", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27890-rethinking-autoscaling-for-apache-solr-using-kubernetes", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ELVNYV/", "title": "Rethinking Autoscaling for Apache Solr using Kubernetes", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Apache Solr\u2019s built-in autoscaling is gone, but the need for autoscaling persists. Using Kubernetes\u2019 HPA, the Solr Operator and new Solr APIs, we re-introduce autoscaling for Solr on Kubernetes.", "description": "SolrCloud clusters are often large and complex, with each organization using its own code to deploy and maintain these clusters. The Solr Operator was a first step in consolidating complexity through official deployment tooling to run Solr on Kubernetes. However the Operator does not address scaling up and down based on demand. \r\n\r\nMuch like it provides generic deployments, Kubernetes provides generic ways of autoscaling applications, such as the HorizontalPodAutoscaler (HPA). 
This works especially well for stateless applications, much like deployments do. Solr is a stateful application that has specific state assigned to each pod (Solr node), therefore autoscaling SolrClouds with the HPA will not work by default.\r\n\r\nThe Solr Operator has already been built to extend Kubernetes\u2019 StatefulSets, Services and Ingresses to support Solr\u2019s unique use-case. Therefore it is the perfect mechanism to also bridge the gap between the HorizontalPodAutoscaler and Solr.\r\n\r\nThrough extending the functionality of the Solr Operator, and adding new APIs to Solr, we will show how autoscaling can be re-introduced to the Solr ecosystem.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BPP8YT", "name": "Houston Putman", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BPP8YT_K1QYXCF.webp", "biography": "Houston is a Lucene/Solr PMC member and committer. He works at Apple on the Open Source Technologies team, developing Solr and creating a better ecosystem for it in the cloud. Previously Houston worked at Bloomberg, as a member of the Search Infrastructure team. 
He has degrees in Computer Science & Mathematics from The University of Texas at Austin.", "public_name": "Houston Putman", "guid": "50cf9f7b-8b50-55e5-894b-bb33bb8a9b18", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/BPP8YT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ELVNYV/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/ELVNYV/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/ELVNYV/resources/Houston_Putman_-_Solr_Autoscaling_on__hz6jpE7.pdf", "type": "related"}]}, {"guid": "14d54ed1-a01e-595b-a25a-c04ba397e117", "code": "TMGNMF", "id": 25403, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/TMGNMF/Bashan-Gal_NQ9p40m.png", "date": "2023-06-20T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-25403-platform-engineering-is-all-about-product", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TMGNMF/", "title": "Platform Engineering is All About Product", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "\u201cPlatform Engineering,\u201d the latest buzzword, means building an internal platform to improve your SDLC in a way your developers will want to use. Can this be done with engineering skills alone?", "description": "\"Platform Engineering\" is the latest buzzword in modern software engineering. It is the discipline of designing and building toolchains and workflows that enable self-service capabilities for software engineering organizations in the cloud-native era. Today's holy grail for platform engineering is to achieve the most effective \"Internal Developer Platform\" (IDP) that enables the rest of the developers in the company to be as effective as possible. 
Can this job be accomplished with engineering skills alone?\r\n\r\nPlatform intersects with product in two ways: first, the platform must be optimized for supporting the development of the company-specific product. Second, the platform must be built with a product mindset and practices for its users - the developers- to adopt it. In this session, we will discuss how to build an engineering platform your engineers want to use. We will go over standard product practices to use when creating the developer platform and the importance of making sure your IDP helps developers build the company's products faster and better. We will define the role of the platform product manager (PPM) and his importance in ensuring our platform is not a glorified Rube Goldberg machine.\r\n\r\nIn this session, you will learn:\r\n\r\n- What is platform engineering? Is it just a new name for DevOps?\r\n- What makes an IDP and a platform team successful?\r\n- Who is the PPM? Why is he important? How do I convince my head of product we need one?\r\n- Practices you can use to build a successful platform and pitfalls to avoid.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8LZYBD", "name": "Gal Bashan", "avatar": "https://program.berlinbuzzwords.de/media/avatars/8LZYBD_5N4zjpS.webp", "biography": "Gal is the Director of Engineering at Epsagon, recently acquired by Cisco, working in the observability space with a focus on distributed tracing. Gal has a cyber-security background and experience in reverse engineering and network analysis. 
Gal was part of an elite army intelligence unit before joining Epsagon.", "public_name": "Gal Bashan", "guid": "2575519d-7b91-5917-bccf-0a3c457a4753", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/8LZYBD/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TMGNMF/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/TMGNMF/", "attachments": []}, {"guid": "55292975-6ccc-53d2-a151-0201914cc915", "code": "Q3MFKD", "id": 26093, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/Q3MFKD/Loughran-Steve-2_eJYmhSG.png", "date": "2023-06-20T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-26093-hadoop-vectored-io-your-data-just-got-faster", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/Q3MFKD/", "title": "Hadoop Vectored IO: your data just got faster!", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "We are introducing a new Hadoop Filesystem API called \"vectored read\" using which we can achieve significant speedups for all big data applications, especially in cloud storage like S3 and ABFS.", "description": "Since 2006 the world of big data has moved from terabytes to hundreds of petabytes, from local clusters to remote cloud storage, yet the original Apache Hadoop posix-based file APIs have barely changed.\r\n\r\nIt is wonderful that these APIs have worked so well, but we can do a lot better with remote object stores, by providing new operations which suit them better, targeted at columnar data libraries such as ORC and Parquet. Only a few libraries need to migrate to these APIs for significant speedups of all big data applications.\r\n\r\nThis talk introduces a new Hadoop Filesystem API called \"vectored read\", coming in Hadoop 3.4. 
An extension of the classic FSDataInputStream it is automatically offered by all filesystem clients.\r\nThe S3A connector is the first object store to provide a custom implementation, reading different blocks of data in parallel. In Apache Hive benchmarks with a modified ORC library, we saw a 2x speedup compared to using the classic s3a connector through the Posix APIs.\r\n\r\nWe will introduce the API spec, the S3A implementation, and the benchmarks, and show how to use it in your own applications. We will also cover our ongoing work on providing similar speedups with other object stores, and the use of the API in other applications.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BWSF8M", "name": "Steve Loughran", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BWSF8M_1t6BEuD.webp", "biography": "Steve Loughran is a developer at Cloudera where he focuses on Hadoop and Cloud Integration. Prior to joining Cloudera he was a research scientist at HP Laboratories, where he was involved in the early Ubiquitous Computing/Wearable Computing work. This is why the failure of the smart home is such a disappointment. 
For fun he falls off bicycles -which is why he spent December 2021 shouting at lightbulbs while waiting for his broken collarbone to heal.", "public_name": "Steve Loughran", "guid": "534bbca3-f925-5026-a60a-c11e82e65b53", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/BWSF8M/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/Q3MFKD/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/Q3MFKD/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/Q3MFKD/resources/Steve-Loughran-Hadoop_Vectored_IO_Mq2kLRW.pdf", "type": "related"}]}, {"guid": "0f7a37de-1013-5d0c-8af7-7ff075445b70", "code": "CWV3T3", "id": 27970, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/CWV3T3/jakubowski-julien_YBHxfGV.png", "date": "2023-06-20T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-27970-scalable-distributed-messaging-streaming-with-apache-pulsar", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/CWV3T3/", "title": "Scalable distributed messaging&streaming with Apache Pulsar", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this session, you'll discover seven Apache Pulsar features that enable you to build amazing event-driven applications and how Apache Pulsar differs from traditional message brokers.", "description": "Today, when you think about building event-driven and real-time applications, the words that come to you spontaneously are probably: RabbitMQ, ActiveMQ, or Kafka. These are the solutions that dominate this landscape. But have you ever heard of Apache Pulsar?\r\n\r\nAfter a brief presentation of the fundamental concepts of messaging, you'll discover the Pulsar features that enable you to build amazing event-driven applications. 
\r\nYou'll learn the following:\r\n- how Apache Pulsar architecture differs from other brokers\r\n- how it enables scaling processing power & data independently, quickly, and with no hassle\r\n- how it guarantees high durability of messages\r\n- how it can be relevant as a unified streaming & messaging platform\r\n- how to integrate Pulsar with your existing application portfolio that is compatible with Kafka or RabbitMQ\r\n- some insight into the open-source community around Pulsar", "recording_license": "", "do_not_record": false, "persons": [{"code": "7GSUZK", "name": "Julien Jakubowski", "avatar": "https://program.berlinbuzzwords.de/media/avatars/7GSUZK_q0XxTgJ.webp", "biography": "Julien Jakubowski is a Developer Advocate at StreamNative with over 20+ years of experience as a developer, staff engineer, and consultant. He has built several complex systems with distributed, scalable, and event-driven architecture for various industrial sectors such as retail, finance, and manufacturing.\r\n\r\nJulien delivers talks at conferences on software engineering: Devoxx, Java User Groups, and Google Developer Groups, among others.\r\n\r\nJulien is also one of the founders and leaders of the Ch'ti JUG - Java User Group of Lille, France.", "public_name": "Julien Jakubowski", "guid": "052b411c-7574-5406-a7e2-5bd68abc77ad", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/7GSUZK/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/CWV3T3/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/CWV3T3/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/CWV3T3/resources/Julien_Jakubowski_-_Scalable_Distribu_vS43NJZ.pdf", "type": "related"}]}, {"guid": "b42653c2-f1ad-5f31-abb1-eea3a591faab", "code": "HAKWWW", "id": 28165, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/HAKWWW/Krenn-Phillip_VD4J2lh.png", 
"date": "2023-06-20T16:50:00+02:00", "start": "16:50", "duration": "00:20", "room": "Maschinenhaus", "slug": "berlin-buzzwords-2023-28165-catch-the-fraud-with-observability-and-analytics", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/HAKWWW/", "title": "Catch the fraud \u2014 with observability and analytics", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "This is the story of how to catch cheaters by combining observability and analytics data through the power of search.", "description": "Elastic \u2014 the company behind Elasticsearch, Kibana,... \u2014 is running an annual competition to reward contributions like pull requests, blog posts, talks, etc. Once we started giving away MacBooks, we got a massive influx of fraud. This talk tells the tongue-in-cheek story of how people cheated and also how we caught them:\r\n* Observability: Find the bots and trace everyone's actions to figure out what is a coincidence and what is not.\r\n* Analytics: See how people are trying to exploit the system through fake accounts, shady content, or bending the rules.\r\n\r\nWhile we initially hadn't planned for this scenario, having the power of search available across observability and analytics data let us do many interesting correlations to get a complete picture of the monster we had created.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EFGBTQ", "name": "Philipp Krenn", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EFGBTQ_LofyOOy.webp", "biography": "Philipp lives to demo interesting technology. Having worked as a web, infrastructure, and database engineer for over ten years, Philipp is now a developer advocate and EMEA team lead at Elastic \u2014 the company behind the Elastic Stack consisting of Elasticsearch, Kibana, Beats, and Logstash. 
Based in Vienna, Austria, he is constantly traveling Europe and beyond to speak and discuss open source software, search, databases, infrastructure, and security.", "public_name": "Philipp Krenn", "guid": "72a7db76-e027-575d-8a15-bdc76aa2f301", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/EFGBTQ/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/HAKWWW/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/HAKWWW/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/HAKWWW/resources/Philipp_Krenn_-_CatchTheFraud_9D1naqw.pdf", "type": "related"}]}], "Palais Atelier": [{"guid": "1835b72b-a0bb-5228-88f6-afcfda0fcd82", "code": "MDFQCE", "id": 27857, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/MDFQCE/Selakovic-Marija_ya3li6V.png", "date": "2023-06-20T09:30:00+02:00", "start": "09:30", "duration": "00:20", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-27857-when-ms-matter-maximizing-query-performance-in-cratedb", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/MDFQCE/", "title": "When ms matter: Maximizing query performance in CrateDB", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "Achieving optimal execution plans in distributed databases is a challenging task. This talk will focus on CrateDB: a distributed SQL database, and key strategies for optimizing its query performance.", "description": "Distributed databases provide easy scaling, high performance, and availability crucial to handle large amounts of data. However, achieving optimal execution plans in these systems is often a challenge and requires special considerations. In this talk, we will explore the key concepts and best practices for optimizing query performance in the CrateDB database. 
CrateDB is a highly scalable and distributed SQL database that offers a unique blend of SQL and NoSQL capabilities. Although the focus of the talk is going to be on CrateDB, most of the techniques we are going to discuss apply to many distributed databases.\r\n\r\nAs a first step, we will go through query planning to better understand potential bottlenecks. Then, we will discuss the practical implications of indexing, sharding, and partitioning strategies, and provide practical advice on how to further optimize CrateDB queries for optimal performance. All these topics will be covered by real-world examples and practical solutions to some of the most common issues. At the end of the talk, you will be equipped with practical tips and techniques for detecting performance issues and optimizing your queries. \r\n\r\nKey learnings:\r\n- Intro to CrateDB and query plans\r\n- How different sharding, partitioning, and indexing strategies affect query performance\r\n- Real-life examples and tips for debugging slow queries", "recording_license": "", "do_not_record": false, "persons": [{"code": "PUXWWV", "name": "Marija Selakovic", "avatar": "https://program.berlinbuzzwords.de/media/avatars/PUXWWV_OXG3A0W.webp", "biography": "Marija Selakovic is a developer advocate at Crate.io, working with the CrateDB database and various other data engineering tools. She holds a Ph.D. degree in computer science from TU Darmstadt and a Master's degree in software engineering from VU University Amsterdam. 
As a developer advocate, Marija builds various technical content, speaks at developer conferences, and helps other software developers be productive and successful in using CrateDB.", "public_name": "Marija Selakovic", "guid": "f4177af4-c7a8-5e91-9b6a-19cc16ef011c", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/PUXWWV/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/MDFQCE/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/MDFQCE/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/MDFQCE/resources/Marija_Selakovic_-_When_miliseconds_m_mCVOMIw.pdf", "type": "related"}]}, {"guid": "aab86b1b-277d-5a81-95a2-48d26ec24083", "code": "EQMRJP", "id": 28012, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/EQMRJP/campinas-stephane_7uN3txd.png", "date": "2023-06-20T10:00:00+02:00", "start": "10:00", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-28012-deep-dive-into-an-elasticsearch-plugin-for-query-time-joins", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/EQMRJP/", "title": "Deep dive into an Elasticsearch plugin for query-time joins", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Siren Federate is an Elasticsearch plugin for joining inverted indices at query-time. Learn in this talk about its inner workings and how it complements features of Elasticsearch like runtime fields.", "description": "Data are often at the basis of critical decisions in many different sectors, ranging from e-commerce to cyber-security: machine and search logs, user information, metrics over transactions, etc. Data in such domains are by nature voluminous and inter-connected. 
Analytics systems are expected not only to search and aggregate those data, but also to join them: join operations are often necessary in order to explore inter-connected data and get insights from them. Analysts often interact with such systems by following an explorative and iterative process that represents their train of thoughts. Such systems then must have fast response times to avoid impeding the mental process of the analysts. Whilst Elasticsearch is a fantastic high performance analytics engine, it presents some limitations in certain cases when it comes to joining data from different indices at query-time.\r\n\r\nIn this talk, we will present Siren\u2019s ten years-long effort in implementing distributed joins on top of Elasticsearch. We will introduce Siren Federate \u2013 our Elasticsearch plugin that provides query-time join capabilities over indices \u2013 and we will discuss some of the challenges we had to tackle during its development. We will begin by describing how joins are performed by Federate, from the reception of a query till the computation of its results. Then we will show the importance of caching join results for performance, and how a cache can be efficiently implemented. Talking about performance, we will explain the benefits of adopting a vectorized data processing model by showing some experimental results. To conclude, we will discuss the importance of the expressiveness of a query language by illustrating the Federate DSL and how it integrates with some advanced features of Elasticsearch such as runtime fields.", "recording_license": "", "do_not_record": false, "persons": [{"code": "F8M9AF", "name": "St\u00e9phane Campinas", "avatar": "https://program.berlinbuzzwords.de/media/avatars/F8M9AF_xqgJXKl.webp", "biography": "St\u00e9phane completed his Ph.D. studies at the University of Galway (Ireland) working on an Information Retrieval engine for Linked Data and became deeply interested in that field. 
He then transitioned to working for Siren, a spin-off of that research endeavor. From that point on, he worked on Federate, an Elasticsearch plugin for computing joins between inverted indices, and was responsible for maintaining and developing various parts, e.g., from the query planner to the interactions with Lucene like with the query cache.", "public_name": "St\u00e9phane Campinas", "guid": "507a8e96-3e41-5f6f-a1ee-e0b87dd2e689", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/F8M9AF/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/EQMRJP/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/EQMRJP/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/EQMRJP/resources/Stephane_Campinas_-_Siren_Federate_wyKPlde.pdf", "type": "related"}]}, {"guid": "9e867122-e8b2-5ea5-b676-f23f44dd2c03", "code": "9SJGJ3", "id": 32403, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/9SJGJ3/Voorbach-Byron_VB0KisD.png", "date": "2023-06-20T11:00:00+02:00", "start": "11:00", "duration": "00:20", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-32403-from-keyword-to-vector", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/9SJGJ3/", "title": "From keyword to vector", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "During this talk, I will take you on my over-a-decade-long journey in search. Starting from having witnessed the inception of Elasticsearch to my current endeavors with Weaviate, I will share my first-hand experience of the evolution, challenges, and lessons learned along the way.", "description": "The future of search lies in machine learning-based approaches, a realization that has led me to the world of semantic search. 
As part of my current endeavors with Weaviate, I\u2019ve come to understand the transformative potential of this advanced technology and how it\u2019s reshaping our digital experiences.\r\n\r\nMy journey into this field began during an internship in the early stages of my programming career. A group of my then colleagues ventured out to form a new company called Elasticsearch. Recognizing their potential, my mentor recommended that I focus my personal development on search technologies. This advice sparked my exploration of Lucene, Solr, and Elasticsearch, among others.\r\n\r\nIn the subsequent years as a search consultant, I wrestled with the inherent challenges of keyword-based systems. The tasks were anything but straightforward, from managing semantics, synonyms, and typos to trying to decipher user intent. However, this endeavor was far from fruitless - it led to a deep understanding of the intricate workings of search technologies.\r\n\r\nThis talk will take you through significant advancements in search over the years, peppered with practical insights and hard-earned wisdom I\u2019ve accumulated along the way. The goal is not to argue that vector search replaces keyword search but to illustrate how combining both can yield the best results. Attendees can look forward to a nuanced understanding of search \r\ntechnologies, their evolution, and their potential to shape our future digital experiences", "recording_license": "", "do_not_record": false, "persons": [{"code": "Y7W9LX", "name": "Byron Voorbach", "avatar": "https://program.berlinbuzzwords.de/media/avatars/Y7W9LX_MMF5wlY.webp", "biography": "Byron Voorbach has spent over a decade in the search domain, providing consultation to companies and aiding in implementing large-scale search systems. 
As the current Head of Sales Engineering at Weaviate, he collaborates with customers globally to harness the power of semantic search in their operations.\r\nA regular conference speaker and active contributor to open-source projects, Byron enjoys tackling complex problems and venturing into diverse domains. His work also includes building projects demonstrating cutting-edge search technologies\u2019 potential and functionality and committing new functionality to Weaviate.", "public_name": "Byron Voorbach", "guid": "3b85a609-68e0-580a-ab67-ed9e37e303e1", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/Y7W9LX/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/9SJGJ3/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/9SJGJ3/", "attachments": []}, {"guid": "778be5a3-aa32-5d17-a1d0-fc8160064600", "code": "KWFLKN", "id": 27940, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/KWFLKN/Golubenco-Tudor_tQut9dy.png", "date": "2023-06-20T11:30:00+02:00", "start": "11:30", "duration": "00:20", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-27940-semantic-vs-keyword-search-as-context-for-gpt", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KWFLKN/", "title": "Semantic vs keyword search as context for GPT", "subtitle": "", "track": null, "type": "Short Talk", "language": "en", "abstract": "If you want to build a chat bot like ChatGPT on your own data, you need to use search to provide the context. Usually semantic search is used, but we've found that keyword search has some pros.", "description": "The OpenAI ChatGPT has taken the world by storm and people want to be able to offer the same type of chat bot experience on their own data. 
Such a bot can answer questions based on your documentation or knowledge base.\r\n\r\nThis can be done with the OpenAI API by providing the right context, extracted from your data, to the model. You can do this in two steps:\r\n\r\n* the search step: perform a search to select the documentation pages that are likely to contain the answer.\r\n* the GPT step: provide these pages as context with a prompt like \"With this context: .... answer this question: ...\".\r\n\r\nFor the search step, semantic search is often used, because it makes use of the LLM capabilities. However, we have found that in practice keyword search (e.g. BM25-based) has some advantages when it comes to tuning the search step, and it tends to be more \"explainable\".", "recording_license": "", "do_not_record": false, "persons": [{"code": "9ZQVUM", "name": "Tudor Golubenco", "avatar": "https://program.berlinbuzzwords.de/media/avatars/9ZQVUM_1pEdbm9.webp", "biography": "Tudor is CTO at Xata, a modern serverless database that provides extra data functionality like AI, search, or image transformations. 
Previously, he had worked at data companies like Elastic and Oracle.", "public_name": "Tudor Golubenco", "guid": "1fd4e85a-c178-560b-9268-173e3dcc3801", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/9ZQVUM/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KWFLKN/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KWFLKN/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/KWFLKN/resources/Tudor_Golubenco_-_Semantic_vs_keyword_XmjmVSg.pdf", "type": "related"}]}, {"guid": "2b542fdc-fe73-55a2-8f65-34df004f6d6e", "code": "WFLLCY", "id": 28171, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/WFLLCY/Loughran-Steve_Yopov2y.png", "date": "2023-06-20T12:00:00+02:00", "start": "12:00", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-28171-alexa-is-the-smart-home-vision-failing", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WFLLCY/", "title": "Alexa, is The Smart Home vision failing?", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Amazon's Alexa team has lost billions. Google and Apple's hub aren't great successes. Is the Smart home failing? How can you keep your lights on when they depend on cloud infrastructure to work?", "description": "A harsh critique of the current state of the \"Smart Home\". The vision was of a home full of smart devices -network enabled, remotely controllable and managed via local applications, phones and cloud services.\r\n\r\nWe now have three competing vendors all trying to be the one ecosystem of the Smart Home: Apple, Google and Amazon. The financial disaster which is Alexa highlights how even for them it is a way to lose money -and raises the question \"how long will Amazon keep the blue light on your Alexa on?\". 
Some of the problems are technical, but many are related to usability and integration.\r\n\r\nBased on experiences of attempting to use devices through all the ecosystems, and even writing a basic Alexa skill, this talk highlights how broken the smart home currently is. Like the need to give Alexa and Philips Hue light bulb groups different names so Alexa knows which office lights to turn on. Or the way in which cloud-hosted platforms can change their speech recognition and pattern matching algorithms without any warning or control. What longevity can we expect of the hardware? Cloud-enhanced devices may have a lower purchase price but they depend on VC cash to keep working. \r\n\r\nWhat can we do? We must embrace platforms such as Home Assistant to stay in control. Yes, you get to add debug statements to Python modules to fix plug authentication -but if we developers do this, others will benefit. We also need to look at the survival of cloud integration -a subscription model is the only one which works. Finally, there is the promise of Thread, the low power wireless mesh network, and Matter, the model and API for devices and applications -useful but insufficient.\r\n\r\nA code free talk; the audience will get the historical context of the early Ubicomp work and the experience of trying to get the modern platforms to achieve that vision from the turn of the century. While things have moved on from hacked together hardware and rigged demos -they haven't moved on far enough.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BWSF8M", "name": "Steve Loughran", "avatar": "https://program.berlinbuzzwords.de/media/avatars/BWSF8M_1t6BEuD.webp", "biography": "Steve Loughran is a developer at Cloudera where he focuses on Hadoop and Cloud Integration. Prior to joining Cloudera he was a research scientist at HP Laboratories, where he was involved in the early Ubiquitous Computing/Wearable Computing work. This is why the failure of the smart home is such a disappointment. 
For fun he falls off bicycles -which is why he spent December 2021 shouting at lightbulbs while waiting for his broken collarbone to heal.", "public_name": "Steve Loughran", "guid": "534bbca3-f925-5026-a60a-c11e82e65b53", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/BWSF8M/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WFLLCY/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WFLLCY/", "attachments": []}, {"guid": "efac0c79-0011-5ef8-9ad5-7ff98c010a22", "code": "WCPQTA", "id": 27766, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/WCPQTA/Sprenger-Stefan_PbErXUV.png", "date": "2023-06-20T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-27766-a-crash-course-in-error-handling-for-streaming-data-pipeline", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WCPQTA/", "title": "A Crash Course in Error Handling for Streaming Data Pipeline", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Learn how to handle errors in streaming data pipelines using concepts, such as dead-letter queues.", "description": "Streaming data pipelines pose unique requirements for the handling of errors and other malfunctions because they are executed continuously and cannot be manually supervised. As a consequence, we need to automate the handling of errors as much as possible.\r\nThis talk answers three critical questions in the context of data streaming: What are potential errors? How shall we handle the different kinds of errors? 
Which metrics help us to keep track of the health of streaming data pipelines?\r\nWe discuss (1) errors that happen when consuming Apache Kafka topics, e.g., when deserializing records, (2) errors that happen when producing records to Apache Kafka topics, e.g., when serializing data, (3) errors that happen when processing records, e.g., exceptions raised in data transformations, and (4) errors that are caused by external factors, e.g., when the streaming data pipeline exceeds available memory resources.\r\nOnce potential errors have been introduced, we show how to cope with them through design patterns, like dead-letter queues, or practical approaches, like log-based alerts.\r\nFinally, we discuss important metrics for monitoring the health of streaming data pipelines, e.g., consumer lags, or producing rates for dead-letter topics.\r\nWhile we use examples from Kafka Streams applications, the presented content can be easily transferred to other stream processing frameworks.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DCXVFF", "name": "Stefan Sprenger", "avatar": "https://program.berlinbuzzwords.de/media/avatars/DCXVFF_SEk0KRo.webp", "biography": "Stefan is co-founder and CEO at DataCater GmbH, the company behind the real-time ETL platform based on Apache Kafka. 
He has more than 10 years of experience in software and data engineering and researched database systems on modern hardware during his PhD studies.", "public_name": "Stefan Sprenger", "guid": "5c6e2333-8bba-5ab4-abe6-f7d300fbf7df", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/DCXVFF/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WCPQTA/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/WCPQTA/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/WCPQTA/resources/Stefan_Sprenger-A_Crash_Course_in_Err_afEYEoA.pdf", "type": "related"}]}, {"guid": "88941d1d-46da-5897-91f6-21f8cb00ce33", "code": "EAD8JD", "id": 27821, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/EAD8JD/Fiorucci-Stefan_Ao9Dv7j.png", "date": "2023-06-20T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-27821-fact-checking-rocks-how-to-build-a-fact-checking-system", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/EAD8JD/", "title": "Fact Checking Rocks: how to build a fact-checking system", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In this infodemic era, fact-checking is becoming a vital task.\r\nIn this talk, we\u2019ll discover how to build a simple fact-checking system for rock music, leveraging the power of open-source libraries.", "description": "In this infodemic era, fact-checking is becoming a vital task. 
However, it is a complex and time-consuming activity.\r\n\r\nIn this talk, we will see how to combine Information Retrieval tools with modern Language Models to simply implement a fact-checking baseline with low human effort.\r\n\r\nI will show you how to build a funny use case around rock music.\r\n\r\nThe application is based on several Python open-source libraries: Haystack, FAISS, Hugging Face Transformers, Sentence Transformers.\r\nThis step-by-step implementation will be an opportunity to learn more about Dense retrieval and Natural Language Inference models in a hands-on way. I will share some insights into developing modern Natural Language applications.\r\n\r\n**Why it's relevant:**\r\n\r\nFact-checking is significant to the society, although it is still difficult to do automatically. Using modern NLP tools can help speed up and automate part of this task.\r\n\r\n**What the audience will learn:**\r\n- Dense retrieval for semantic search\r\n- Natural Language Inference models\r\n- How to build a fact-checking system using Haystack, FAISS, Hugging Face Transformers, Sentence Transformers.\r\n- How to integrate powerful (Large) Language Models in your NLP applications, conditioning them to operate on your knowledge base\r\n- How to efficiently combine tools from Information Retrieval, NLP, and Vector Search", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZVFVZ8", "name": "Stefano Fiorucci", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ZVFVZ8_6x21TPn.webp", "biography": "Always passionate about computer science \ud83d\udcbb, Stefano approached Machine Learning after receiving an  education in engineering.\r\n\r\nHis interest in Machine Learning comes from how the field sits at the intersection of scientific research and software craftsmanship. 
Over time, Stefano gained a deep understanding of Natural Language Processing and Information Retrieval.\r\n\r\nLately, he has been fascinated by the vibrant field of neural/semantic/vector search \ud83d\udd0e, and enjoys contributing to open source projects in this field.", "public_name": "Stefano Fiorucci", "guid": "81ffb24d-2b43-5c82-8a09-59ce2a145e25", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/ZVFVZ8/"}], "links": [{"title": "Github project", "url": "https://github.com/anakin87/fact-checking-rocks", "type": "related"}, {"title": "Demo of the project", "url": "https://huggingface.co/spaces/anakin87/fact-checking-rocks", "type": "related"}], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/EAD8JD/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/EAD8JD/", "attachments": []}, {"guid": "7f0be176-3bf5-5440-b483-dc38752c01b3", "code": "FKKNBD", "id": 28144, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/FKKNBD/Grebennikov-Roman_Goloviznin-Vsevolod_P5Qvv1R.png", "date": "2023-06-20T16:00:00+02:00", "start": "16:00", "duration": "00:40", "room": "Palais Atelier", "slug": "berlin-buzzwords-2023-28144-learning-to-hybrid-search", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/FKKNBD/", "title": "Learning to hybrid search", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Combining BM25, neural embeddings and customer behavior with Learning-to-Rank into an ultimate ranking ensemble, with examples on Amazon ESCI e-commerce search dataset.", "description": "Traditional term search has good precision but lacks semantics. Modern neural search is good at semantics but can miss customer behavior. 
The learning-to-rank approach adapts to customer behavior, but only if your baseline retrieval is already good enough.\r\n\r\nThe current hype about neural search can give the impression that it's the ultimate solution for all problems of legacy term search and LTR. You only need [disclaimer: irony ahead] to do a very simple thing of fine-tuning a giant neural network to notice all the dependencies between queries, documents and customer behavior on all the data you have. But what if instead of replacing A with B, you can combine the strengths of all the approaches?\r\n\r\nIn this talk, we will take an example of an e-commerce search with Amazon's open-source ESCI/ESCI-S dataset and compare traditional text matching and Learning-to-Rank approaches with modern neural search methods on real data. We will show how combining multiple old and new approaches in a single hybrid system can deliver an even better result than each of them separately.", "recording_license": "", "do_not_record": false, "persons": [{"code": "NGCSXL", "name": "Roman Grebennikov", "avatar": "https://program.berlinbuzzwords.de/media/avatars/NGCSXL_uRCcgSO.webp", "biography": "Principal Engineer at Delivery Hero SE, working on search personalization and recommendations. A pragmatic fan of functional programming, learn-to-rank models and performance engineering.", "public_name": "Roman Grebennikov", "guid": "45e06948-5b1f-55ba-a894-653eb73a9f1e", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/NGCSXL/"}, {"code": "3ZW37D", "name": "Vsevolod Goloviznin", "avatar": "https://program.berlinbuzzwords.de/media/avatars/3ZW37D_LZAPbwo.webp", "biography": "Software engineer in the past, switched tracks to work closer with customers and product. 
Has multi-year experience of communicating with customers to understand what they really want and translating this information to engineers as a Head of Product.", "public_name": "Vsevolod Goloviznin", "guid": "f0125fba-3b1c-58b9-9d60-e9de3b261967", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/3ZW37D/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/FKKNBD/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/FKKNBD/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/FKKNBD/resources/grebennikov_learning_to_hybrid_search_cU1eJKJ.pdf", "type": "related"}]}], "Frannz Salon": [{"guid": "3bc45cc8-0f2d-53e6-a224-6db489c9d52e", "code": "FFNZSK", "id": 25783, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/FFNZSK/Vlaeva-Stanimira_BveVwPy.png", "date": "2023-06-20T09:30:00+02:00", "start": "09:30", "duration": "01:10", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-25783-advanced-search-plays-with-graphql", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/FFNZSK/", "title": "Advanced Search Plays with GraphQL", "subtitle": "", "track": null, "type": "Workshop", "language": "en", "abstract": "This demo-heavy workshop scores a hat trick by combining Apache Lucene, MongoDB, and GraphQL to easily build search functionality across data collections and 3rd party APIs into applications.", "description": "GraphQL is rapidly growing in popularity as the new standard for working with APIs, and it\u2019s easy to see why! This groundbreaking API query language gives developers a single endpoint to access exactly the data they need. This eliminates over-fetching, decreases the response payload, and avoids multiple costly round trips to the server and long page load times.\r\nThis could be a session, long or short, or a workshop. 
The application is a football themed app (or a movie app if the organizers prefer) where we start small with exposing data via a GraphQL endpoint in minutes, but then we make the application much different and more fun by using GraphQL custom resolvers to add a 3rd party TikTok endpoint to the mix. The code is hosted in a code sandbox so attendees will leave with the inspiration, best practices, and actual code to implement immediately in their workflow.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JFZQXV", "name": "Stanimira Vlaeva", "avatar": "https://program.berlinbuzzwords.de/media/avatars/JFZQXV_kRUostU.webp", "biography": "Stanimira Vlaeva is a Developer Advocate at MongoDB and a Google Developer Expert for Angular. She is passionate about explaining complex technical topics in an understandable way, live-coding, and contributing to open-source software. Her Twitter DMs are always open!", "public_name": "Stanimira Vlaeva", "guid": "38027160-a4c7-5a76-b898-e1918cad5d5b", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/JFZQXV/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/FFNZSK/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/FFNZSK/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/FFNZSK/resources/Stanimira_Vlaeva_-_Advanced_Search_Pl_KL5uBfn.pdf", "type": "related"}]}, {"guid": "1e276956-392f-516e-9e74-780fc97f3230", "code": "PRQ7PV", "id": 28093, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/PRQ7PV/Petreti-Ilaria_Ruggero-Anna_xYMLsg4.png", "date": "2023-06-20T11:00:00+02:00", "start": "11:00", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-28093-how-to-implement-online-search-quality-evaluation-with-kibana", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/PRQ7PV/", "title": "How to Implement 
Online Search Quality Evaluation with Kibana", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Conducting online testing is crucial for assessing a model\u2019s performance in a real-world scenario. This talk explores a customized approach for evaluating ranking models using Kibana.", "description": "Online testing represents a fundamental method to assess the performance of a ranking model in practical applications, providing the information needed to improve and better understand its behavior.\r\nDespite the advantages, the currently available evaluation tools have certain limitations. For this reason, we will present an alternative and customized approach to evaluate ranking models using Kibana.\r\nThe talk will begin with an overview of online testing, including its benefits and drawbacks. Then, we will provide an in-depth exploration of our Kibana implementation, detailing the reasons behind our approach. Attendees will learn about the various tools provided by Kibana, and with practical examples, we will show how to create visualizations and dashboards, complete with queries and code, to compare different rankers.\r\nAttending this presentation will provide participants with valuable knowledge on how to leverage Kibana for the purpose of evaluating ranking models on custom metrics and on specific contexts such as the most popular and \u201cpopulous\u201d queries.", "recording_license": "", "do_not_record": false, "persons": [{"code": "CBFBFW", "name": "Ilaria Petreti", "avatar": "https://program.berlinbuzzwords.de/media/avatars/CBFBFW_tBd7ngd.webp", "biography": "After an initial experience in the healthcare sector, believing strongly in the power of Big Data and Digital Transformation, Ilaria earned a Master in Data Science.\r\nSince joining the Sease team (in 2020), she has gained a diverse range of experiences through projects related to Machine Learning and Natural Language Processing for Information Retrieval 
systems.\r\nIlaria has been working on integrating Learning To Rank and Search Quality Evaluation in e-commerce ecosystems, with the goal of improving their performance and the relevance of search results.\r\nAdditionally, she is an active member of the information retrieval research community, regularly sharing her knowledge through blogs and talks, contributing to open-source projects, and participating in international conferences.", "public_name": "Ilaria Petreti", "guid": "9495bfbd-3bd9-5666-a0d4-98c3a515b555", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/CBFBFW/"}, {"code": "MJTHGB", "name": "Anna Ruggero", "avatar": "https://program.berlinbuzzwords.de/media/avatars/MJTHGB_HEvvFb3.webp", "biography": "Anna has demonstrated a passion for Information Retrieval since the University. Graduated from the University of Padua, with a computer science master\u2019s degree dissertation in Entity Search, Anna has been working as a Search Consultant in Sease since 2019.\r\nShe actively works to support clients in the process of improving their search engines with the implementation of innovative personalized solutions.\r\nShe specializes in the integration of machine learning techniques with information retrieval systems, from Learning to Rank techniques to Neural Searches and Recommender Systems. She extensively worked on e-commerce websites, improving their performance by developing personalized models and evaluation systems.\r\nAnna highly believes in innovation and research, keeping up-to-date with the latest academic studies and contributing to them. 
She participated in the European Conference of Information Retrieval 2022 with a poster on offline and online evaluation in the industry; and published a paper on improving interleaving techniques for the evaluation of information retrieval systems at the ECIR 2023.", "public_name": "Anna Ruggero", "guid": "53fa5d8d-d62e-5ef7-ab92-95248b0ef81d", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/MJTHGB/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/PRQ7PV/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/PRQ7PV/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/PRQ7PV/resources/Ilaria_Petrieti_-_Anna_Ruggero_-_How__Rbvvqck.pdf", "type": "related"}]}, {"guid": "e3db037e-f2d5-5492-9839-d54f156f94f3", "code": "N9JRVC", "id": 28056, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/N9JRVC/Ebrahimpour-Khosrow_41WBDnS.png", "date": "2023-06-20T11:50:00+02:00", "start": "11:50", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-28056-highly-available-search-at-shopify", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/N9JRVC/", "title": "Highly Available Search at Shopify", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk shares the story of how Shopify implemented seamless storage autoscaling for Elasticsearch that powers search for millions of merchants without data loss.", "description": "Millions of merchants rely on Shopify\u2019s search infrastructure to sell their products and fulfill their orders. To be successful, merchants need their data to be highly available and also searchable in a matter of seconds. Moreover, these merchants are spread in different jurisdictions across the globe where data residency regulations require them to ensure their sensitive data stays within their jurisdiction. 
However, since their buyers are also spread across the globe, non-personal data such as store products should be available globally and close to buyers to provide a fast search experience.  \r\n\r\nThis talk explains how the search platform team at Shopify built a highly available search infrastructure that indexes petabytes of data from traditional databases to Elasticsearch through Kafka in record time. \r\nSince search is a critical service for a global commerce platform at Shopify\u2019s scale, the indexing pipeline writing to Elasticsearch is implemented with high availability and disaster recovery as key requirements. That is, if one region becomes unavailable, the designed data replication mechanism allows the search infrastructure to provide service without impacting merchants and buyers.\r\nMoreover, this infrastructure is distributed across the globe and designed in a way to follow data residency regulations of different jurisdictions while making sure buyers are able to search products with minimum delay.\r\n\r\nShopify\u2019s search infrastructure has proven to be performant and capable of indexing millions of documents per minute while serving millions of queries at the same time. The lessons learned shared in this talk about the challenges of building a highly available and performant search infrastructure will be interesting to individuals and will encourage them to solve similar challenges.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ENYGVN", "name": "Khosrow Ebrahimpour", "avatar": "https://program.berlinbuzzwords.de/media/avatars/ENYGVN_WJd84Yl.webp", "biography": "I'm a Production Engineering manager at Shopify, where I lead the search platform team. 
Prior to that, I've worked at public companies and government organizations focusing on infrastructure handling large scale data.", "public_name": "Khosrow Ebrahimpour", "guid": "46114551-e0f9-5254-a1b0-1f4603d0909a", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/ENYGVN/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/N9JRVC/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/N9JRVC/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/N9JRVC/resources/Khosrow_Ebrahimpour_-_Highly_availabl_a8ucfeL.pdf", "type": "related"}]}, {"guid": "fd550680-0b17-5313-a175-75ec188998b4", "code": "K8AR9R", "id": 27976, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/K8AR9R/Bayton-Martin_KEqRHXJ.png", "date": "2023-06-20T14:00:00+02:00", "start": "14:00", "duration": "00:40", "room": "Frannz Salon", "slug": "berlin-buzzwords-2023-27976-using-dense-vector-search-at-the-eu-publications-office", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/K8AR9R/", "title": "Using Dense Vector search at the EU Publications Office", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "How dense vector functionality was used to provide several \u2018Google-like\u2019 capabilities such as Extractive Answers and knowledge graph search over a large dataset at the EU Publications Office.", "description": "In this session you will discover how dense vector functionality was used to enhance traditional search and provide users with a 'Google-like' search experience during a proof of concept over a large dataset of multi-lingual legal content curated by the European Union Publications Office in Luxembourg. 
The presentation will explain how a combination of Elasticsearch, Google BERT transformer models, and the Pureinsights Discovery Platform (PDP) were utilised during the project and discuss the results obtained. There will also be a live demonstration showing the power of semantic understanding across documents and search queries.", "recording_license": "", "do_not_record": false, "persons": [{"code": "K9BEWT", "name": "Martin Bayton", "avatar": "https://program.berlinbuzzwords.de/media/avatars/K9BEWT_Ws83r0N.webp", "biography": "Martin is currently Director of International Marketing at Pureinsights and is an evangelist for search and data analytics. He joined Pureinsights from Accenture where he worked for the Search & Content Analytics Group in Applied Intelligence. Prior to Accenture, he was Senior Manager Global OEM Partner Marketing at Qlik. Martin also spent close to 10 years working for the enterprise search vendor Convera.\r\nMartin holds an MBA from Nottingham Business School and a BSc in Mechanical Engineering from Nottingham Trent University.", "public_name": "Martin Bayton", "guid": "4ad7d93b-f46a-5961-926a-8100ac0a743c", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/K9BEWT/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/K8AR9R/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/K8AR9R/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/K8AR9R/resources/Martin_Bayton_-_Using_Dense_Vector_se_VJtVNmW.pdf", "type": "related"}]}, {"guid": "3fd3f61c-ada8-59fa-a66d-ffdc428977e1", "code": "KDT73L", "id": 27755, "logo": "https://program.berlinbuzzwords.de/media/berlin-buzzwords-2023/submissions/KDT73L/Bogh-K%C3%B6ster-Torsten_Berger-Dennis_uXP3UmH.png", "date": "2023-06-20T14:50:00+02:00", "start": "14:50", "duration": "00:40", "room": "Frannz Salon", "slug": 
"berlin-buzzwords-2023-27755-searching-large-data-sets-in-near-constant-time", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KDT73L/", "title": "Searching large data sets in (near) constant time", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Tackle large search results by estimating hit count, interpolating a first phase ranking and limiting\r\nthe returned result set to the most relevant documents in a multi-million document index.", "description": "In low latency search environments, queries producing large result sets are a real pain. A proper ranking of large result sets burns a lot of CPU. Those queries have the potential to slow down or even brick your cluster. On the customer side it is questionable whether it makes sense to return millions of documents as the customer has to filter them afterwards anyway.\r\n\r\nThose large result sets caused us heavy headaches as they significantly reduced the available compute headroom on the nodes of our Solr cluster. They even bricked the whole cluster when hitting the cluster in high volume. In this project report we'll guide you through the steps (and math) of how we:\r\n\r\n- construct index-based random experiments,\r\n- estimate the rough query hit count of a query by extrapolating bucket search results,\r\n- collect and apply static first phase ranking information,\r\n- use the information collected to filter the result set to the most relevant documents to return no more than a given number of documents,\r\n- extrapolate hit and facet counts to mimic the original search result and\r\n- handle document collapsing and faceting.\r\n\r\nIn this talk we'll guide you through the software architectural aspects as well as the math applied. 
Although applied to a Solr search system, this concept can be applied to other search engines as well.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EC7FUQ", "name": "Torsten B\u00f8gh K\u00f6ster", "avatar": "https://program.berlinbuzzwords.de/media/avatars/EC7FUQ_a6oRD2D.webp", "biography": "Torsten is a freelance search & operations engineer with a focus on open-source search, container, and cloud technology. He tweaks Apache Solr installations in the cloud and on bare metal with a focus on observability.", "public_name": "Torsten B\u00f8gh K\u00f6ster", "guid": "506f5f77-daa2-5d6e-967f-9821df380070", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/EC7FUQ/"}, {"code": "XRQLDW", "name": "Dennis Berger", "avatar": "https://program.berlinbuzzwords.de/media/avatars/XRQLDW_4gdXwFr.webp", "biography": "Dennis Berger is a freelance software and infrastructure engineer. He started his career in the industry developing low-latency applications and infrastructures with deep knowledge of the operating system, kernel, and application code. Now he focuses on developing fast and resource-efficient applications across the stack, from the IO path in the kernel to the user space, using innovative and modern technologies.", "public_name": "Dennis Berger", "guid": "4366bd5d-8e86-559e-a0c4-6ae290d907f7", "url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/speaker/XRQLDW/"}], "links": [], "feedback_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KDT73L/feedback/", "origin_url": "https://program.berlinbuzzwords.de/berlin-buzzwords-2023/talk/KDT73L/", "attachments": [{"title": "Slides", "url": "/media/berlin-buzzwords-2023/submissions/KDT73L/resources/Torsten_Bogh_Koester__Dennis_Berger_-_BW0Xm0f.pdf", "type": "related"}]}]}}]}}}