From 9525d5f8aa190f143af44036d726054d0fc1f7fb Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Fri, 27 Jun 2025 20:01:33 +0000 Subject: [PATCH 01/25] exmaples+website+sdks/python: update docs and exmaples for milvus transform --- .../beam-ml/milvus_enrichment_transform.ipynb | 2413 +++++++++++++++++ .../transforms/elementwise/enrichment.py | 72 + .../transforms/elementwise/enrichment_test.py | 82 +- .../python/elementwise/enrichment-milvus.md | 67 + .../python/elementwise/enrichment.md | 1 + .../section-menu/en/documentation.html | 1 + 6 files changed, 2633 insertions(+), 3 deletions(-) create mode 100644 examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb create mode 100644 website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-milvus.md diff --git a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb new file mode 100644 index 000000000000..a6ea23b9492f --- /dev/null +++ b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb @@ -0,0 +1,2413 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "47053bac", + "metadata": {}, + "outputs": [], + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. 
You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ] + }, + { + "cell_type": "markdown", + "id": "aa881240-2f38-4335-9d4d-444776d77c92", + "metadata": {}, + "source": [ + "# Use Apache Beam and Milvus to enrich data\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "0611da21-d031-4b16-8301-9b76bda731e7", + "metadata": {}, + "source": [ + "This notebook shows how to enrich data by using the Apache Beam [enrichment transform](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment/) with [Milvus](https://milvus.io/). The enrichment transform is an Apache Beam turnkey transform that lets you enrich data by using a key-value lookup. This transform has the following features:\n", + "\n", + "- The transform has a built-in Apache Beam handler that interacts with Milvus data during enrichment.\n", + "- The enrichment transform uses client-side throttling to rate limit the requests. The default retry strategy uses exponential backoff. You can configure rate limiting to suit your use case.\n", + "\n", + "This notebook demonstrates the following search engine optimization use case:\n", + "\n", + "A specialized technical search engine company wants to improve its query result relevance by dynamically enriching search results with semantically related content. The example uses a vector database of technical articles and documentation stored in Milvus to enrich incoming user queries. The enriched data is then used to provide users with more comprehensive and contextually relevant search results, especially for complex technical topics.\n", + "\n", + "## Before you begin\n", + "Set up your environment and download dependencies.\n", + "\n", + "### Install Apache Beam\n", + "To use the enrichment transform with the built-in Milvus handler, install the Apache Beam SDK version 2.67.0 or later." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e550cd55-e91e-4d43-b1bd-b0e89bb8cbd9", + "metadata": {}, + "outputs": [], + "source": [ + "# Disable tokenizers parallelism to prevent deadlocks when forking processes\n", + "# This avoids the \"huggingface/tokenizers: The current process just got forked\" warning.\n", + "import os\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "31747c45-107a-49be-8885-5a6cc9dc1236", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# The Apache Beam test dependencies are included here for the TestContainers\n", + "# Milvus standalone DB container that will be used later in the demo.\n", + "!pip install rich sentence_transformers llama_index --quiet\n", + "!pip install apache_beam[interactive,test]>=2.67.0 --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "666e0c2b-0341-4b0e-8d73-561abc39bb10", + "metadata": {}, + "outputs": [], + "source": [ + "# Standard library imports.\n", + "from collections import defaultdict\n", + "from math import ceil\n", + "from typing import List\n", + "\n", + "# Third-party imports.\n", + "import apache_beam as beam\n", + "from apache_beam.ml.rag.types import Chunk, Content, Embedding\n", + "from apache_beam.transforms.enrichment import Enrichment\n", + "import numpy as np\n", + "import pandas as pd\n", + "from pymilvus import DataType, CollectionSchema, FieldSchema, Function, FunctionType, MilvusClient, RRFRanker\n", + "from pymilvus.milvus_client import IndexParams\n", + "from rich import print_json\n", + "from sentence_transformers import SentenceTransformer\n", + "from torch import 
cuda\n", + "\n", + "# Local application imports.\n", + "from llama_index.core.text_splitter import SentenceSplitter\n", + "from apache_beam.ml.rag.enrichment.milvus_search import (\n", + " HybridSearchParameters, \n", + " KeywordSearchMetrics, \n", + " KeywordSearchParameters,\n", + " MilvusCollectionLoadParameters, \n", + " MilvusConnectionParameters, \n", + " MilvusSearchEnrichmentHandler,\n", + " MilvusSearchParameters, \n", + " SearchStrategy, \n", + " VectorSearchMetrics, \n", + " VectorSearchParameters\n", + ")\n", + "from apache_beam.ml.rag.enrichment.milvus_search_it_test import MilvusEnrichmentTestHelper" + ] + }, + { + "cell_type": "markdown", + "id": "338808ff-3f80-48e5-9c76-b8d19f8769b7", + "metadata": {}, + "source": [ + "## Collect Data" + ] + }, + { + "cell_type": "markdown", + "id": "d83ad549-5ee1-4a4c-ae5a-e638c3d0279f", + "metadata": {}, + "source": [ + "This content has been paraphrased from publicly available information on the internet using a large language model (OpenAI’s GPT-4) and is provided for informational purposes only." + ] + }, + { + "cell_type": "markdown", + "id": "d39a070a-206d-41f6-9033-fff0d5ea2128", + "metadata": {}, + "source": [ + "The third data point, related to Google Beam, was intentionally included to illustrate the importance of metadata filtering (filtered search) in Milvus—such as when a user searches for the term “Beam.” without it the vector database retrieval engine may confuse between Apache Beam and Google Beam." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38781cf5-e18f-40f5-827e-2d441ae7d2fa", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " {\n", + " \"id\": \"1\",\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", + " \"keywords\": [\"Apache Beam\", \"stream processing\", \"batch processing\", \"data pipelines\", \"SDK\"],\n", + " \"tags\": [\"Data Engineering\", \"Open Source\", \"Streaming\", \"Batch\", \"Big Data\"],\n", + " \"content\": (\n", + " \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. \"\n", + " \"Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. \"\n", + " \"Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. \"\n", + " \"The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. \"\n", + " \"Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. \"\n", + " \"Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. \"\n", + " \"Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. \"\n", + " \"It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. \"\n", + " \"Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. \"\n", + " \"This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. 
\"\n", + " \"The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. \"\n", + " \"The Beam model is based on a unified programming model that decouples pipeline logic from execution. \"\n", + " \"This makes it easier to reason about time and state in both batch and streaming pipelines. \"\n", + " \"Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. \"\n", + " \"Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. \"\n", + " \"Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. \"\n", + " \"Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n", + " )\n", + " },\n", + " {\n", + " \"id\": \"2\",\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", + " \"keywords\": [\"Google Cloud\", \"Dataflow\", \"Apache Beam\", \"serverless\", \"stream and batch\"],\n", + " \"tags\": [\"Cloud Computing\", \"Data Pipelines\", \"Google Cloud\", \"Serverless\", \"Enterprise\"],\n", + " \"content\": (\n", + " \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. \"\n", + " \"It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. \"\n", + " \"Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. \"\n", + " \"Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. \"\n", + " \"Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. 
\"\n", + " \"Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. \"\n", + " \"With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. \"\n", + " \"It’s a key component for architects building scalable, cloud-native data platforms. \"\n", + " \"Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. \"\n", + " \"Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. \"\n", + " \"Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments. \"\n", + " \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. \"\n", + " \"It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. \"\n", + " \"Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. \"\n", + " \"In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
\"\n", + " \"Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\n", + " )\n", + " },\n", + " {\n", + " \"id\": \"3\",\n", + " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", + " \"keywords\": [\"Google Beam\", \"Project Starline\", \"3D video\", \"AI communication\", \"real-time meetings\"],\n", + " \"tags\": [\"AI\", \"Communication\", \"3D Technology\", \"Remote Work\", \"Enterprise Tech\"],\n", + " \"content\": (\n", + " \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. \"\n", + " \"Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. \"\n", + " \"This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. \"\n", + " \"Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. \"\n", + " \"Powered by Google AI, Beam represents a significant leap in communication technology. \"\n", + " \"Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. \"\n", + " \"Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. \"\n", + " \"Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
\"\n", + " \"It’s a promising step toward more human and effective remote interactions.\"\n", + " )\n", + " }\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "758c2af7-12c7-477b-9257-3c88712960e7", + "metadata": {}, + "source": [ + "## Exploratory Data Analysis (EDA)" + ] + }, + { + "cell_type": "markdown", + "id": "5e751905-7217-4571-bc07-991ef850a6b2", + "metadata": {}, + "source": [ + "### Average Words/Tokens per Doc" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "489e93b6-de41-4ec3-be33-a15c3cba12e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
# Words
count3.000000
mean253.666667
std72.858310
min172.000000
25%224.500000
50%277.000000
75%294.500000
max312.000000
\n", + "
" + ], + "text/plain": [ + " # Words\n", + "count 3.000000\n", + "mean 253.666667\n", + "std 72.858310\n", + "min 172.000000\n", + "25% 224.500000\n", + "50% 277.000000\n", + "75% 294.500000\n", + "max 312.000000" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The second video may skew the average tokens results since it is a youtube short video.\n", + "contents = [c['content'] for c in corpus]\n", + "content_lengths = [len(content.split(\" \")) for content in contents]\n", + "df = pd.DataFrame(content_lengths, columns=['# Words'])\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "eb32aad0-febd-45af-b4bd-e2176b07e2dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The mean word count for each video is about 254 words, which corresponds to a rough token count of 331 tokens.\n" + ] + } + ], + "source": [ + "mean_word_count = ceil(np.mean(content_lengths))\n", + "token_to_word_ratio = 1.3\n", + "approx_token_count = ceil(mean_word_count * token_to_word_ratio)\n", + "print(f'The mean word count for each video is about {mean_word_count} words, which corresponds to a rough token count of {approx_token_count} tokens.')" + ] + }, + { + "cell_type": "markdown", + "id": "42c1c159-875d-411b-a009-4361301b39f6", + "metadata": {}, + "source": [ + "## Preprocess Data" + ] + }, + { + "cell_type": "markdown", + "id": "d545355e-41da-4c53-ba9a-4d33b1fe376c", + "metadata": {}, + "source": [ + "### Chunking" + ] + }, + { + "cell_type": "markdown", + "id": "a034c5d0-0906-4193-80ac-736a32d7b47e", + "metadata": {}, + "source": [ + "We'll use sentence splitting as the chunking strategy for simplicity.
\n", + "Ideally, we would pass a tokenizer here — preferably the same one used by the retriever — to ensure consistency.
\n", + "However, in this example, we are not using a tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e7e45d70-0c23-409d-b435-b9479245c1ff", + "metadata": {}, + "outputs": [], + "source": [ + "# The `chunk_size` parameter is constrained by the embedding model we’re using.\n", + "# Since we’re using `sentence-transformers/all-MiniLM-L6-v2`, which has a maximum token limit of ~384 tokens,\n", + "# we need to ensure chunk sizes stay well within that limit.\n", + "# Given that each document in our dataset contains approximately 331 tokens,\n", + "# using a chunk size of 256 allows us to preserve nearly the most semantic meaning of each entry\n", + "# while staying safely under the model’s token limit.\n", + "chunk_size = 256\n", + "llama_txt_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5a013b08-d7e7-4367-ad49-43ad1320158f", + "metadata": {}, + "outputs": [], + "source": [ + "def split_contents(corpus: list[dict], text_splitter: SentenceSplitter, content_field: str='content') -> list[list[str]]:\n", + " result = []\n", + " for video in corpus:\n", + " split = llama_txt_splitter.split_text(video[content_field])\n", + " result.append(split)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2d5ea747-40b3-474e-ac36-ccb81256a36c", + "metadata": {}, + "outputs": [], + "source": [ + "content_splits = split_contents(corpus, llama_txt_splitter, \"content\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9917cefb-6271-4285-a75d-a6d1bfcbfd06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "  [\n",
+       "    \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n",
+       "    \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n",
+       "  ],\n",
+       "  [\n",
+       "    \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n",
+       "    \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\n",
+       "  ],\n",
+       "  [\n",
+       "    \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\n",
+       "  ]\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\n", + " \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", + " \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. 
Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", + " \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. 
Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print_json(data=content_splits)" + ] + }, + { + "cell_type": "markdown", + "id": "c860e558-2da3-45a6-9e54-acb8b4ffab22", + "metadata": {}, + "source": [ + "### Embedding Generation" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "aa55928d-c6ca-47c5-883d-d14eb0aa1298", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's choose `sentence-transformers/all-MiniLM-L6-v2` as our embedding generator here.\n", + "# It gives a good balance between embedding generation speed, accuracy, and being free to use.\n", + "model_name = 'sentence-transformers/all-MiniLM-L6-v2'\n", + "model = SentenceTransformer(model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "26e80afa-b9dc-4778-8301-ce38264d58cd", + "metadata": {}, + "outputs": [], + "source": [ + "def get_default_device():\n", + " return \"cuda:0\" if cuda.is_available() else \"cpu\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "68e04606-ca81-4a1f-81d2-964495295ed3", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_embedding(chunk, device=get_default_device()):\n", + " return list(map(float, model.encode(chunk, device=device)))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "43c55049-fbd9-4a1c-ae74-c12b5f5a03ee", + "metadata": {}, + "outputs": [], + "source": [ + "def encode_content_splits(content_splits: list[list[str]],\n", + " model: SentenceTransformer,\n", + " device: str = get_default_device()\n", + " ) -> list[list[tuple[str,list]]]:\n", + " result = []\n", + " for split in content_splits:\n", + " sub_result = []\n", 
+ " for chunk in split:\n", + " encoded = encode_embedding(chunk, device)\n", + " sub_result.append((chunk, encoded))\n", + " result.append(sub_result)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3ec7c739-6adc-4591-b5b2-9e60d7783c3c", + "metadata": {}, + "outputs": [], + "source": [ + "text_vector_tuples = encode_content_splits(content_splits, model)" + ] + }, + { + "cell_type": "markdown", + "id": "3afe67f9-d3cb-499b-b84b-ad8b14f40362", + "metadata": {}, + "source": [ + "### Joining Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "541794c7-f9a6-4d42-a522-8f4a3d1b1dfa", + "metadata": {}, + "outputs": [], + "source": [ + "def join_metadata(corpus: list[dict], \n", + " text_vector_list: list[list[tuple[str, list]]],\n", + " unique_id_field: str='id',\n", + " content_field: str='content',\n", + " embedding_field: str='content_embedding'\n", + " ) -> list[dict]:\n", + " result = []\n", + " for indx, embeddings in enumerate(text_vector_list):\n", + " for j, (chunk_text, embedding) in enumerate(embeddings):\n", + " doc = {**corpus[indx]}\n", + " doc[content_field] = chunk_text\n", + " doc[embedding_field] = embedding\n", + " doc[\"doc_id\"] = f\"{doc[unique_id_field]}_{j+1}\"\n", + " del doc[unique_id_field]\n", + " result.append(doc)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6f2ebedc-7d72-4deb-838c-42b8f103ceb4", + "metadata": {}, + "outputs": [], + "source": [ + "docs = join_metadata(corpus, text_vector_tuples)" + ] + }, + { + "cell_type": "markdown", + "id": "765115e1-4327-44f6-9dff-5d79121eeb02", + "metadata": {}, + "source": [ + "## Milvus Sink I/O" + ] + }, + { + "cell_type": "markdown", + "id": "492adeba-c6cd-404d-9d48-dfcaeca503c2", + "metadata": {}, + "source": [ + "This could be delegated to the Beam Milvus Sink I/O once it is implemented. For now, we will use pymilvs client directly for indexing." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3889aaa4-3c0c-4d71-bad3-b196b5eac8dc", + "metadata": {}, + "source": [ + "### Setup Milvus" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5ae9bc82-9ad7-46dd-b254-19cbdcdd0e07", + "metadata": {}, + "outputs": [], + "source": [ + "db = None" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "aff7b261-3330-4fa9-9a54-3fd87b42521f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Pulling image testcontainers/ryuk:0.8.1\n", + "Container started: aa9a64365154\n", + "Waiting for container with image testcontainers/ryuk:0.8.1 to be ready ...\n", + "Pulling image milvusdb/milvus:v2.5.10\n", + "Container started: 74649e2c3f75\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n" + ] + } + ], + "source": [ + "if not db:\n", + " db = MilvusEnrichmentTestHelper.start_db_container()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "31496ee0-75a2-48ad-954e-9c4ae5abbf5e", + "metadata": {}, + "outputs": [], + "source": [ + "milvus_connection_parameters = MilvusConnectionParameters(uri=db.uri, user=db.user, password=db.password, db_id=db.id)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "82627714-2425-4058-9b47-d262f015caf7", + "metadata": {}, 
+ "outputs": [], + "source": [ + "client = MilvusClient(**milvus_connection_parameters.__dict__)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e8a85f51-5d5f-4533-bf0f-ec825e613dc2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.5.10'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.get_server_version()" + ] + }, + { + "cell_type": "markdown", + "id": "2344abb9-c170-4496-993e-736e2b50c2bb", + "metadata": {}, + "source": [ + "### Define Schema" + ] + }, + { + "cell_type": "markdown", + "id": "31130864-a7c6-45af-bc15-8b64bb9ff8fa", + "metadata": {}, + "source": [ + "#### Define Fields" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c014af94-1bb7-44e4-842c-1039f4a2a11d", + "metadata": {}, + "outputs": [], + "source": [ + "fields = [\n", + " FieldSchema(name=\"id\", dtype=DataType.INT64, is_primary=True, auto_id=True),\n", + " FieldSchema(name=\"vector\", dtype=DataType.FLOAT_VECTOR, dim=model.get_sentence_embedding_dimension()),\n", + " FieldSchema(name=\"sparse_vector\", dtype=DataType.SPARSE_FLOAT_VECTOR),\n", + " FieldSchema(name=\"title\", dtype=DataType.VARCHAR, max_length=256),\n", + " FieldSchema(name=\"content\", dtype=DataType.VARCHAR, max_length=65279),\n", + " FieldSchema(name=\"combined_text\", dtype=DataType.VARCHAR, max_length=65279+256, enable_analyzer=True),\n", + " FieldSchema(name=\"doc_id\", dtype=DataType.VARCHAR, max_length=100),\n", + " FieldSchema(name=\"keywords\", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=100, max_capacity=64),\n", + " FieldSchema(name=\"tags\", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=100, max_capacity=32),\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "76535a60-87f5-48e0-9c73-38aa2c6b4d0e", + "metadata": {}, + "source": [ + "### Define Functions for Processing" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + 
"id": "54fb3428-b007-4804-9d79-b3933d3256c5", + "metadata": {}, + "outputs": [], + "source": [ + "bm25_function = Function(\n", + " name=\"content_bm25_emb\",\n", + " input_field_names=[\"combined_text\"],\n", + " output_field_names=[\"sparse_vector\"],\n", + " function_type=FunctionType.BM25)\n", + "\n", + "functions = [bm25_function]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "4c2f123a-5949-4974-af48-a5db5b168c11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': , 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': , 'params': {'dim': 384}}, {'name': 'sparse_vector', 'description': '', 'type': , 'is_function_output': True}, {'name': 'title', 'description': '', 'type': , 'params': {'max_length': 256}}, {'name': 'content', 'description': '', 'type': , 'params': {'max_length': 65279}}, {'name': 'combined_text', 'description': '', 'type': , 'params': {'max_length': 65535, 'enable_analyzer': True}}, {'name': 'doc_id', 'description': '', 'type': , 'params': {'max_length': 100}}, {'name': 'keywords', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 64}, 'element_type': }, {'name': 'tags', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 32}, 'element_type': }], 'enable_dynamic_field': False, 'functions': [{'name': 'content_bm25_emb', 'description': '', 'type': , 'input_field_names': ['combined_text'], 'output_field_names': ['sparse_vector'], 'params': {}}]}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema = CollectionSchema(fields=fields,functions=functions)\n", + "schema" + ] + }, + { + "cell_type": "markdown", + "id": "04f15d4b-1192-464b-9635-cb4cbc530431", + "metadata": {}, + "source": [ + "### Define Indices" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": 
"671f4352-2086-4428-83be-0de48926682d", + "metadata": {}, + "outputs": [], + "source": [ + "index_params = IndexParams()" + ] + }, + { + "cell_type": "markdown", + "id": "378909d0-3aa8-46a5-8983-3ab29a1b0049", + "metadata": {}, + "source": [ + "#### Define Dense Vector Index" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "aa8baae5-7c38-4e78-ace4-304c7dc6b127", + "metadata": {}, + "outputs": [], + "source": [ + "index_params.add_index(\n", + " field_name=\"vector\",\n", + " index_name=\"dense_vector_ivf_flat\",\n", + " index_type=\"IVF_FLAT\",\n", + " metric_type=VectorSearchMetrics.COSINE.value,\n", + " params={\"nlist\": 1024})" + ] + }, + { + "cell_type": "markdown", + "id": "f4b45f5a-e583-4d77-9640-75842211fefa", + "metadata": {}, + "source": [ + "#### Define Sparse Vector Index" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d970a35b-f9b2-4f8f-93ef-8de5c83c31b5", + "metadata": {}, + "outputs": [], + "source": [ + "index_params.add_index(\n", + " field_name=\"sparse_vector\",\n", + " index_name=\"sparse_inverted_index\",\n", + " index_type=\"SPARSE_INVERTED_INDEX\",\n", + " metric_type=KeywordSearchMetrics.BM25.value,\n", + " params={\"inverted_index_algo\": \"DAAT_MAXSCORE\", \"bm25_k1\": 1.2, \"bm25_b\": 0.75})" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "0d45a6ad-2009-4e30-b38d-73266da98a06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'field_name': 'vector', 'index_type': 'IVF_FLAT', 'index_name': 'dense_vector_ivf_flat', 'nlist': 1024, 'metric_type': 'COSINE'},\n", + " {'field_name': 'sparse_vector', 'index_type': 'SPARSE_INVERTED_INDEX', 'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index_params" + ] + }, + { + "cell_type": "markdown", + "id": 
"22a260da-8869-40bb-9cbf-28a73e8cca24", + "metadata": {}, + "source": [ + "### Create Collection" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "51dd4423-240c-4271-bb8c-6270f399a25c", + "metadata": {}, + "outputs": [], + "source": [ + "collection_name = \"beam_minilm_256\"" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "9620b1f2-51fa-491c-ad3f-f0676b9b25f6", + "metadata": {}, + "outputs": [], + "source": [ + "client.drop_collection(collection_name=collection_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e6cf3a1d-265c-44db-aba8-d491fab290d5", + "metadata": {}, + "outputs": [], + "source": [ + "client.create_collection(collection_name=collection_name, schema=schema, index_params=index_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "94497411-43d3-4300-98b3-1cb33759738e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.has_collection(collection_name)" + ] + }, + { + "cell_type": "markdown", + "id": "b10fc2bb-b17c-4d8b-85de-7a0bc10f6779", + "metadata": {}, + "source": [ + "### Index Data" + ] + }, + { + "cell_type": "markdown", + "id": "38b10fcf-7b07-4bf5-a3b0-581ccdd09fe3", + "metadata": {}, + "source": [ + "#### Index" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "20fd6f92-277f-42a3-b0a1-d9e9cb030caa", + "metadata": {}, + "outputs": [], + "source": [ + "data_ready_to_index = []\n", + "for doc in docs:\n", + " item = {}\n", + " item[\"vector\"] = doc[\"content_embedding\"]\n", + " item[\"content\"] = doc[\"content\"]\n", + " item[\"doc_id\"] = doc[\"doc_id\"]\n", + " item[\"title\"] = doc[\"title\"]\n", + " item[\"keywords\"] = doc[\"keywords\"]\n", + " item[\"tags\"] = doc[\"tags\"]\n", + " item[\"combined_text\"] = f\"{doc['title']}. 
{doc['content']}\"\n", + " data_ready_to_index.append(item)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "178e59dd-d9aa-4948-a02b-f57ee919f0ff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'insert_count': 5, 'ids': [459025737739141235, 459025737739141236, 459025737739141237, 459025737739141238, 459025737739141239], 'cost': 0}" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.insert(collection_name=collection_name, data=data_ready_to_index)" + ] + }, + { + "cell_type": "markdown", + "id": "fa5c502d-2a37-4050-a846-73bebb1bf6c0", + "metadata": {}, + "source": [ + "#### Check the Indexed Data" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b01b111e-41f2-4d9f-b7f5-4fc42305fbe0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'id': 459025737739141235, 'distance': 0.5704954862594604, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': 'Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. 
It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.', 'doc_id': '1_1', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", + "---\n", + "{'id': 459025737739141236, 'distance': 0.43758389353752136, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\", 'doc_id': '1_2', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", + "---\n", + "{'id': 459025737739141238, 'distance': 0.36327481269836426, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.', 'doc_id': '2_2', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", + "---\n", + "{'id': 459025737739141239, 'distance': 0.34582412242889404, 'entity': {'title': 'Google Beam: 3D Communication Powered by AI', 'content': 'Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.', 'doc_id': '3_1', 'keywords': ['Google Beam', 'Project Starline', '3D video', 'AI communication', 'real-time meetings'], 'tags': ['AI', 'Communication', '3D Technology', 'Remote Work', 'Enterprise Tech']}}\n", + "---\n", + "{'id': 459025737739141237, 'distance': 0.2492937296628952, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. 
With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.', 'doc_id': '2_1', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", + "---\n" + ] + } + ], + "source": [ + "# Search by content vector similarity.\n", + "query_embedding = model.encode(\"What is apache beam\")\n", + "\n", + "search_results = client.search(\n", + " collection_name=collection_name,\n", + " data=[query_embedding],\n", + " anns_field=\"vector\",\n", + " limit=5,\n", + " output_fields=[\"title\", \"content\", \"doc_id\", \"keywords\", \"tags\"]\n", + ")\n", + "\n", + "for hits in search_results:\n", + " for hit in hits:\n", + " print(hit)\n", + " print(\"---\")" + ] + }, + { + "cell_type": "markdown", + "id": "ea478136-2ca8-4fee-bb1e-6bfcc2e97c93", + "metadata": {}, + "source": [ + "## Milvus Beam Enrichment Handler" + ] + }, + { + "cell_type": "markdown", + "id": "e9ad2509-3e5d-42e8-b565-ecccde38b8f4", + "metadata": {}, + "source": [ + "### Prep for Milvus Beam Enrichment Handler" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "4911e8cc-10f1-4d21-9251-1b756b61f2c1", + "metadata": {}, + "outputs": [], + "source": [ + "class FormatAndPrintResults(beam.PTransform):\n", + " def expand(self, pcoll):\n", + " return pcoll | beam.Map(self.format_and_print)\n", + " \n", + " @staticmethod\n", + " def 
format_and_print(chunk):\n", + " # Create a clean structure to display.\n", + " formatted_result = {\n", + " \"query\": chunk.content.text,\n", + " \"query_embedding\": FormatAndPrintResults.get_embedding_count(chunk),\n", + " \"results\": []\n", + " }\n", + " \n", + " # Extract the enrichment data\n", + " enrichment_data = chunk.metadata.get('enrichment_data', defaultdict(list))\n", + " \n", + " # Format each result with its distance score\n", + " for i in range(len(enrichment_data.get('id', []))):\n", + " result = {\n", + " \"id\": enrichment_data['id'][i],\n", + " \"distance\": round(enrichment_data['distance'][i], 4),\n", + " \"fields\": enrichment_data['fields'][i] if i < len(enrichment_data.get('fields', [])) else {}\n", + " }\n", + " formatted_result[\"results\"].append(result)\n", + " \n", + " # Sort by distance in descending order (highest/best first)\n", + " formatted_result[\"results\"] = sorted(formatted_result[\"results\"], key=lambda x: x[\"distance\"], reverse=True)\n", + "\n", + " # Print the formatted JSON\n", + " print_json(data=formatted_result)\n", + " \n", + " # Return the original chunk for further processing if needed\n", + " return chunk\n", + "\n", + " @staticmethod\n", + " def get_embedding_count(chunk):\n", + " if chunk.embedding:\n", + " if chunk.embedding.dense_embedding:\n", + " return len(chunk.embedding.dense_embedding)\n", + " if chunk.embedding.sparse_embedding:\n", + " return len(chunk.embedding.sparse_embedding)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "dcbed23b-1fc2-4f89-a6d0-e05c15d5e655", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "MilvusConnectionParameters(uri='http://localhost:55825', user='', password='', db_id='default', token='', timeout=None, kwargs={})" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "milvus_connection_parameters" + ] + }, + { + "cell_type": "markdown", + "id": 
"656110c9-1360-49fd-ba17-f55f2257f127", + "metadata": {}, + "source": [ + "### Vector Search" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "74db1238-0a04-4e08-818d-5bce8f09006b", + "metadata": {}, + "outputs": [], + "source": [ + "query = encode_embedding(\"what is beam?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "79e16531-8bec-4b4b-9ed3-cebd705480e0", + "metadata": {}, + "outputs": [], + "source": [ + "search_parameters = MilvusSearchParameters(\n", + " collection_name=collection_name,\n", + " search_strategy=VectorSearchParameters(limit=10, anns_field=\"vector\"),\n", + " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "cbef1911-6464-4ba1-8974-ed00896c7e8b", + "metadata": {}, + "outputs": [], + "source": [ + "collection_load_parameters = MilvusCollectionLoadParameters() " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "f0481286-3f2b-4690-a2f6-a5a00de3ff34", + "metadata": {}, + "outputs": [], + "source": [ + "milvus_handler = MilvusSearchEnrichmentHandler(\n", + " connection_parameters=milvus_connection_parameters,\n", + " search_parameters=search_parameters,\n", + " collection_load_parameters=collection_load_parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "35ee37f2-60cd-4d5d-aef6-aed4fda79161", + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 
'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{\n",
+       "  \"query\": null,\n",
+       "  \"query_embedding\": 384,\n",
+       "  \"results\": [\n",
+       "    {\n",
+       "      \"id\": 459025737739141235,\n",
+       "      \"distance\": 0.453,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Apache Beam\",\n",
+       "          \"stream processing\",\n",
+       "          \"batch processing\",\n",
+       "          \"data pipelines\",\n",
+       "          \"SDK\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Data Engineering\",\n",
+       "          \"Open Source\",\n",
+       "          \"Streaming\",\n",
+       "          \"Batch\",\n",
+       "          \"Big Data\"\n",
+       "        ],\n",
+       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n",
+       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141236,\n",
+       "      \"distance\": 0.4353,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Apache Beam\",\n",
+       "          \"stream processing\",\n",
+       "          \"batch processing\",\n",
+       "          \"data pipelines\",\n",
+       "          \"SDK\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Data Engineering\",\n",
+       "          \"Open Source\",\n",
+       "          \"Streaming\",\n",
+       "          \"Batch\",\n",
+       "          \"Big Data\"\n",
+       "        ],\n",
+       "        \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n",
+       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141239,\n",
+       "      \"distance\": 0.3927,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Google Beam\",\n",
+       "          \"Project Starline\",\n",
+       "          \"3D video\",\n",
+       "          \"AI communication\",\n",
+       "          \"real-time meetings\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"AI\",\n",
+       "          \"Communication\",\n",
+       "          \"3D Technology\",\n",
+       "          \"Remote Work\",\n",
+       "          \"Enterprise Tech\"\n",
+       "        ],\n",
+       "        \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n",
+       "        \"title\": \"Google Beam: 3D Communication Powered by AI\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141238,\n",
+       "      \"distance\": 0.2925,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Google Cloud\",\n",
+       "          \"Dataflow\",\n",
+       "          \"Apache Beam\",\n",
+       "          \"serverless\",\n",
+       "          \"stream and batch\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Cloud Computing\",\n",
+       "          \"Data Pipelines\",\n",
+       "          \"Google Cloud\",\n",
+       "          \"Serverless\",\n",
+       "          \"Enterprise\"\n",
+       "        ],\n",
+       "        \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n",
+       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141237,\n",
+       "      \"distance\": 0.2342,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Google Cloud\",\n",
+       "          \"Dataflow\",\n",
+       "          \"Apache Beam\",\n",
+       "          \"serverless\",\n",
+       "          \"stream and batch\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Cloud Computing\",\n",
+       "          \"Data Pipelines\",\n",
+       "          \"Google Cloud\",\n",
+       "          \"Serverless\",\n",
+       "          \"Enterprise\"\n",
+       "        ],\n",
+       "        \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n",
+       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n",
+       "      }\n",
+       "    }\n",
+       "  ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"query\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n", + " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", + " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. 
Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141236\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. 
The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141239\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam\"\u001b[0m,\n", + " \u001b[32m\"Project Starline\"\u001b[0m,\n", + " \u001b[32m\"3D video\"\u001b[0m,\n", + " \u001b[32m\"AI communication\"\u001b[0m,\n", + " \u001b[32m\"real-time meetings\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"AI\"\u001b[0m,\n", + " \u001b[32m\"Communication\"\u001b[0m,\n", + " \u001b[32m\"3D Technology\"\u001b[0m,\n", + " \u001b[32m\"Remote Work\"\u001b[0m,\n", + " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. 
It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141238\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2925\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141237\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2342\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. 
With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with beam.Pipeline() as p:\n", + " _ = (\n", + " p\n", + " | \"Create\" >> beam.Create([Chunk(content=Content(),embedding=Embedding(dense_embedding=query))])\n", + " | \"Enrich W/ Milvus Vector Search\" >> Enrichment(milvus_handler)\n", + " | \"Format and Print Results\" >> FormatAndPrintResults())" + ] + }, + { + "cell_type": "markdown", + "id": "cb626be4-1c1c-4426-a6be-9cc8e385f2c8", + "metadata": {}, + "source": [ + "### Keyword Search" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "f159ad87-5153-48bb-87b3-3845d3c76420", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"what is beam?\"" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "8b8cad3e-8a18-464b-8de6-aa4515a653c5", + "metadata": {}, + "outputs": [], + "source": [ + "search_parameters = MilvusSearchParameters(\n", + " collection_name=collection_name,\n", + " search_strategy=KeywordSearchParameters(limit=10,anns_field=\"sparse_vector\"),\n", + " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + ] + }, + { + "cell_type": 
"code", + "execution_count": 47, + "id": "47cfc650-0b34-4333-9321-19be2e8fdc85", + "metadata": {}, + "outputs": [], + "source": [ + "collection_load_parameters = MilvusCollectionLoadParameters() " + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "4754763b-66bf-4f90-9920-28cef223b536", + "metadata": {}, + "outputs": [], + "source": [ + "milvus_handler = MilvusSearchEnrichmentHandler(\n", + " connection_parameters=milvus_connection_parameters,\n", + " search_parameters=search_parameters,\n", + " collection_load_parameters=collection_load_parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "a3db4837-01c7-42d7-b4e8-58d8d361fe93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "  \"query\": \"what is beam?\",\n",
+       "  \"query_embedding\": null,\n",
+       "  \"results\": [\n",
+       "    {\n",
+       "      \"id\": 459025737739141236,\n",
+       "      \"distance\": 0.5657,\n",
+       "      \"fields\": {\n",
+       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
+       "        \"keywords\": [\n",
+       "          \"Apache Beam\",\n",
+       "          \"stream processing\",\n",
+       "          \"batch processing\",\n",
+       "          \"data pipelines\",\n",
+       "          \"SDK\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Data Engineering\",\n",
+       "          \"Open Source\",\n",
+       "          \"Streaming\",\n",
+       "          \"Batch\",\n",
+       "          \"Big Data\"\n",
+       "        ],\n",
+       "        \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141239,\n",
+       "      \"distance\": 0.5471,\n",
+       "      \"fields\": {\n",
+       "        \"title\": \"Google Beam: 3D Communication Powered by AI\",\n",
+       "        \"keywords\": [\n",
+       "          \"Google Beam\",\n",
+       "          \"Project Starline\",\n",
+       "          \"3D video\",\n",
+       "          \"AI communication\",\n",
+       "          \"real-time meetings\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"AI\",\n",
+       "          \"Communication\",\n",
+       "          \"3D Technology\",\n",
+       "          \"Remote Work\",\n",
+       "          \"Enterprise Tech\"\n",
+       "        ],\n",
+       "        \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141235,\n",
+       "      \"distance\": 0.53,\n",
+       "      \"fields\": {\n",
+       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
+       "        \"keywords\": [\n",
+       "          \"Apache Beam\",\n",
+       "          \"stream processing\",\n",
+       "          \"batch processing\",\n",
+       "          \"data pipelines\",\n",
+       "          \"SDK\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Data Engineering\",\n",
+       "          \"Open Source\",\n",
+       "          \"Streaming\",\n",
+       "          \"Batch\",\n",
+       "          \"Big Data\"\n",
+       "        ],\n",
+       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141237,\n",
+       "      \"distance\": 0.5055,\n",
+       "      \"fields\": {\n",
+       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n",
+       "        \"keywords\": [\n",
+       "          \"Google Cloud\",\n",
+       "          \"Dataflow\",\n",
+       "          \"Apache Beam\",\n",
+       "          \"serverless\",\n",
+       "          \"stream and batch\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Cloud Computing\",\n",
+       "          \"Data Pipelines\",\n",
+       "          \"Google Cloud\",\n",
+       "          \"Serverless\",\n",
+       "          \"Enterprise\"\n",
+       "        ],\n",
+       "        \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141238,\n",
+       "      \"distance\": 0.134,\n",
+       "      \"fields\": {\n",
+       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n",
+       "        \"keywords\": [\n",
+       "          \"Google Cloud\",\n",
+       "          \"Dataflow\",\n",
+       "          \"Apache Beam\",\n",
+       "          \"serverless\",\n",
+       "          \"stream and batch\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Cloud Computing\",\n",
+       "          \"Data Pipelines\",\n",
+       "          \"Google Cloud\",\n",
+       "          \"Serverless\",\n",
+       "          \"Enterprise\"\n",
+       "        ],\n",
+       "        \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\n",
+       "      }\n",
+       "    }\n",
+       "  ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"query\"\u001b[0m: \u001b[32m\"what is beam?\"\u001b[0m,\n", + " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n", + " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141236\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5657\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. 
Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141239\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5471\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam\"\u001b[0m,\n", + " \u001b[32m\"Project Starline\"\u001b[0m,\n", + " \u001b[32m\"3D video\"\u001b[0m,\n", + " \u001b[32m\"AI communication\"\u001b[0m,\n", + " \u001b[32m\"real-time meetings\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"AI\"\u001b[0m,\n", + " \u001b[32m\"Communication\"\u001b[0m,\n", + " \u001b[32m\"3D Technology\"\u001b[0m,\n", + " \u001b[32m\"Remote Work\"\u001b[0m,\n", + " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. 
Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.53\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. 
Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141237\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5055\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. 
With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141238\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.134\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. 
Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with beam.Pipeline() as p:\n", + " _ = (\n", + " p\n", + " | \"Create\" >> beam.Create([Chunk(content=Content(text=query))])\n", + " | \"Enrich W/ Milvus Keyword Search\" >> Enrichment(milvus_handler)\n", + " | \"Format and Print Results\" >> FormatAndPrintResults()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "de344931-4f2e-473d-bd53-c2708c1d1bcc", + "metadata": {}, + "source": [ + "### Hybrid Search" + ] + }, + { + "cell_type": "markdown", + "id": "4afec961-71ae-49cc-85ac-2b88eff6b23b", + "metadata": {}, + "source": [ + "Let’s choose a deliberate query that illustrates the benefits of hybrid search:\n", + "\n", + "Query: \"real-time data processing systems\"\n", + "\n", + "This query demonstrates hybrid search advantages because:\n", + "\n", + "* Dense vector (semantic) contribution: Will understand the conceptual relationship between \"real-time processing\" and \"streaming\" (found in docs #1 and #2)\n", + "* Sparse vector (keyword) contribution: Will match exact terms like \"data\" and \"processing\" (found in docs #1 and #2)\n", + "* Hybrid advantage: Document #1 about Apache Beam should rank highest since it contains more specific technical details about real-time processing capabilities like \"event time,\" \"triggers,\" and \"stateful processing\" - even though the exact phrase \"real-time data processing\" doesn't appear in 
any document" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "172b6c80-2a03-49d0-afc7-12bb0a4dc989", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"real-time data processing system\"\n", + "query_embedding = encode_embedding(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "eb6d951c-0def-45cc-84a4-b6f7b7575f23", + "metadata": {}, + "outputs": [], + "source": [ + "hybrid_search_parameters = HybridSearchParameters(\n", + " vector=VectorSearchParameters(limit=10,anns_field=\"vector\"),\n", + " keyword=KeywordSearchParameters(limit=10,anns_field=\"sparse_vector\"),\n", + " ranker=RRFRanker(3),\n", + " limit=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "b339c498-d229-42e6-b439-b29eb107b533", + "metadata": {}, + "outputs": [], + "source": [ + "search_parameters = MilvusSearchParameters(\n", + " collection_name=collection_name,\n", + " search_strategy=hybrid_search_parameters,\n", + " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "b346abe6-03c9-4b28-a0fb-74936b9f3a06", + "metadata": {}, + "outputs": [], + "source": [ + "collection_load_parameters = MilvusCollectionLoadParameters() " + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "ab27810d-40a8-4b6a-bc82-441e13763ebc", + "metadata": {}, + "outputs": [], + "source": [ + "milvus_handler = MilvusSearchEnrichmentHandler(\n", + " connection_parameters=milvus_connection_parameters,\n", + " search_parameters=search_parameters,\n", + " collection_load_parameters=collection_load_parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "9a37aa5b-d652-4dd3-9fe0-e277182415b9", + "metadata": {}, + "outputs": [], + "source": [ + "chunk = Chunk(\n", + " content=Content(text=query),\n", + " embedding=Embedding(dense_embedding=query_embedding)\n", + ")\n", + "\n", + "chunks = [chunk]" + ] + }, + { + "cell_type": 
"code", + "execution_count": 58, + "id": "ea9d84f7-d142-4afa-9a6f-6c310d9604b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "  \"query\": \"real-time data processing system\",\n",
+       "  \"query_embedding\": 384,\n",
+       "  \"results\": [\n",
+       "    {\n",
+       "      \"id\": 459025737739141235,\n",
+       "      \"distance\": 0.5,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Apache Beam\",\n",
+       "          \"stream processing\",\n",
+       "          \"batch processing\",\n",
+       "          \"data pipelines\",\n",
+       "          \"SDK\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Data Engineering\",\n",
+       "          \"Open Source\",\n",
+       "          \"Streaming\",\n",
+       "          \"Batch\",\n",
+       "          \"Big Data\"\n",
+       "        ],\n",
+       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n",
+       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141237,\n",
+       "      \"distance\": 0.3667,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Google Cloud\",\n",
+       "          \"Dataflow\",\n",
+       "          \"Apache Beam\",\n",
+       "          \"serverless\",\n",
+       "          \"stream and batch\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Cloud Computing\",\n",
+       "          \"Data Pipelines\",\n",
+       "          \"Google Cloud\",\n",
+       "          \"Serverless\",\n",
+       "          \"Enterprise\"\n",
+       "        ],\n",
+       "        \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n",
+       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n",
+       "      }\n",
+       "    }\n",
+       "  ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"query\"\u001b[0m: \u001b[32m\"real-time data processing system\"\u001b[0m,\n", + " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", + " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. 
Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141237\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3667\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. 
Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with beam.Pipeline() as p:\n", + " _ = (\n", + " p\n", + " | \"Create\" >> beam.Create(chunks)\n", + " | \"Enrich W/ Milvus Keyword Search\" >> Enrichment(milvus_handler)\n", + " | \"Format and Print Results\" >> FormatAndPrintResults()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "58753d47-5e63-49ef-8d95-f9acd94b8c0e", + "metadata": {}, + "source": [ + "### Filtered Search (Metadata Filtering)" + ] + }, + { + "cell_type": "markdown", + "id": "cb72f9c6-5a29-4810-9768-574aa7ea5128", + "metadata": {}, + "source": [ + "#### Searching for Apache Beam" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "6e79ef5c-a121-4e69-9089-0991821f8745", + 
"metadata": {}, + "outputs": [], + "source": [ + "query = encode_embedding(\"what is beam?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "ebbcbbe8-f63d-4ff4-9160-719a0fbe9b06", + "metadata": {}, + "outputs": [], + "source": [ + "vector_search_parameters = VectorSearchParameters(\n", + " filter=\"ARRAY_CONTAINS(keywords, 'data pipelines')\",\n", + " limit=10,\n", + " anns_field=\"vector\")" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "5314c531-14bb-4d81-92a5-fcf9cca7fa81", + "metadata": {}, + "outputs": [], + "source": [ + "search_parameters = MilvusSearchParameters(\n", + " collection_name=collection_name,\n", + " search_strategy=VectorSearchParameters(filter=\"ARRAY_CONTAINS(keywords, 'data pipelines')\",limit=10,anns_field=\"vector\"),\n", + " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "0ecf2ac6-cf90-4ce7-b17f-113af90ab950", + "metadata": {}, + "outputs": [], + "source": [ + "collection_load_parameters = MilvusCollectionLoadParameters() " + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "0cd92b69-b9dc-445c-9bd7-21bb3ceb0fd3", + "metadata": {}, + "outputs": [], + "source": [ + "milvus_handler = MilvusSearchEnrichmentHandler(\n", + " connection_parameters=milvus_connection_parameters,\n", + " search_parameters=search_parameters,\n", + " collection_load_parameters=collection_load_parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "b06ecf64-c314-4c6a-ae1a-4fdf059aeead", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "  \"query\": null,\n",
+       "  \"query_embedding\": 384,\n",
+       "  \"results\": [\n",
+       "    {\n",
+       "      \"id\": 459025737739141235,\n",
+       "      \"distance\": 0.453,\n",
+       "      \"fields\": {\n",
+       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
+       "        \"keywords\": [\n",
+       "          \"Apache Beam\",\n",
+       "          \"stream processing\",\n",
+       "          \"batch processing\",\n",
+       "          \"data pipelines\",\n",
+       "          \"SDK\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Data Engineering\",\n",
+       "          \"Open Source\",\n",
+       "          \"Streaming\",\n",
+       "          \"Batch\",\n",
+       "          \"Big Data\"\n",
+       "        ],\n",
+       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": 459025737739141236,\n",
+       "      \"distance\": 0.4353,\n",
+       "      \"fields\": {\n",
+       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
+       "        \"keywords\": [\n",
+       "          \"Apache Beam\",\n",
+       "          \"stream processing\",\n",
+       "          \"batch processing\",\n",
+       "          \"data pipelines\",\n",
+       "          \"SDK\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"Data Engineering\",\n",
+       "          \"Open Source\",\n",
+       "          \"Streaming\",\n",
+       "          \"Batch\",\n",
+       "          \"Big Data\"\n",
+       "        ],\n",
+       "        \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n",
+       "      }\n",
+       "    }\n",
+       "  ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"query\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n", + " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", + " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. 
Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141236\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. 
The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with beam.Pipeline() as p:\n", + " _ = (\n", + " p\n", + " | \"Create\" >> beam.Create([Chunk(content=Content(),embedding=Embedding(dense_embedding=query))])\n", + " | \"Enrich W/ Milvus Vector Search\" >> Enrichment(milvus_handler)\n", + " | \"Format and Print Results\" >> FormatAndPrintResults())" + ] + }, + { + "cell_type": "markdown", + "id": "3e61bcf4-96e7-47dd-bb37-4788e99a2b89", + "metadata": {}, + "source": [ + "#### Searching for Google Beam" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "a8077395-c374-400f-abdc-fe6630eab8a4", + "metadata": {}, + "outputs": [], + "source": [ + "query = encode_embedding(\"what is beam?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "3b712779-f283-4e37-88ed-d6b65c6c45d2", + "metadata": {}, + "outputs": [], + "source": [ + "search_parameters = MilvusSearchParameters(\n", + " collection_name=collection_name,\n", + " 
search_strategy=VectorSearchParameters(filter=\"ARRAY_CONTAINS(tags, 'Remote Work')\",limit=10,anns_field=\"vector\"),\n", + " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "7f0924a3-8832-4138-a599-d3aef648b962", + "metadata": {}, + "outputs": [], + "source": [ + "collection_load_parameters = MilvusCollectionLoadParameters() " + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "516ecbf0-9bb0-4177-829b-b79300b29bbe", + "metadata": {}, + "outputs": [], + "source": [ + "milvus_handler = MilvusSearchEnrichmentHandler(\n", + " connection_parameters=milvus_connection_parameters,\n", + " search_parameters=search_parameters,\n", + " collection_load_parameters=collection_load_parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "db32dda5-0668-4162-80ea-b6a0c2a79063", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "  \"query\": null,\n",
+       "  \"query_embedding\": 384,\n",
+       "  \"results\": [\n",
+       "    {\n",
+       "      \"id\": 459025737739141239,\n",
+       "      \"distance\": 0.3927,\n",
+       "      \"fields\": {\n",
+       "        \"keywords\": [\n",
+       "          \"Google Beam\",\n",
+       "          \"Project Starline\",\n",
+       "          \"3D video\",\n",
+       "          \"AI communication\",\n",
+       "          \"real-time meetings\"\n",
+       "        ],\n",
+       "        \"tags\": [\n",
+       "          \"AI\",\n",
+       "          \"Communication\",\n",
+       "          \"3D Technology\",\n",
+       "          \"Remote Work\",\n",
+       "          \"Enterprise Tech\"\n",
+       "        ],\n",
+       "        \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n",
+       "        \"title\": \"Google Beam: 3D Communication Powered by AI\"\n",
+       "      }\n",
+       "    }\n",
+       "  ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"query\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n", + " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", + " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141239\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam\"\u001b[0m,\n", + " \u001b[32m\"Project Starline\"\u001b[0m,\n", + " \u001b[32m\"3D video\"\u001b[0m,\n", + " \u001b[32m\"AI communication\"\u001b[0m,\n", + " \u001b[32m\"real-time meetings\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"AI\"\u001b[0m,\n", + " \u001b[32m\"Communication\"\u001b[0m,\n", + " \u001b[32m\"3D Technology\"\u001b[0m,\n", + " \u001b[32m\"Remote Work\"\u001b[0m,\n", + " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. 
Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with beam.Pipeline() as p:\n", + " _ = (\n", + " p\n", + " | \"Create\" >> beam.Create([Chunk(content=Content(),embedding=Embedding(dense_embedding=query))])\n", + " | \"Enrich W/ Milvus Vector Search\" >> Enrichment(milvus_handler)\n", + " | \"Format and Print Results\" >> FormatAndPrintResults())" + ] + }, + { + "cell_type": "markdown", + "id": "c2670682-24bf-45b6-9593-bed0e3b1cee2", + "metadata": {}, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "953e61f4-5188-45a6-b30b-d581f7471d17", + "metadata": {}, + "outputs": [], + "source": [ + "client.release_collection(collection_name=collection_name)\n", + "client.drop_collection(collection_name=collection_name)\n", + "MilvusEnrichmentTestHelper.stop_db_container(db)\n", + "db = None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdb361ae-99e7-41a3-9f95-9021175041e7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.22" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py index acee633b6f67..b84242e1eb12 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py @@ -116,3 +116,75 @@ def enrichment_with_vertex_ai_legacy(): | "Enrich W/ Vertex AI" >> Enrichment(vertex_ai_handler) | "Print" >> beam.Map(print)) # [END enrichment_with_vertex_ai_legacy] + + +def enrichment_with_milvus(): + # [START enrichment_with_milvus] + import os + import apache_beam as beam + from apache_beam.ml.rag.types import Content + from apache_beam.ml.rag.types import Chunk + from apache_beam.ml.rag.types import Embedding + from apache_beam.transforms.enrichment import Enrichment + from apache_beam.ml.rag.enrichment.milvus_search import ( + MilvusSearchEnrichmentHandler, + MilvusConnectionParameters, + MilvusSearchParameters, + MilvusCollectionLoadParameters, + VectorSearchParameters, + VectorSearchMetrics) + + uri = os.environ.get("MILVUS_VECTOR_DB_URI") + user = os.environ.get("MILVUS_VECTOR_DB_USER") + password = os.environ.get("MILVUS_VECTOR_DB_PASSWORD") + db_id = os.environ.get("MILVUS_VECTOR_DB_ID") + token = os.environ.get("MILVUS_VECTOR_DB_TOKEN") + collection_name = os.environ.get("MILVUS_VECTOR_DB_COLLECTION_NAME") + + data = [ + Chunk( + id="query1", + embedding=Embedding(dense_embedding=[0.1, 0.2, 0.3]), + content=Content()) + ] + + connection_parameters = MilvusConnectionParameters( + uri, user, password, db_id, token) + + # The first condition (language == "en") excludes documents in other + # languages. Initially, this gives us two documents. 
After applying the second + # condition (cost < 50), only the first document returns in search results. + filter_expr = 'metadata["language"] == "en" AND cost < 50' + + search_params = {"metric_type": VectorSearchMetrics.COSINE.value, "nprobe": 1} + + vector_search_params = VectorSearchParameters( + anns_field="dense_embedding_cosine", + limit=3, + filter=filter_expr, + search_params=search_params) + + search_parameters = MilvusSearchParameters( + collection_name=collection_name, + search_strategy=vector_search_params, + output_fields=["id", "content", "domain", "cost", "metadata"], + round_decimal=2) + + # MilvusCollectionLoadParameters is optional and provides fine-grained control + # over how collections are loaded into memory. For simple use cases or when + # getting started, this parameter can be omitted to use default loading + # behavior. Consider using it in resource-constrained environments to optimize + # memory usage and query performance. + collection_load_parameters = MilvusCollectionLoadParameters() + + milvus_search_handler = MilvusSearchEnrichmentHandler( + connection_parameters=connection_parameters, + search_parameters=search_parameters, + collection_load_parameters=collection_load_parameters) + with beam.Pipeline() as p: + _ = ( + p + | "Create" >> beam.Create(data) + | "Enrich W/ Milvus" >> Enrichment(milvus_search_handler) + | "Print" >> beam.Map(print)) + # [END enrichment_with_milvus] diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 8a7cdfbe9263..67e20e4241d9 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -18,16 +18,27 @@ # pytype: skip-file # pylint: disable=line-too-long +import os import unittest from io import StringIO import mock +import pytest # pylint: 
disable=unused-import try: - from apache_beam.examples.snippets.transforms.elementwise.enrichment import enrichment_with_bigtable, \ - enrichment_with_vertex_ai_legacy - from apache_beam.examples.snippets.transforms.elementwise.enrichment import enrichment_with_vertex_ai + from apache_beam.examples.snippets.transforms.elementwise.enrichment import ( + enrichment_with_bigtable, + enrichment_with_vertex_ai, + enrichment_with_vertex_ai_legacy, + enrichment_with_milvus) + from apache_beam.ml.rag.enrichment.milvus_search import ( + MilvusConnectionParameters) + from apache_beam.ml.rag.enrichment.milvus_search_it_test import ( + MilvusEnrichmentTestHelper, + MilvusDBContainerInfo, + parse_chunk_strings, + assert_chunks_equivalent) from apache_beam.io.requestresponse import RequestResponseIO except ImportError: raise unittest.SkipTest('RequestResponseIO dependencies are not installed') @@ -60,7 +71,15 @@ def validate_enrichment_with_vertex_ai_legacy(): return expected +def validate_enrichment_with_milvus(): + expected = '''[START enrichment_with_milvus] +Chunk(content=Content(text=None), id='query1', index=0, metadata={'enrichment_data': defaultdict(, {'id': [1], 'distance': [1.0], 'fields': [{'content': 'This is a test document', 'cost': 49, 'domain': 'medical', 'id': 1, 'metadata': {'language': 'en'}}]})}, embedding=Embedding(dense_embedding=[0.1, 0.2, 0.3], sparse_embedding=None)) + [END enrichment_with_milvus]'''.splitlines()[1:-1] + return expected + + @mock.patch('sys.stdout', new_callable=StringIO) +@pytest.mark.uses_testcontainer class EnrichmentTest(unittest.TestCase): def test_enrichment_with_bigtable(self, mock_stdout): enrichment_with_bigtable() @@ -83,6 +102,63 @@ def test_enrichment_with_vertex_ai_legacy(self, mock_stdout): self.maxDiff = None self.assertEqual(output, expected) + def test_enrichment_with_milvus(self, mock_stdout): + milvus_db = None + try: + milvus_db = EnrichmentTestHelpers.pre_milvus_enrichment() + enrichment_with_milvus() + output = 
mock_stdout.getvalue().splitlines() + expected = validate_enrichment_with_milvus() + self.maxDiff = None + output = parse_chunk_strings(output) + expected = parse_chunk_strings(expected) + assert_chunks_equivalent(output, expected) + except Exception as e: + self.fail(f"Test failed with unexpected error: {e}") + finally: + if milvus_db: + EnrichmentTestHelpers.post_milvus_enrichment(milvus_db) + + +class EnrichmentTestHelpers: + @staticmethod + def pre_milvus_enrichment() -> MilvusDBContainerInfo: + # Create Milvus db container and make sure it is up and running. + db = MilvusEnrichmentTestHelper.start_db_container() + + # Construct connection parameters. + connection_params = MilvusConnectionParameters( + uri=db.uri, + user=db.user, + password=db.password, + db_id=db.id, + token=db.token) + + # Initialize db with data required for testing. + collection_name = MilvusEnrichmentTestHelper.initialize_db_with_data( + connection_params) + + # Setup environment variables for db and collection configuration. This will + # be used downstream by the milvus enrichment handler. 
+ os.environ['MILVUS_VECTOR_DB_URI'] = db.uri + os.environ['MILVUS_VECTOR_DB_USER'] = db.user + os.environ['MILVUS_VECTOR_DB_PASSWORD'] = db.password + os.environ['MILVUS_VECTOR_DB_ID'] = db.id + os.environ['MILVUS_VECTOR_DB_TOKEN'] = db.token + os.environ['MILVUS_VECTOR_DB_COLLECTION_NAME'] = collection_name + + return db + + @staticmethod + def post_milvus_enrichment(db: MilvusDBContainerInfo): + MilvusEnrichmentTestHelper.stop_db_container(db) + os.environ.pop('MILVUS_VECTOR_DB_URI', None) + os.environ.pop('MILVUS_VECTOR_DB_USER', None) + os.environ.pop('MILVUS_VECTOR_DB_PASSWORD', None) + os.environ.pop('MILVUS_VECTOR_DB_ID', None) + os.environ.pop('MILVUS_VECTOR_DB_TOKEN', None) + os.environ.pop('MILVUS_VECTOR_DB_COLLECTION_NAME', None) + if __name__ == '__main__': unittest.main() diff --git a/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-milvus.md b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-milvus.md new file mode 100644 index 000000000000..ffd06528a9f0 --- /dev/null +++ b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment-milvus.md @@ -0,0 +1,67 @@ +--- +title: "Enrichment with Milvus" +--- + + +# Use Milvus to enrich data + +{{< localstorage language language-py >}} + + + + + +
+ + {{< button-pydoc path="apache_beam.ml.rag.enrichment.milvus_search" class="MilvusSearchEnrichmentHandler" >}} + +
+ +In Apache Beam 2.67.0 and later versions, the enrichment transform includes +a built-in enrichment handler for +[Milvus](https://milvus.io/). +The following example demonstrates how to create a pipeline that use the enrichment transform with the [`MilvusSearchEnrichmentHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.rag.enrichment.milvus_search.html#apache_beam.ml.rag.enrichment.milvus_search.MilvusSearchEnrichmentHandler) handler. + +The data in the Milvus instance collection `docs_catalog` follows this format: + +{{< table >}} +| id | content | domain | cost | metadata | dense_embedding | sparse_embedding | +|:--:|:-------:|:------:|:----:|:--------:|:--------------:|:----------------:| +| 1 | This is a test document | medical | 49 | {"language": "en"} | [0.1, 0.2, 0.3] | [auto-generated by Milvus] | +| 2 | Another test document | legal | 75 | {"language": "en"} | [0.2, 0.3, 0.4] | [auto-generated by Milvus] | +| 3 | وثيقة اختبار | financial | 149 | {"language": "ar"} | [0.3, 0.4, 0.5] | [auto-generated by Milvus] | +{{< /table >}} + + +{{< highlight language="py" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py" enrichment_with_milvus >}} +{{}} + +{{< paragraph class="notebook-skip" >}} +Output: +{{< /paragraph >}} +{{< highlight class="notebook-skip" >}} +{{< code_sample "sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py" enrichment_with_milvus >}} +{{< /highlight >}} + +## Notebook exmaple + + + Open In Colab + + +## Related transforms + +Not applicable. 
+ +{{< button-pydoc path="apache_beam.ml.rag.enrichment.milvus_search" class="MilvusSearchEnrichmentHandler" >}} diff --git a/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md index 6c05b6b515a4..c30ea70468d8 100644 --- a/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md +++ b/website/www/site/content/en/documentation/transforms/python/elementwise/enrichment.md @@ -42,6 +42,7 @@ The following examples demonstrate how to create a pipeline that use the enrichm | Service | Example | |:-----------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Cloud Bigtable | [Enrichment with Bigtable](/documentation/transforms/python/elementwise/enrichment-bigtable/#example) | +| Milvus | [Enrichment with Milvus](/documentation/transforms/python/elementwise/enrichment-milvus/#example) | | Vertex AI Feature Store | [Enrichment with Vertex AI Feature Store](/documentation/transforms/python/elementwise/enrichment-vertexai/#example-1-enrichment-with-vertex-ai-feature-store) | | Vertex AI Feature Store (Legacy) | [Enrichment with Legacy Vertex AI Feature Store](/documentation/transforms/python/elementwise/enrichment-vertexai/#example-2-enrichment-with-vertex-ai-feature-store-legacy) | {{< /table >}} diff --git a/website/www/site/layouts/partials/section-menu/en/documentation.html b/website/www/site/layouts/partials/section-menu/en/documentation.html index 6b37450786f9..3285f5fff83c 100755 --- a/website/www/site/layouts/partials/section-menu/en/documentation.html +++ b/website/www/site/layouts/partials/section-menu/en/documentation.html @@ -297,6 +297,7 @@ From d48246422e1c94d1b7778ddfd1faad40025355ea Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sat, 16 Aug 2025 09:41:57 
+0000 Subject: [PATCH 02/25] examples: update jupyter notebook example --- .../beam-ml/milvus_enrichment_transform.ipynb | 562 ++++++++---------- 1 file changed, 256 insertions(+), 306 deletions(-) diff --git a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb index a6ea23b9492f..4e0648c18c32 100644 --- a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb +++ b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 102, "id": "47053bac", "metadata": {}, "outputs": [], @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 103, "id": "e550cd55-e91e-4d43-b1bd-b0e89bb8cbd9", "metadata": {}, "outputs": [], @@ -80,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 104, "id": "31747c45-107a-49be-8885-5a6cc9dc1236", "metadata": {}, "outputs": [ @@ -88,9 +88,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", - "\u001b[0m" + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[33mWARNING: apache-beam 2.67.0 does not provide the extra 'milvus'\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> 
\u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } ], @@ -98,12 +102,12 @@ "# The Apache Beam test dependencies are included here for the TestContainers\n", "# Milvus standalone DB container that will be used later in the demo.\n", "!pip install rich sentence_transformers llama_index --quiet\n", - "!pip install apache_beam[interactive,test]>=2.67.0 --quiet" + "!pip install apache_beam[milvus,gcp,test,interactive]>=2.67.0 --quiet" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 105, "id": "666e0c2b-0341-4b0e-8d73-561abc39bb10", "metadata": {}, "outputs": [], @@ -168,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 106, "id": "38781cf5-e18f-40f5-827e-2d441ae7d2fa", "metadata": {}, "outputs": [], @@ -261,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 107, "id": "489e93b6-de41-4ec3-be33-a15c3cba12e8", "metadata": {}, "outputs": [ @@ -338,7 +342,7 @@ "max 312.000000" ] }, - "execution_count": 6, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -353,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 108, "id": "eb32aad0-febd-45af-b4bd-e2176b07e2dc", "metadata": {}, "outputs": [ @@ -400,7 +404,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 109, "id": "e7e45d70-0c23-409d-b435-b9479245c1ff", "metadata": {}, "outputs": [], @@ -417,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 110, "id": "5a013b08-d7e7-4367-ad49-43ad1320158f", "metadata": {}, "outputs": [], @@ -432,7 +436,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 111, "id": "2d5ea747-40b3-474e-ac36-ccb81256a36c", "metadata": {}, "outputs": [], @@ -442,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 11, + 
"execution_count": 112, "id": "9917cefb-6271-4285-a75d-a6d1bfcbfd06", "metadata": {}, "outputs": [ @@ -498,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 113, "id": "aa55928d-c6ca-47c5-883d-d14eb0aa1298", "metadata": {}, "outputs": [], @@ -511,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 114, "id": "26e80afa-b9dc-4778-8301-ce38264d58cd", "metadata": {}, "outputs": [], @@ -522,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 115, "id": "68e04606-ca81-4a1f-81d2-964495295ed3", "metadata": {}, "outputs": [], @@ -533,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 116, "id": "43c55049-fbd9-4a1c-ae74-c12b5f5a03ee", "metadata": {}, "outputs": [], @@ -554,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 117, "id": "3ec7c739-6adc-4591-b5b2-9e60d7783c3c", "metadata": {}, "outputs": [], @@ -572,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 118, "id": "541794c7-f9a6-4d42-a522-8f4a3d1b1dfa", "metadata": {}, "outputs": [], @@ -597,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 119, "id": "6f2ebedc-7d72-4deb-838c-42b8f103ceb4", "metadata": {}, "outputs": [], @@ -631,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 120, "id": "5ae9bc82-9ad7-46dd-b254-19cbdcdd0e07", "metadata": {}, "outputs": [], @@ -641,31 +645,10 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 121, "id": "aff7b261-3330-4fa9-9a54-3fd87b42521f", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Pulling image testcontainers/ryuk:0.8.1\n", - "Container started: aa9a64365154\n", - "Waiting for container with image testcontainers/ryuk:0.8.1 to be ready ...\n", - "Pulling image milvusdb/milvus:v2.5.10\n", - "Container started: 74649e2c3f75\n", - "Waiting for 
container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n" - ] - } - ], + "outputs": [], "source": [ "if not db:\n", " db = MilvusEnrichmentTestHelper.start_db_container()" @@ -673,7 +656,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 122, "id": "31496ee0-75a2-48ad-954e-9c4ae5abbf5e", "metadata": {}, "outputs": [], @@ -683,7 +666,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 123, "id": "82627714-2425-4058-9b47-d262f015caf7", "metadata": {}, "outputs": [], @@ -693,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 124, "id": "e8a85f51-5d5f-4533-bf0f-ec825e613dc2", "metadata": {}, "outputs": [ @@ -703,7 +686,7 @@ "'2.5.10'" ] }, - "execution_count": 23, + "execution_count": 124, "metadata": {}, "output_type": "execute_result" } @@ -730,7 +713,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 125, "id": "c014af94-1bb7-44e4-842c-1039f4a2a11d", "metadata": {}, "outputs": [], @@ -758,7 +741,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 126, "id": "54fb3428-b007-4804-9d79-b3933d3256c5", "metadata": {}, "outputs": [], @@ -774,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 127, "id": "4c2f123a-5949-4974-af48-a5db5b168c11", "metadata": {}, "outputs": [ @@ 
-784,7 +767,7 @@ "{'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': , 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': , 'params': {'dim': 384}}, {'name': 'sparse_vector', 'description': '', 'type': , 'is_function_output': True}, {'name': 'title', 'description': '', 'type': , 'params': {'max_length': 256}}, {'name': 'content', 'description': '', 'type': , 'params': {'max_length': 65279}}, {'name': 'combined_text', 'description': '', 'type': , 'params': {'max_length': 65535, 'enable_analyzer': True}}, {'name': 'doc_id', 'description': '', 'type': , 'params': {'max_length': 100}}, {'name': 'keywords', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 64}, 'element_type': }, {'name': 'tags', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 32}, 'element_type': }], 'enable_dynamic_field': False, 'functions': [{'name': 'content_bm25_emb', 'description': '', 'type': , 'input_field_names': ['combined_text'], 'output_field_names': ['sparse_vector'], 'params': {}}]}" ] }, - "execution_count": 26, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } @@ -804,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 128, "id": "671f4352-2086-4428-83be-0de48926682d", "metadata": {}, "outputs": [], @@ -822,7 +805,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 129, "id": "aa8baae5-7c38-4e78-ace4-304c7dc6b127", "metadata": {}, "outputs": [], @@ -845,7 +828,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 130, "id": "d970a35b-f9b2-4f8f-93ef-8de5c83c31b5", "metadata": {}, "outputs": [], @@ -860,7 +843,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 131, "id": "0d45a6ad-2009-4e30-b38d-73266da98a06", "metadata": {}, "outputs": [ @@ -871,7 +854,7 @@ " {'field_name': 'sparse_vector', 'index_type': 'SPARSE_INVERTED_INDEX', 
'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]" ] }, - "execution_count": 30, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } @@ -890,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 132, "id": "51dd4423-240c-4271-bb8c-6270f399a25c", "metadata": {}, "outputs": [], @@ -900,7 +883,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 133, "id": "9620b1f2-51fa-491c-ad3f-f0676b9b25f6", "metadata": {}, "outputs": [], @@ -910,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 134, "id": "e6cf3a1d-265c-44db-aba8-d491fab290d5", "metadata": {}, "outputs": [], @@ -920,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 135, "id": "94497411-43d3-4300-98b3-1cb33759738e", "metadata": {}, "outputs": [ @@ -930,7 +913,7 @@ "True" ] }, - "execution_count": 34, + "execution_count": 135, "metadata": {}, "output_type": "execute_result" } @@ -957,7 +940,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 136, "id": "20fd6f92-277f-42a3-b0a1-d9e9cb030caa", "metadata": {}, "outputs": [], @@ -977,17 +960,17 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 137, "id": "178e59dd-d9aa-4948-a02b-f57ee919f0ff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'insert_count': 5, 'ids': [459025737739141235, 459025737739141236, 459025737739141237, 459025737739141238, 459025737739141239], 'cost': 0}" + "{'insert_count': 5, 'ids': [460150998305603791, 460150998305603792, 460150998305603793, 460150998305603794, 460150998305603795], 'cost': 0}" ] }, - "execution_count": 36, + "execution_count": 137, "metadata": {}, "output_type": "execute_result" } @@ -1006,7 +989,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 138, "id": "b01b111e-41f2-4d9f-b7f5-4fc42305fbe0", "metadata": {}, 
"outputs": [ @@ -1014,15 +997,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'id': 459025737739141235, 'distance': 0.5704954862594604, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': 'Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.', 'doc_id': '1_1', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", + "{'id': 460150998305603791, 'distance': 0.5704954862594604, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': 'Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.', 'doc_id': '1_1', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", "---\n", - "{'id': 459025737739141236, 'distance': 0.43758389353752136, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\", 'doc_id': '1_2', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", + "{'id': 460150998305603792, 'distance': 0.43758389353752136, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. 
The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\", 'doc_id': '1_2', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", "---\n", - "{'id': 459025737739141238, 'distance': 0.36327481269836426, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.', 'doc_id': '2_2', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", + "{'id': 460150998305603794, 'distance': 0.36327481269836426, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.', 'doc_id': '2_2', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", "---\n", - "{'id': 459025737739141239, 'distance': 0.34582412242889404, 'entity': {'title': 'Google Beam: 3D Communication Powered by AI', 'content': 'Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.', 'doc_id': '3_1', 'keywords': ['Google Beam', 'Project Starline', '3D video', 'AI communication', 'real-time meetings'], 'tags': ['AI', 'Communication', '3D Technology', 'Remote Work', 'Enterprise Tech']}}\n", + "{'id': 460150998305603795, 'distance': 0.34582412242889404, 'entity': {'title': 'Google Beam: 3D Communication Powered by AI', 'content': 'Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. 
Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.', 'doc_id': '3_1', 'keywords': ['Google Beam', 'Project Starline', '3D video', 'AI communication', 'real-time meetings'], 'tags': ['AI', 'Communication', '3D Technology', 'Remote Work', 'Enterprise Tech']}}\n", "---\n", - "{'id': 459025737739141237, 'distance': 0.2492937296628952, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.', 'doc_id': '2_1', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", + "{'id': 460150998305603793, 'distance': 0.2492937296628952, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.', 'doc_id': '2_1', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", "---\n" ] } @@ -1063,7 +1046,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 139, "id": "4911e8cc-10f1-4d21-9251-1b756b61f2c1", "metadata": {}, "outputs": [], @@ -1113,7 +1096,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 140, "id": "dcbed23b-1fc2-4f89-a6d0-e05c15d5e655", "metadata": { "scrolled": true @@ -1122,10 +1105,10 @@ { "data": { "text/plain": [ - "MilvusConnectionParameters(uri='http://localhost:55825', user='', password='', db_id='default', token='', timeout=None, kwargs={})" + "MilvusConnectionParameters(uri='http://localhost:33623', user='', password='', db_id='default', token='', timeout=None, kwargs={})" ] }, - "execution_count": 39, + "execution_count": 140, "metadata": {}, "output_type": "execute_result" } @@ -1144,7 +1127,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 141, "id": "74db1238-0a04-4e08-818d-5bce8f09006b", "metadata": {}, "outputs": [], @@ -1154,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 142, "id": "79e16531-8bec-4b4b-9ed3-cebd705480e0", "metadata": {}, "outputs": [], @@ -1167,7 +1150,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 143, "id": "cbef1911-6464-4ba1-8974-ed00896c7e8b", "metadata": {}, "outputs": [], @@ -1177,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 144, "id": "f0481286-3f2b-4690-a2f6-a5a00de3ff34", "metadata": {}, "outputs": [], @@ -1190,41 +1173,10 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 145, "id": 
"35ee37f2-60cd-4d5d-aef6-aed4fda79161", "metadata": {}, "outputs": [ - { - "data": { - "application/javascript": [ - "\n", - " if (typeof window.interactive_beam_jquery == 'undefined') {\n", - " var jqueryScript = document.createElement('script');\n", - " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", - " jqueryScript.type = 'text/javascript';\n", - " jqueryScript.onload = function() {\n", - " var datatableScript = document.createElement('script');\n", - " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", - " datatableScript.type = 'text/javascript';\n", - " datatableScript.onload = function() {\n", - " window.interactive_beam_jquery = jQuery.noConflict(true);\n", - " window.interactive_beam_jquery(document).ready(function($){\n", - " \n", - " });\n", - " }\n", - " document.head.appendChild(datatableScript);\n", - " };\n", - " document.head.appendChild(jqueryScript);\n", - " } else {\n", - " window.interactive_beam_jquery(document).ready(function($){\n", - " \n", - " });\n", - " }" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1233,16 +1185,9 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 459025737739141235,\n", + " \"id\": 460150998305603791,\n", " \"distance\": 0.453,\n", " \"fields\": {\n", - " \"keywords\": [\n", - " \"Apache Beam\",\n", - " \"stream processing\",\n", - " \"batch processing\",\n", - " \"data pipelines\",\n", - " \"SDK\"\n", - " ],\n", " \"tags\": [\n", " \"Data Engineering\",\n", " \"Open Source\",\n", @@ -1251,20 +1196,20 @@ " \"Big Data\"\n", " ],\n", " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. 
Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n", - " }\n", - " },\n", - " {\n", - " \"id\": 459025737739141236,\n", - " \"distance\": 0.4353,\n", - " \"fields\": {\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", " \"stream processing\",\n", " \"batch processing\",\n", " \"data pipelines\",\n", " \"SDK\"\n", - " ],\n", + " ]\n", + " }\n", + " },\n", + " {\n", + " \"id\": 460150998305603792,\n", + " \"distance\": 0.4353,\n", + " \"fields\": {\n", " \"tags\": [\n", " \"Data Engineering\",\n", " \"Open Source\",\n", @@ -1273,20 +1218,20 @@ " \"Big Data\"\n", " ],\n", " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. 
The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", + " \"keywords\": [\n", + " \"Apache Beam\",\n", + " \"stream processing\",\n", + " \"batch processing\",\n", + " \"data pipelines\",\n", + " \"SDK\"\n", + " ]\n", " }\n", " },\n", " {\n", - " \"id\": 459025737739141239,\n", + " \"id\": 460150998305603795,\n", " \"distance\": 0.3927,\n", " \"fields\": {\n", - " \"keywords\": [\n", - " \"Google Beam\",\n", - " \"Project Starline\",\n", - " \"3D video\",\n", - " \"AI communication\",\n", - " \"real-time meetings\"\n", - " ],\n", " \"tags\": [\n", " \"AI\",\n", " \"Communication\",\n", @@ -1295,20 +1240,20 @@ " \"Enterprise Tech\"\n", " ],\n", " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n", - " \"title\": \"Google Beam: 3D Communication Powered by AI\"\n", + " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", + " \"keywords\": [\n", + " \"Google Beam\",\n", + " \"Project Starline\",\n", + " \"3D video\",\n", + " \"AI communication\",\n", + " \"real-time meetings\"\n", + " ]\n", " }\n", " },\n", " {\n", - " \"id\": 459025737739141238,\n", + " \"id\": 460150998305603794,\n", " \"distance\": 0.2925,\n", " \"fields\": {\n", - " \"keywords\": [\n", - " \"Google Cloud\",\n", - " \"Dataflow\",\n", - " \"Apache Beam\",\n", - " \"serverless\",\n", - " \"stream and batch\"\n", - " ],\n", " \"tags\": [\n", " \"Cloud Computing\",\n", " \"Data Pipelines\",\n", @@ -1317,20 +1262,20 @@ " \"Enterprise\"\n", " ],\n", " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. 
Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n", - " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n", - " }\n", - " },\n", - " {\n", - " \"id\": 459025737739141237,\n", - " \"distance\": 0.2342,\n", - " \"fields\": {\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", " \"Dataflow\",\n", " \"Apache Beam\",\n", " \"serverless\",\n", " \"stream and batch\"\n", - " ],\n", + " ]\n", + " }\n", + " },\n", + " {\n", + " \"id\": 460150998305603793,\n", + " \"distance\": 0.2342,\n", + " \"fields\": {\n", " \"tags\": [\n", " \"Cloud Computing\",\n", " \"Data Pipelines\",\n", @@ -1339,7 +1284,14 @@ " \"Enterprise\"\n", " ],\n", " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. 
It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", - " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", + " \"keywords\": [\n", + " \"Google Cloud\",\n", + " \"Dataflow\",\n", + " \"Apache Beam\",\n", + " \"serverless\",\n", + " \"stream and batch\"\n", + " ]\n", " }\n", " }\n", " ]\n", @@ -1352,16 +1304,9 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"stream processing\"\u001b[0m,\n", - " \u001b[32m\"batch processing\"\u001b[0m,\n", - " \u001b[32m\"data pipelines\"\u001b[0m,\n", - " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Data Engineering\"\u001b[0m,\n", " \u001b[32m\"Open Source\"\u001b[0m,\n", @@ -1370,20 +1315,20 @@ " \u001b[32m\"Big Data\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. 
Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141236\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", - " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", " \u001b[32m\"stream processing\"\u001b[0m,\n", " \u001b[32m\"batch processing\"\u001b[0m,\n", " \u001b[32m\"data pipelines\"\u001b[0m,\n", " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603792\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Data Engineering\"\u001b[0m,\n", " \u001b[32m\"Open Source\"\u001b[0m,\n", @@ -1392,20 +1337,20 @@ " \u001b[32m\"Big Data\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. 
This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141239\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603795\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Beam\"\u001b[0m,\n", - " \u001b[32m\"Project Starline\"\u001b[0m,\n", - " \u001b[32m\"3D video\"\u001b[0m,\n", - " \u001b[32m\"AI communication\"\u001b[0m,\n", - " \u001b[32m\"real-time meetings\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"AI\"\u001b[0m,\n", " \u001b[32m\"Communication\"\u001b[0m,\n", @@ 
-1414,20 +1359,20 @@ " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam\"\u001b[0m,\n", + " \u001b[32m\"Project Starline\"\u001b[0m,\n", + " \u001b[32m\"3D video\"\u001b[0m,\n", + " \u001b[32m\"AI communication\"\u001b[0m,\n", + " \u001b[32m\"real-time meetings\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141238\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603794\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2925\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Dataflow\"\u001b[0m,\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"serverless\"\u001b[0m,\n", - " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", @@ -1436,20 +1381,20 @@ " \u001b[32m\"Enterprise\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. 
In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141237\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2342\u001b[0m,\n", - " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", " \u001b[32m\"Dataflow\"\u001b[0m,\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", " \u001b[32m\"serverless\"\u001b[0m,\n", " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603793\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2342\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", @@ -1458,7 +1403,14 @@ " \u001b[32m\"Enterprise\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. 
Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -1488,7 +1440,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 146, "id": "f159ad87-5153-48bb-87b3-3845d3c76420", "metadata": {}, "outputs": [], @@ -1498,7 +1450,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 
147, "id": "8b8cad3e-8a18-464b-8de6-aa4515a653c5", "metadata": {}, "outputs": [], @@ -1511,7 +1463,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 148, "id": "47cfc650-0b34-4333-9321-19be2e8fdc85", "metadata": {}, "outputs": [], @@ -1521,7 +1473,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 149, "id": "4754763b-66bf-4f90-9920-28cef223b536", "metadata": {}, "outputs": [], @@ -1534,7 +1486,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 150, "id": "a3db4837-01c7-42d7-b4e8-58d8d361fe93", "metadata": {}, "outputs": [ @@ -1546,9 +1498,10 @@ " \"query_embedding\": null,\n", " \"results\": [\n", " {\n", - " \"id\": 459025737739141236,\n", + " \"id\": 460150998305603792,\n", " \"distance\": 0.5657,\n", " \"fields\": {\n", + " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", @@ -1563,14 +1516,14 @@ " \"Streaming\",\n", " \"Batch\",\n", " \"Big Data\"\n", - " ],\n", - " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n", + " ]\n", " }\n", " },\n", " {\n", - " \"id\": 459025737739141239,\n", + " \"id\": 460150998305603795,\n", " \"distance\": 0.5471,\n", " \"fields\": {\n", + " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n", " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", " \"keywords\": [\n", " \"Google Beam\",\n", @@ -1585,14 +1538,14 @@ " \"3D Technology\",\n", " \"Remote Work\",\n", " \"Enterprise Tech\"\n", - " ],\n", - " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. 
Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\n", + " ]\n", " }\n", " },\n", " {\n", - " \"id\": 459025737739141235,\n", + " \"id\": 460150998305603791,\n", " \"distance\": 0.53,\n", " \"fields\": {\n", + " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", @@ -1607,14 +1560,14 @@ " \"Streaming\",\n", " \"Batch\",\n", " \"Big Data\"\n", - " ],\n", - " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n", + " ]\n", " }\n", " },\n", " {\n", - " \"id\": 459025737739141237,\n", + " \"id\": 460150998305603793,\n", " \"distance\": 0.5055,\n", " \"fields\": {\n", + " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. 
It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", @@ -1629,14 +1582,14 @@ " \"Google Cloud\",\n", " \"Serverless\",\n", " \"Enterprise\"\n", - " ],\n", - " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. 
Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\n", + " ]\n", " }\n", " },\n", " {\n", - " \"id\": 459025737739141238,\n", + " \"id\": 460150998305603794,\n", " \"distance\": 0.134,\n", " \"fields\": {\n", + " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n", " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", @@ -1651,8 +1604,7 @@ " \"Google Cloud\",\n", " \"Serverless\",\n", " \"Enterprise\"\n", - " ],\n", - " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\n", + " ]\n", " }\n", " }\n", " ]\n", @@ -1665,9 +1617,10 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141236\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603792\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5657\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. 
This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", @@ -1682,14 +1635,14 @@ " \u001b[32m\"Streaming\"\u001b[0m,\n", " \u001b[32m\"Batch\"\u001b[0m,\n", " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141239\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603795\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5471\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Beam\"\u001b[0m,\n", @@ -1704,14 +1657,14 @@ " \u001b[32m\"3D Technology\"\u001b[0m,\n", " \u001b[32m\"Remote Work\"\u001b[0m,\n", " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.53\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", @@ -1726,14 +1679,14 @@ " \u001b[32m\"Streaming\"\u001b[0m,\n", " \u001b[32m\"Batch\"\u001b[0m,\n", " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141237\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603793\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5055\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", @@ -1748,14 +1701,14 @@ " \u001b[32m\"Google Cloud\"\u001b[0m,\n", " \u001b[32m\"Serverless\"\u001b[0m,\n", " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141238\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603794\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.134\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", @@ -1770,8 +1723,7 @@ " \u001b[32m\"Google Cloud\"\u001b[0m,\n", " \u001b[32m\"Serverless\"\u001b[0m,\n", " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. 
Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -1818,7 +1770,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 151, "id": "172b6c80-2a03-49d0-afc7-12bb0a4dc989", "metadata": {}, "outputs": [], @@ -1829,7 +1781,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 152, "id": "eb6d951c-0def-45cc-84a4-b6f7b7575f23", "metadata": {}, "outputs": [], @@ -1843,7 +1795,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 153, "id": "b339c498-d229-42e6-b439-b29eb107b533", "metadata": {}, "outputs": [], @@ -1856,7 +1808,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 154, "id": "b346abe6-03c9-4b28-a0fb-74936b9f3a06", "metadata": {}, "outputs": [], @@ -1866,7 +1818,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 155, "id": "ab27810d-40a8-4b6a-bc82-441e13763ebc", "metadata": {}, "outputs": [], @@ -1879,7 +1831,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 156, "id": "9a37aa5b-d652-4dd3-9fe0-e277182415b9", "metadata": {}, "outputs": [], @@ -1894,7 +1846,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 157, "id": "ea9d84f7-d142-4afa-9a6f-6c310d9604b0", "metadata": {}, "outputs": [ @@ -1906,16 +1858,9 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 459025737739141235,\n", + " \"id\": 460150998305603791,\n", " \"distance\": 0.5,\n", " \"fields\": {\n", - " \"keywords\": [\n", - " 
\"Apache Beam\",\n", - " \"stream processing\",\n", - " \"batch processing\",\n", - " \"data pipelines\",\n", - " \"SDK\"\n", - " ],\n", " \"tags\": [\n", " \"Data Engineering\",\n", " \"Open Source\",\n", @@ -1924,20 +1869,20 @@ " \"Big Data\"\n", " ],\n", " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", + " \"keywords\": [\n", + " \"Apache Beam\",\n", + " \"stream processing\",\n", + " \"batch processing\",\n", + " \"data pipelines\",\n", + " \"SDK\"\n", + " ]\n", " }\n", " },\n", " {\n", - " \"id\": 459025737739141237,\n", + " \"id\": 460150998305603793,\n", " \"distance\": 0.3667,\n", " \"fields\": {\n", - " \"keywords\": [\n", - " \"Google Cloud\",\n", - " \"Dataflow\",\n", - " \"Apache Beam\",\n", - " \"serverless\",\n", - " \"stream and batch\"\n", - " ],\n", " \"tags\": [\n", " \"Cloud Computing\",\n", " \"Data Pipelines\",\n", @@ -1946,7 +1891,14 @@ " \"Enterprise\"\n", " ],\n", " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. 
Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", - " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", + " \"keywords\": [\n", + " \"Google Cloud\",\n", + " \"Dataflow\",\n", + " \"Apache Beam\",\n", + " \"serverless\",\n", + " \"stream and batch\"\n", + " ]\n", " }\n", " }\n", " ]\n", @@ -1959,16 +1911,9 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"stream processing\"\u001b[0m,\n", - " \u001b[32m\"batch processing\"\u001b[0m,\n", - " \u001b[32m\"data pipelines\"\u001b[0m,\n", - " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Data Engineering\"\u001b[0m,\n", " \u001b[32m\"Open Source\"\u001b[0m,\n", @@ -1977,20 +1922,20 @@ " \u001b[32m\"Big Data\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. 
Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141237\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603793\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3667\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " 
\u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Dataflow\"\u001b[0m,\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"serverless\"\u001b[0m,\n", - " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", @@ -1999,7 +1944,14 @@ " \u001b[32m\"Enterprise\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -2038,7 +1990,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 159, "id": "6e79ef5c-a121-4e69-9089-0991821f8745", "metadata": {}, "outputs": [], @@ -2048,7 +2000,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 160, "id": "ebbcbbe8-f63d-4ff4-9160-719a0fbe9b06", "metadata": {}, "outputs": [], @@ -2061,7 +2013,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 161, "id": "5314c531-14bb-4d81-92a5-fcf9cca7fa81", "metadata": {}, "outputs": [], @@ -2074,7 +2026,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 162, "id": "0ecf2ac6-cf90-4ce7-b17f-113af90ab950", "metadata": {}, "outputs": [], @@ -2084,7 +2036,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 163, "id": "0cd92b69-b9dc-445c-9bd7-21bb3ceb0fd3", "metadata": {}, "outputs": [], @@ -2097,7 +2049,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 164, "id": "b06ecf64-c314-4c6a-ae1a-4fdf059aeead", "metadata": {}, "outputs": [ @@ -2109,7 +2061,7 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 459025737739141235,\n", + " \"id\": 
460150998305603791,\n", " \"distance\": 0.453,\n", " \"fields\": {\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", @@ -2131,7 +2083,7 @@ " }\n", " },\n", " {\n", - " \"id\": 459025737739141236,\n", + " \"id\": 460150998305603792,\n", " \"distance\": 0.4353,\n", " \"fields\": {\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", @@ -2162,7 +2114,7 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141235\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", @@ -2184,7 +2136,7 @@ " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141236\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603792\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", @@ -2232,7 +2184,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 165, "id": "a8077395-c374-400f-abdc-fe6630eab8a4", "metadata": {}, "outputs": [], @@ -2242,7 +2194,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 166, "id": "3b712779-f283-4e37-88ed-d6b65c6c45d2", "metadata": {}, "outputs": [], @@ -2255,7 +2207,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 167, "id": "7f0924a3-8832-4138-a599-d3aef648b962", "metadata": {}, 
"outputs": [], @@ -2265,7 +2217,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 168, "id": "516ecbf0-9bb0-4177-829b-b79300b29bbe", "metadata": {}, "outputs": [], @@ -2278,7 +2230,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 169, "id": "db32dda5-0668-4162-80ea-b6a0c2a79063", "metadata": {}, "outputs": [ @@ -2290,9 +2242,11 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 459025737739141239,\n", + " \"id\": 460150998305603795,\n", " \"distance\": 0.3927,\n", " \"fields\": {\n", + " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\",\n", + " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", " \"keywords\": [\n", " \"Google Beam\",\n", " \"Project Starline\",\n", @@ -2306,9 +2260,7 @@ " \"3D Technology\",\n", " \"Remote Work\",\n", " \"Enterprise Tech\"\n", - " ],\n", - " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\",\n", - " \"title\": \"Google Beam: 3D Communication Powered by AI\"\n", + " ]\n", " }\n", " }\n", " ]\n", @@ -2321,9 +2273,11 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m459025737739141239\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603795\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Beam\"\u001b[0m,\n", " \u001b[32m\"Project Starline\"\u001b[0m,\n", @@ -2337,9 +2291,7 @@ " \u001b[32m\"3D Technology\"\u001b[0m,\n", " \u001b[32m\"Remote Work\"\u001b[0m,\n", " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -2369,13 +2321,11 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 173, "id": "953e61f4-5188-45a6-b30b-d581f7471d17", "metadata": {}, "outputs": [], "source": [ - "client.release_collection(collection_name=collection_name)\n", - "client.drop_collection(collection_name=collection_name)\n", "MilvusEnrichmentTestHelper.stop_db_container(db)\n", "db = None" ] @@ -2383,7 +2333,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fdb361ae-99e7-41a3-9f95-9021175041e7", + "id": "c1c6b76a-4aaa-498d-af24-d5d1e5f7f21f", "metadata": {}, "outputs": [], "source": [] From 104868e086f35ab5b2e944208202672b548b28a6 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sat, 16 Aug 2025 09:42:29 +0000 Subject: [PATCH 03/25] CHANGES.md: add release note --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index db88b8c79807..857f0da8799b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -74,6 +74,8 @@ * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). * BigtableRead Connector for BeamYaml added with new Config Param ([#35696](https://github.com/apache/beam/pull/35696)) +* Added documentation and examples for Milvus enrichment transform (Python) ([#35467](https://github.com/apache/beam/pull/35467)). + Comprehensive documentation and Jupyter notebook examples now available for vector, keyword, and hybrid search operations with Milvus. 
* Introduced a dedicated module for JUnit-based testing support: `sdks/java/testing/junit`, which provides `TestPipelineExtension` for JUnit 5 while maintaining backward compatibility with existing JUnit 4 `TestRule`-based tests (Java) ([#18733](https://github.com/apache/beam/issues/18733), [#35688](https://github.com/apache/beam/pull/35688)). - To use JUnit 5 with Beam tests, add a test-scoped dependency on `org.apache.beam:beam-sdks-java-testing-junit`. From 9951901af3f8b776e5f773893a816f8dbe736fb7 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sat, 16 Aug 2025 10:59:54 +0000 Subject: [PATCH 04/25] sdks/python: update import err exception --- .../snippets/transforms/elementwise/enrichment_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index add37e37cda8..33406c734f6f 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -40,8 +40,8 @@ parse_chunk_strings, assert_chunks_equivalent) from apache_beam.io.requestresponse import RequestResponseIO -except ImportError: - raise unittest.SkipTest('RequestResponseIO dependencies are not installed') +except ImportError as e: + raise unittest.SkipTest(f'Examples dependencies are not installed: {str(e)}') def validate_enrichment_with_bigtable(): From 62c6a5e2e6ce64f51de87886a10a79b13861de2d Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sat, 16 Aug 2025 11:35:59 +0000 Subject: [PATCH 05/25] sdks/python: experiment with setting milvus as extra dependency this way --- sdks/python/setup.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index f7cd2a808a9a..382f7c263741 100644 --- a/sdks/python/setup.py +++ 
b/sdks/python/setup.py @@ -160,6 +160,8 @@ def cythonize(*args, **kwargs): 'pandas>=1.4.3,!=1.5.0,!=1.5.1,<2.3', ] +milvus_dependency = ['pymilvus>=2.5.10,<3.0.0'] + def find_by_ext(root_dir, ext): for root, _, files in os.walk(root_dir): @@ -444,9 +446,8 @@ def get_portability_package_data(): 'mysql-connector-python>=9.3.0', 'python-tds>=1.16.1', 'sqlalchemy-pytds>=1.0.2', - 'oracledb>=3.1.1', - 'milvus' - ], + 'oracledb>=3.1.1' + ] + milvus_dependency, 'gcp': [ 'cachetools>=3.1.0,<7', 'google-api-core>=2.0.0,<3', @@ -589,7 +590,7 @@ def get_portability_package_data(): ], 'xgboost': ['xgboost>=1.6.0,<2.1.3', 'datatable==1.0.0'], 'tensorflow-hub': ['tensorflow-hub>=0.14.0,<0.16.0'], - 'milvus': ['pymilvus>=2.5.10,<3.0.0'] + 'milvus': milvus_dependency }, zip_safe=False, # PyPI package information. From 508a8ad34cbfd347b9314841fe24c3349f61e0c1 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sat, 16 Aug 2025 11:51:27 +0000 Subject: [PATCH 06/25] sdks/python: revert pytest marker to use test containers --- .../ml/rag/enrichment/milvus_search_it_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index 357947ed85a4..adcd0e11fb85 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -469,14 +469,14 @@ def create_user_yaml(service_port: int, max_vector_field_num=5): os.remove(path) -@pytest.mark.require_docker_in_docker -@unittest.skipUnless( - platform.system() == "Linux", - "Test runs only on Linux due to lack of support, as yet, for nested " - "virtualization in CI environments on Windows/macOS. 
Many CI providers run " - "tests in virtualized environments, and nested virtualization " - "(Docker inside a VM) is either unavailable or has several issues on " - "non-Linux platforms.") +@pytest.mark.uses_testcontainer +# @unittest.skipUnless( +# platform.system() == "Linux", +# "Test runs only on Linux due to lack of support, as yet, for nested " +# "virtualization in CI environments on Windows/macOS. Many CI providers run " +# "tests in virtualized environments, and nested virtualization " +# "(Docker inside a VM) is either unavailable or has several issues on " +# "non-Linux platforms.") class TestMilvusSearchEnrichment(unittest.TestCase): """Tests for search functionality across all search strategies""" From bc79236227e906d7bd8d09345389728a54fbf298 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sat, 16 Aug 2025 11:55:44 +0000 Subject: [PATCH 07/25] .github: trigger postcommit python --- .github/trigger_files/beam_PostCommit_Python.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/trigger_files/beam_PostCommit_Python.json b/.github/trigger_files/beam_PostCommit_Python.json index c6ec17f48412..518707c6af4a 100644 --- a/.github/trigger_files/beam_PostCommit_Python.json +++ b/.github/trigger_files/beam_PostCommit_Python.json @@ -1,5 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run.", - "modification": 35 + "modification": 390 } From 435d9e8ebd3ae289a68fe28fb83efb3a16f18927 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Tue, 16 Sep 2025 16:14:59 +0000 Subject: [PATCH 08/25] sdks/python: undo `require_docker_in_docker` pytest marker --- .../ml/rag/enrichment/milvus_search_it_test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index cd576dbdc651..81ceb6b69e71 100644 --- 
a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -469,14 +469,14 @@ def create_user_yaml(service_port: int, max_vector_field_num=5): os.remove(path) -@pytest.mark.uses_testcontainer -# @unittest.skipUnless( -# platform.system() == "Linux", -# "Test runs only on Linux due to lack of support, as yet, for nested " -# "virtualization in CI environments on Windows/macOS. Many CI providers run " -# "tests in virtualized environments, and nested virtualization " -# "(Docker inside a VM) is either unavailable or has several issues on " -# "non-Linux platforms.") +@pytest.mark.require_docker_in_docker +@unittest.skipUnless( + platform.system() == "Linux", + "Test runs only on Linux due to lack of support, as yet, for nested " + "virtualization in CI environments on Windows/macOS. Many CI providers run " + "tests in virtualized environments, and nested virtualization " + "(Docker inside a VM) is either unavailable or has several issues on " + "non-Linux platforms.") class TestMilvusSearchEnrichment(unittest.TestCase): """Tests for search functionality across all search strategies""" From 084687be5512931b05ae8966ccedb65fc5c09246 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Tue, 16 Sep 2025 17:14:27 +0000 Subject: [PATCH 09/25] sdks/python: fix formatting issues --- .../examples/snippets/transforms/elementwise/enrichment.py | 4 ++-- .../snippets/transforms/elementwise/enrichment_test.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py index d9d3ed4b0534..12ec205d2e62 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment.py @@ -156,7 +156,7 @@ def 
enrichment_with_google_cloudsql_pg(): where_clause_template=where_clause_template, where_clause_fields=where_clause_fields) - cloudsql_handler = CloudSQLEnrichmentHandler( + handler = CloudSQLEnrichmentHandler( connection_config=connection_config, table_id=table_id, query_config=query_config) @@ -164,7 +164,7 @@ def enrichment_with_google_cloudsql_pg(): _ = ( p | "Create" >> beam.Create(data) - | "Enrich W/ Google CloudSQL PostgreSQL" >> Enrichment(cloudsql_handler) + | "Enrich W/ Google CloudSQL PostgreSQL" >> Enrichment(handler) | "Print" >> beam.Map(print)) # [END enrichment_with_google_cloudsql_pg] diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 721e6043396a..a0be4fcbf4d6 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -205,7 +205,6 @@ def test_enrichment_with_external_sqlserver(self, mock_stdout): except Exception as e: self.fail(f"Test failed with unexpected error: {e}") - def test_enrichment_with_milvus(self, mock_stdout): milvus_db = None try: @@ -345,10 +344,8 @@ def post_sql_enrichment_test(res: CloudSQLEnrichmentTestDataConstruct): @staticmethod def pre_milvus_enrichment() -> MilvusDBContainerInfo: - # Create Milvus db container and make sure it is up and running. db = MilvusEnrichmentTestHelper.start_db_container() - # Construct connection parameters. connection_params = MilvusConnectionParameters( uri=db.uri, user=db.user, @@ -356,7 +353,6 @@ def pre_milvus_enrichment() -> MilvusDBContainerInfo: db_id=db.id, token=db.token) - # Initialize db with data required for testing. 
collection_name = MilvusEnrichmentTestHelper.initialize_db_with_data( connection_params) From 924282a4cd52f4efd584732dae30d67bc7945e32 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Tue, 16 Sep 2025 17:56:39 +0000 Subject: [PATCH 10/25] python: mark `test_enrichment_with_milvus` with require_docker_in_docker --- .../snippets/transforms/elementwise/enrichment_test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index a0be4fcbf4d6..55d25cafd096 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -205,6 +205,11 @@ def test_enrichment_with_external_sqlserver(self, mock_stdout): except Exception as e: self.fail(f"Test failed with unexpected error: {e}") + + +@mock.patch('sys.stdout', new_callable=StringIO) +@pytest.mark.require_docker_in_docker +class DockerInDockerEnrichmentTest(unittest.TestCase): def test_enrichment_with_milvus(self, mock_stdout): milvus_db = None try: From c687fcf81ed0c0dc2c0fbff71a37f0f043cd1077 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Tue, 16 Sep 2025 19:03:31 +0000 Subject: [PATCH 11/25] sdks/python: test milvus example --- .../transforms/elementwise/enrichment_test.py | 41 ++++++++++--------- .../rag/enrichment/milvus_search_it_test.py | 2 +- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 55d25cafd096..0e6835e7cb6f 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -205,27 +205,18 @@ def 
test_enrichment_with_external_sqlserver(self, mock_stdout): except Exception as e: self.fail(f"Test failed with unexpected error: {e}") - - -@mock.patch('sys.stdout', new_callable=StringIO) -@pytest.mark.require_docker_in_docker -class DockerInDockerEnrichmentTest(unittest.TestCase): def test_enrichment_with_milvus(self, mock_stdout): - milvus_db = None - try: - milvus_db = EnrichmentTestHelpers.pre_milvus_enrichment() - enrichment_with_milvus() - output = mock_stdout.getvalue().splitlines() - expected = validate_enrichment_with_milvus() - self.maxDiff = None - output = parse_chunk_strings(output) - expected = parse_chunk_strings(expected) - assert_chunks_equivalent(output, expected) - except Exception as e: - self.fail(f"Test failed with unexpected error: {e}") - finally: - if milvus_db: - EnrichmentTestHelpers.post_milvus_enrichment(milvus_db) + with EnrichmentTestHelpers.milvus_test_context(): + try: + enrichment_with_milvus() + output = mock_stdout.getvalue().splitlines() + expected = validate_enrichment_with_milvus() + self.maxDiff = None + output = parse_chunk_strings(output) + expected = parse_chunk_strings(expected) + assert_chunks_equivalent(output, expected) + except Exception as e: + self.fail(f"Test failed with unexpected error: {e}") @dataclass @@ -248,6 +239,16 @@ def sql_test_context(is_cloudsql: bool, db_adapter: DatabaseTypeAdapter): if result: EnrichmentTestHelpers.post_sql_enrichment_test(result) + @contextmanager + def milvus_test_context(): + db: Optional[MilvusDBContainerInfo] = None + try: + db = EnrichmentTestHelpers.pre_milvus_enrichment() + yield + finally: + if db: + EnrichmentTestHelpers.post_milvus_enrichment(db) + @staticmethod def pre_sql_enrichment_test( is_cloudsql: bool, diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index 81ceb6b69e71..bcf8520d9c18 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ 
b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -297,7 +297,7 @@ def __init__( class MilvusEnrichmentTestHelper: @staticmethod def start_db_container( - image="milvusdb/milvus:v2.3.9", + image="milvusdb/milvus:v2.5.10", max_vec_fields=5, vector_client_max_retries=3, tc_max_retries=TC_MAX_TRIES) -> Optional[MilvusDBContainerInfo]: From 733c233df973d3a4b15565254265eee69209f1d7 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Tue, 16 Sep 2025 19:08:40 +0000 Subject: [PATCH 12/25] sdks/python: update jupyter notebook example --- .../beam-ml/milvus_enrichment_transform.ipynb | 588 +++++++++--------- 1 file changed, 299 insertions(+), 289 deletions(-) diff --git a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb index 4e0648c18c32..64085437797f 100644 --- a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb +++ b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 102, + "execution_count": 70, "id": "47053bac", "metadata": {}, "outputs": [], @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 71, "id": "e550cd55-e91e-4d43-b1bd-b0e89bb8cbd9", "metadata": {}, "outputs": [], @@ -80,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 72, "id": "31747c45-107a-49be-8885-5a6cc9dc1236", "metadata": {}, "outputs": [ @@ -91,8 +91,7 @@ "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "\u001b[33mWARNING: apache-beam 2.67.0 does not provide the extra 'milvus'\u001b[0m\u001b[33m\n", - 
"\u001b[0m\n", + "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] @@ -107,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 73, "id": "666e0c2b-0341-4b0e-8d73-561abc39bb10", "metadata": {}, "outputs": [], @@ -118,9 +117,6 @@ "from typing import List\n", "\n", "# Third-party imports.\n", - "import apache_beam as beam\n", - "from apache_beam.ml.rag.types import Chunk, Content, Embedding\n", - "from apache_beam.transforms.enrichment import Enrichment\n", "import numpy as np\n", "import pandas as pd\n", "from pymilvus import DataType, CollectionSchema, FieldSchema, Function, FunctionType, MilvusClient, RRFRanker\n", @@ -128,9 +124,13 @@ "from rich import print_json\n", "from sentence_transformers import SentenceTransformer\n", "from torch import cuda\n", + "from llama_index.core.text_splitter import SentenceSplitter\n", "\n", "# Local application imports.\n", - "from llama_index.core.text_splitter import SentenceSplitter\n", + "import apache_beam as beam\n", + "from apache_beam.ml.rag.types import Chunk, Content, Embedding\n", + "from apache_beam.transforms.enrichment import Enrichment\n", + "from apache_beam.ml.rag.enrichment.milvus_search_it_test import MilvusEnrichmentTestHelper\n", "from apache_beam.ml.rag.enrichment.milvus_search import (\n", " HybridSearchParameters, \n", " KeywordSearchMetrics, \n", @@ -141,9 +141,7 @@ " MilvusSearchParameters, \n", " SearchStrategy, \n", " VectorSearchMetrics, \n", - " VectorSearchParameters\n", - ")\n", - "from apache_beam.ml.rag.enrichment.milvus_search_it_test import MilvusEnrichmentTestHelper" + " VectorSearchParameters)" ] }, { @@ -172,7 +170,7 @@ }, { 
"cell_type": "code", - "execution_count": 106, + "execution_count": 74, "id": "38781cf5-e18f-40f5-827e-2d441ae7d2fa", "metadata": {}, "outputs": [], @@ -265,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 75, "id": "489e93b6-de41-4ec3-be33-a15c3cba12e8", "metadata": {}, "outputs": [ @@ -342,7 +340,7 @@ "max 312.000000" ] }, - "execution_count": 107, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } @@ -357,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 76, "id": "eb32aad0-febd-45af-b4bd-e2176b07e2dc", "metadata": {}, "outputs": [ @@ -404,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 77, "id": "e7e45d70-0c23-409d-b435-b9479245c1ff", "metadata": {}, "outputs": [], @@ -421,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 78, "id": "5a013b08-d7e7-4367-ad49-43ad1320158f", "metadata": {}, "outputs": [], @@ -436,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 79, "id": "2d5ea747-40b3-474e-ac36-ccb81256a36c", "metadata": {}, "outputs": [], @@ -446,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 80, "id": "9917cefb-6271-4285-a75d-a6d1bfcbfd06", "metadata": {}, "outputs": [ @@ -502,7 +500,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 81, "id": "aa55928d-c6ca-47c5-883d-d14eb0aa1298", "metadata": {}, "outputs": [], @@ -515,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 82, "id": "26e80afa-b9dc-4778-8301-ce38264d58cd", "metadata": {}, "outputs": [], @@ -526,7 +524,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 83, "id": "68e04606-ca81-4a1f-81d2-964495295ed3", "metadata": {}, "outputs": [], @@ -537,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 84, "id": "43c55049-fbd9-4a1c-ae74-c12b5f5a03ee", 
"metadata": {}, "outputs": [], @@ -558,7 +556,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 85, "id": "3ec7c739-6adc-4591-b5b2-9e60d7783c3c", "metadata": {}, "outputs": [], @@ -576,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 86, "id": "541794c7-f9a6-4d42-a522-8f4a3d1b1dfa", "metadata": {}, "outputs": [], @@ -601,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 87, "id": "6f2ebedc-7d72-4deb-838c-42b8f103ceb4", "metadata": {}, "outputs": [], @@ -635,7 +633,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 88, "id": "5ae9bc82-9ad7-46dd-b254-19cbdcdd0e07", "metadata": {}, "outputs": [], @@ -645,10 +643,39 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 89, "id": "aff7b261-3330-4fa9-9a54-3fd87b42521f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Pulling image milvusdb/milvus:v2.5.10\n", + "INFO:testcontainers.core.container:Pulling image milvusdb/milvus:v2.5.10\n", + "Container started: aefe936ace2d\n", + "INFO:testcontainers.core.container:Container started: aefe936ace2d\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to 
be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", + "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n" + ] + } + ], "source": [ "if not db:\n", " db = MilvusEnrichmentTestHelper.start_db_container()" @@ -656,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 90, "id": "31496ee0-75a2-48ad-954e-9c4ae5abbf5e", "metadata": {}, "outputs": [], @@ -666,7 +693,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 91, "id": "82627714-2425-4058-9b47-d262f015caf7", "metadata": {}, "outputs": [], @@ -676,7 +703,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 92, "id": "e8a85f51-5d5f-4533-bf0f-ec825e613dc2", "metadata": {}, "outputs": [ @@ -686,7 +713,7 @@ "'2.5.10'" ] }, - "execution_count": 124, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } @@ -713,7 +740,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 93, "id": "c014af94-1bb7-44e4-842c-1039f4a2a11d", "metadata": {}, "outputs": [], @@ -741,7 +768,7 @@ }, { "cell_type": "code", - 
"execution_count": 126, + "execution_count": 94, "id": "54fb3428-b007-4804-9d79-b3933d3256c5", "metadata": {}, "outputs": [], @@ -757,7 +784,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 95, "id": "4c2f123a-5949-4974-af48-a5db5b168c11", "metadata": {}, "outputs": [ @@ -767,7 +794,7 @@ "{'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': , 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': , 'params': {'dim': 384}}, {'name': 'sparse_vector', 'description': '', 'type': , 'is_function_output': True}, {'name': 'title', 'description': '', 'type': , 'params': {'max_length': 256}}, {'name': 'content', 'description': '', 'type': , 'params': {'max_length': 65279}}, {'name': 'combined_text', 'description': '', 'type': , 'params': {'max_length': 65535, 'enable_analyzer': True}}, {'name': 'doc_id', 'description': '', 'type': , 'params': {'max_length': 100}}, {'name': 'keywords', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 64}, 'element_type': }, {'name': 'tags', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 32}, 'element_type': }], 'enable_dynamic_field': False, 'functions': [{'name': 'content_bm25_emb', 'description': '', 'type': , 'input_field_names': ['combined_text'], 'output_field_names': ['sparse_vector'], 'params': {}}]}" ] }, - "execution_count": 127, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } @@ -787,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 96, "id": "671f4352-2086-4428-83be-0de48926682d", "metadata": {}, "outputs": [], @@ -805,7 +832,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 97, "id": "aa8baae5-7c38-4e78-ace4-304c7dc6b127", "metadata": {}, "outputs": [], @@ -828,7 +855,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 98, "id": "d970a35b-f9b2-4f8f-93ef-8de5c83c31b5", 
"metadata": {}, "outputs": [], @@ -843,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 99, "id": "0d45a6ad-2009-4e30-b38d-73266da98a06", "metadata": {}, "outputs": [ @@ -854,7 +881,7 @@ " {'field_name': 'sparse_vector', 'index_type': 'SPARSE_INVERTED_INDEX', 'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]" ] }, - "execution_count": 131, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -873,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 100, "id": "51dd4423-240c-4271-bb8c-6270f399a25c", "metadata": {}, "outputs": [], @@ -883,7 +910,7 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 101, "id": "9620b1f2-51fa-491c-ad3f-f0676b9b25f6", "metadata": {}, "outputs": [], @@ -893,7 +920,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 102, "id": "e6cf3a1d-265c-44db-aba8-d491fab290d5", "metadata": {}, "outputs": [], @@ -903,7 +930,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 103, "id": "94497411-43d3-4300-98b3-1cb33759738e", "metadata": {}, "outputs": [ @@ -913,7 +940,7 @@ "True" ] }, - "execution_count": 135, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -940,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 104, "id": "20fd6f92-277f-42a3-b0a1-d9e9cb030caa", "metadata": {}, "outputs": [], @@ -960,17 +987,17 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 105, "id": "178e59dd-d9aa-4948-a02b-f57ee919f0ff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'insert_count': 5, 'ids': [460150998305603791, 460150998305603792, 460150998305603793, 460150998305603794, 460150998305603795], 'cost': 0}" + "{'insert_count': 5, 'ids': [460862423920279605, 460862423920279606, 460862423920279607, 
460862423920279608, 460862423920279609], 'cost': 0}" ] }, - "execution_count": 137, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -989,27 +1016,10 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 106, "id": "b01b111e-41f2-4d9f-b7f5-4fc42305fbe0", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'id': 460150998305603791, 'distance': 0.5704954862594604, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': 'Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.', 'doc_id': '1_1', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", - "---\n", - "{'id': 460150998305603792, 'distance': 0.43758389353752136, 'entity': {'title': 'Apache Beam: Unified Model for Batch and Streaming Data', 'content': \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\", 'doc_id': '1_2', 'keywords': ['Apache Beam', 'stream processing', 'batch processing', 'data pipelines', 'SDK'], 'tags': ['Data Engineering', 'Open Source', 'Streaming', 'Batch', 'Big Data']}}\n", - "---\n", - "{'id': 460150998305603794, 'distance': 0.36327481269836426, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. 
It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.', 'doc_id': '2_2', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", - "---\n", - "{'id': 460150998305603795, 'distance': 0.34582412242889404, 'entity': {'title': 'Google Beam: 3D Communication Powered by AI', 'content': 'Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. 
Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.', 'doc_id': '3_1', 'keywords': ['Google Beam', 'Project Starline', '3D video', 'AI communication', 'real-time meetings'], 'tags': ['AI', 'Communication', '3D Technology', 'Remote Work', 'Enterprise Tech']}}\n", - "---\n", - "{'id': 460150998305603793, 'distance': 0.2492937296628952, 'entity': {'title': 'Google Cloud Dataflow: Run Apache Beam in the Cloud', 'content': 'Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.', 'doc_id': '2_1', 'keywords': ['Google Cloud', 'Dataflow', 'Apache Beam', 'serverless', 'stream and batch'], 'tags': ['Cloud Computing', 'Data Pipelines', 'Google Cloud', 'Serverless', 'Enterprise']}}\n", - "---\n" - ] - } - ], + "outputs": [], "source": [ "# Search by content vector similarity.\n", "query_embedding = model.encode(\"What is apache beam\")\n", @@ -1046,7 +1056,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 107, "id": "4911e8cc-10f1-4d21-9251-1b756b61f2c1", "metadata": {}, "outputs": [], @@ -1096,7 +1106,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 108, "id": "dcbed23b-1fc2-4f89-a6d0-e05c15d5e655", "metadata": { "scrolled": true @@ -1105,10 +1115,10 @@ { "data": { "text/plain": [ - "MilvusConnectionParameters(uri='http://localhost:33623', user='', password='', db_id='default', token='', timeout=None, kwargs={})" + "MilvusConnectionParameters(uri='http://localhost:60085', user='', password='', db_id='default', token='', timeout=None, kwargs={})" ] }, - "execution_count": 140, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } @@ -1127,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 109, "id": "74db1238-0a04-4e08-818d-5bce8f09006b", "metadata": {}, "outputs": [], @@ -1137,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 110, "id": "79e16531-8bec-4b4b-9ed3-cebd705480e0", "metadata": {}, "outputs": [], @@ -1150,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 111, "id": "cbef1911-6464-4ba1-8974-ed00896c7e8b", "metadata": {}, "outputs": [], @@ -1160,7 +1170,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 112, "id": "f0481286-3f2b-4690-a2f6-a5a00de3ff34", "metadata": {}, 
"outputs": [], @@ -1173,7 +1183,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 113, "id": "35ee37f2-60cd-4d5d-aef6-aed4fda79161", "metadata": {}, "outputs": [ @@ -1185,17 +1195,9 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 460150998305603791,\n", + " \"id\": 460862423920279605,\n", " \"distance\": 0.453,\n", " \"fields\": {\n", - " \"tags\": [\n", - " \"Data Engineering\",\n", - " \"Open Source\",\n", - " \"Streaming\",\n", - " \"Batch\",\n", - " \"Big Data\"\n", - " ],\n", - " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", @@ -1203,13 +1205,7 @@ " \"batch processing\",\n", " \"data pipelines\",\n", " \"SDK\"\n", - " ]\n", - " }\n", - " },\n", - " {\n", - " \"id\": 460150998305603792,\n", - " \"distance\": 0.4353,\n", - " \"fields\": {\n", + " ],\n", " \"tags\": [\n", " \"Data Engineering\",\n", " \"Open Source\",\n", @@ -1217,7 +1213,13 @@ " \"Batch\",\n", " \"Big Data\"\n", " ],\n", - " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n", + " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. 
Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n", + " }\n", + " },\n", + " {\n", + " \"id\": 460862423920279606,\n", + " \"distance\": 0.4353,\n", + " \"fields\": {\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", @@ -1225,21 +1227,21 @@ " \"batch processing\",\n", " \"data pipelines\",\n", " \"SDK\"\n", - " ]\n", + " ],\n", + " \"tags\": [\n", + " \"Data Engineering\",\n", + " \"Open Source\",\n", + " \"Streaming\",\n", + " \"Batch\",\n", + " \"Big Data\"\n", + " ],\n", + " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. 
This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n", " }\n", " },\n", " {\n", - " \"id\": 460150998305603795,\n", + " \"id\": 460862423920279609,\n", " \"distance\": 0.3927,\n", " \"fields\": {\n", - " \"tags\": [\n", - " \"AI\",\n", - " \"Communication\",\n", - " \"3D Technology\",\n", - " \"Remote Work\",\n", - " \"Enterprise Tech\"\n", - " ],\n", - " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. 
Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n", " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", " \"keywords\": [\n", " \"Google Beam\",\n", @@ -1247,21 +1249,21 @@ " \"3D video\",\n", " \"AI communication\",\n", " \"real-time meetings\"\n", - " ]\n", + " ],\n", + " \"tags\": [\n", + " \"AI\",\n", + " \"Communication\",\n", + " \"3D Technology\",\n", + " \"Remote Work\",\n", + " \"Enterprise Tech\"\n", + " ],\n", + " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\n", " }\n", " },\n", " {\n", - " \"id\": 460150998305603794,\n", + " \"id\": 460862423920279608,\n", " \"distance\": 0.2925,\n", " \"fields\": {\n", - " \"tags\": [\n", - " \"Cloud Computing\",\n", - " \"Data Pipelines\",\n", - " \"Google Cloud\",\n", - " \"Serverless\",\n", - " \"Enterprise\"\n", - " ],\n", - " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n", " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", @@ -1269,13 +1271,7 @@ " \"Apache Beam\",\n", " \"serverless\",\n", " \"stream and batch\"\n", - " ]\n", - " }\n", - " },\n", - " {\n", - " \"id\": 460150998305603793,\n", - " \"distance\": 0.2342,\n", - " \"fields\": {\n", + " ],\n", " \"tags\": [\n", " \"Cloud Computing\",\n", " \"Data Pipelines\",\n", @@ -1283,7 +1279,13 @@ " \"Serverless\",\n", " \"Enterprise\"\n", " ],\n", - " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. 
Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", + " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\n", + " }\n", + " },\n", + " {\n", + " \"id\": 460862423920279607,\n", + " \"distance\": 0.2342,\n", + " \"fields\": {\n", " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", @@ -1291,7 +1293,15 @@ " \"Apache Beam\",\n", " \"serverless\",\n", " \"stream and batch\"\n", - " ]\n", + " ],\n", + " \"tags\": [\n", + " \"Cloud Computing\",\n", + " \"Data Pipelines\",\n", + " \"Google Cloud\",\n", + " \"Serverless\",\n", + " \"Enterprise\"\n", + " ],\n", + " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\n", " }\n", " }\n", " ]\n", @@ -1304,17 +1314,9 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Data Engineering\"\u001b[0m,\n", - " \u001b[32m\"Open Source\"\u001b[0m,\n", - " \u001b[32m\"Streaming\"\u001b[0m,\n", - " \u001b[32m\"Batch\"\u001b[0m,\n", - " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. 
It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", @@ -1322,13 +1324,7 @@ " \u001b[32m\"batch processing\"\u001b[0m,\n", " \u001b[32m\"data pipelines\"\u001b[0m,\n", " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603792\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", - " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Data Engineering\"\u001b[0m,\n", " \u001b[32m\"Open Source\"\u001b[0m,\n", @@ -1336,7 +1332,13 @@ " \u001b[32m\"Batch\"\u001b[0m,\n", " \u001b[32m\"Big Data\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. 
Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279606\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", @@ -1344,21 +1346,21 @@ " \u001b[32m\"batch processing\"\u001b[0m,\n", " \u001b[32m\"data pipelines\"\u001b[0m,\n", " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. 
Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603795\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279609\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"AI\"\u001b[0m,\n", - " \u001b[32m\"Communication\"\u001b[0m,\n", - " \u001b[32m\"3D Technology\"\u001b[0m,\n", - " \u001b[32m\"Remote Work\"\u001b[0m,\n", - " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. 
Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Beam\"\u001b[0m,\n", @@ -1366,21 +1368,21 @@ " \u001b[32m\"3D video\"\u001b[0m,\n", " \u001b[32m\"AI communication\"\u001b[0m,\n", " \u001b[32m\"real-time meetings\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"AI\"\u001b[0m,\n", + " \u001b[32m\"Communication\"\u001b[0m,\n", + " \u001b[32m\"3D Technology\"\u001b[0m,\n", + " \u001b[32m\"Remote Work\"\u001b[0m,\n", + " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. 
Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603794\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279608\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2925\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", - " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", - " \u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Serverless\"\u001b[0m,\n", - " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", @@ -1388,13 +1390,7 @@ " \u001b[32m\"Apache Beam\"\u001b[0m,\n", " \u001b[32m\"serverless\"\u001b[0m,\n", " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - " \u001b[1m}\u001b[0m\n", - " \u001b[1m}\u001b[0m,\n", - " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603793\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2342\u001b[0m,\n", - " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", @@ -1402,7 +1398,13 @@ " \u001b[32m\"Serverless\"\u001b[0m,\n", " \u001b[32m\"Enterprise\"\u001b[0m\n", " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. 
With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279607\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2342\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", @@ -1410,7 +1412,15 @@ " \u001b[32m\"Apache Beam\"\u001b[0m,\n", " \u001b[32m\"serverless\"\u001b[0m,\n", " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. 
With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -1440,7 +1450,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 114, "id": "f159ad87-5153-48bb-87b3-3845d3c76420", "metadata": {}, "outputs": [], @@ -1450,7 +1460,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 115, "id": "8b8cad3e-8a18-464b-8de6-aa4515a653c5", "metadata": {}, "outputs": [], @@ -1463,7 +1473,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 116, "id": "47cfc650-0b34-4333-9321-19be2e8fdc85", "metadata": {}, "outputs": [], @@ -1473,7 +1483,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 117, "id": "4754763b-66bf-4f90-9920-28cef223b536", "metadata": {}, "outputs": [], @@ -1486,7 +1496,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 118, "id": "a3db4837-01c7-42d7-b4e8-58d8d361fe93", "metadata": {}, "outputs": [ @@ -1498,11 +1508,9 @@ " \"query_embedding\": null,\n", " \"results\": [\n", " {\n", - " \"id\": 460150998305603792,\n", + " \"id\": 460862423920279606,\n", " \"distance\": 0.5657,\n", " \"fields\": {\n", - " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. 
The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", " \"stream processing\",\n", @@ -1516,15 +1524,15 @@ " \"Streaming\",\n", " \"Batch\",\n", " \"Big Data\"\n", - " ]\n", + " ],\n", + " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n", " }\n", " },\n", " {\n", - " \"id\": 460150998305603795,\n", + " \"id\": 460862423920279609,\n", " \"distance\": 0.5471,\n", " \"fields\": {\n", - " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\",\n", - " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", " \"keywords\": [\n", " \"Google Beam\",\n", " \"Project Starline\",\n", @@ -1538,15 +1546,15 @@ " \"3D Technology\",\n", " \"Remote Work\",\n", " \"Enterprise Tech\"\n", - " ]\n", + " ],\n", + " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n", + " \"title\": \"Google Beam: 3D Communication Powered by AI\"\n", " }\n", " },\n", " {\n", - " \"id\": 460150998305603791,\n", + " \"id\": 460862423920279605,\n", " \"distance\": 0.53,\n", " \"fields\": {\n", - " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. 
Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", " \"stream processing\",\n", @@ -1560,15 +1568,15 @@ " \"Streaming\",\n", " \"Batch\",\n", " \"Big Data\"\n", - " ]\n", + " ],\n", + " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. 
The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n", " }\n", " },\n", " {\n", - " \"id\": 460150998305603793,\n", + " \"id\": 460862423920279607,\n", " \"distance\": 0.5055,\n", " \"fields\": {\n", - " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. 
With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", - " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", " \"Dataflow\",\n", @@ -1582,15 +1590,15 @@ " \"Google Cloud\",\n", " \"Serverless\",\n", " \"Enterprise\"\n", - " ]\n", + " ],\n", + " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. 
Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n", " }\n", " },\n", " {\n", - " \"id\": 460150998305603794,\n", + " \"id\": 460862423920279608,\n", " \"distance\": 0.134,\n", " \"fields\": {\n", - " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n", - " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", " \"Dataflow\",\n", @@ -1604,7 +1612,9 @@ " \"Google Cloud\",\n", " \"Serverless\",\n", " \"Enterprise\"\n", - " ]\n", + " ],\n", + " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n", " }\n", " }\n", " ]\n", @@ -1617,11 +1627,9 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603792\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279606\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5657\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", " \u001b[32m\"stream processing\"\u001b[0m,\n", @@ -1635,15 +1643,15 @@ " \u001b[32m\"Streaming\"\u001b[0m,\n", " \u001b[32m\"Batch\"\u001b[0m,\n", " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603795\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279609\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5471\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Beam\"\u001b[0m,\n", " \u001b[32m\"Project Starline\"\u001b[0m,\n", @@ -1657,15 +1665,15 @@ " \u001b[32m\"3D Technology\"\u001b[0m,\n", " \u001b[32m\"Remote Work\"\u001b[0m,\n", " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.53\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", " \u001b[32m\"stream processing\"\u001b[0m,\n", @@ -1679,15 +1687,15 @@ " \u001b[32m\"Streaming\"\u001b[0m,\n", " \u001b[32m\"Batch\"\u001b[0m,\n", " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603793\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279607\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5055\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", " \u001b[32m\"Dataflow\"\u001b[0m,\n", @@ -1701,15 +1709,15 @@ " \u001b[32m\"Google Cloud\"\u001b[0m,\n", " \u001b[32m\"Serverless\"\u001b[0m,\n", " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603794\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279608\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.134\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", " \u001b[32m\"Dataflow\"\u001b[0m,\n", @@ -1723,7 +1731,9 @@ " \u001b[32m\"Google Cloud\"\u001b[0m,\n", " \u001b[32m\"Serverless\"\u001b[0m,\n", " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -1770,7 +1780,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 119, "id": "172b6c80-2a03-49d0-afc7-12bb0a4dc989", "metadata": {}, "outputs": [], @@ -1781,7 +1791,7 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 120, "id": "eb6d951c-0def-45cc-84a4-b6f7b7575f23", "metadata": {}, "outputs": [], @@ -1795,7 +1805,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 121, "id": "b339c498-d229-42e6-b439-b29eb107b533", "metadata": {}, "outputs": [], @@ -1808,7 +1818,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 122, "id": "b346abe6-03c9-4b28-a0fb-74936b9f3a06", "metadata": {}, "outputs": [], @@ -1818,7 +1828,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 123, "id": "ab27810d-40a8-4b6a-bc82-441e13763ebc", "metadata": {}, "outputs": [], @@ -1831,7 +1841,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 124, "id": "9a37aa5b-d652-4dd3-9fe0-e277182415b9", "metadata": {}, "outputs": [], @@ -1846,7 +1856,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 125, "id": "ea9d84f7-d142-4afa-9a6f-6c310d9604b0", "metadata": {}, "outputs": [ @@ -1858,17 +1868,9 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 460150998305603791,\n", + " \"id\": 460862423920279605,\n", " \"distance\": 0.5,\n", " \"fields\": {\n", - " \"tags\": [\n", - " \"Data Engineering\",\n", - " \"Open Source\",\n", - " \"Streaming\",\n", - " \"Batch\",\n", - " \"Big Data\"\n", - " ],\n", - " \"content\": \"Apache Beam is an open-source 
framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", " \"keywords\": [\n", " \"Apache Beam\",\n", @@ -1876,21 +1878,21 @@ " \"batch processing\",\n", " \"data pipelines\",\n", " \"SDK\"\n", - " ]\n", + " ],\n", + " \"tags\": [\n", + " \"Data Engineering\",\n", + " \"Open Source\",\n", + " \"Streaming\",\n", + " \"Batch\",\n", + " \"Big Data\"\n", + " ],\n", + " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. 
Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n", " }\n", " },\n", " {\n", - " \"id\": 460150998305603793,\n", + " \"id\": 460862423920279607,\n", " \"distance\": 0.3667,\n", " \"fields\": {\n", - " \"tags\": [\n", - " \"Cloud Computing\",\n", - " \"Data Pipelines\",\n", - " \"Google Cloud\",\n", - " \"Serverless\",\n", - " \"Enterprise\"\n", - " ],\n", - " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. 
Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", " \"keywords\": [\n", " \"Google Cloud\",\n", @@ -1898,7 +1900,15 @@ " \"Apache Beam\",\n", " \"serverless\",\n", " \"stream and batch\"\n", - " ]\n", + " ],\n", + " \"tags\": [\n", + " \"Cloud Computing\",\n", + " \"Data Pipelines\",\n", + " \"Google Cloud\",\n", + " \"Serverless\",\n", + " \"Enterprise\"\n", + " ],\n", + " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. 
With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\n", " }\n", " }\n", " ]\n", @@ -1911,17 +1921,9 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Data Engineering\"\u001b[0m,\n", - " \u001b[32m\"Open Source\"\u001b[0m,\n", - " \u001b[32m\"Streaming\"\u001b[0m,\n", - " \u001b[32m\"Batch\"\u001b[0m,\n", - " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. 
Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Apache Beam\"\u001b[0m,\n", @@ -1929,21 +1931,21 @@ " \u001b[32m\"batch processing\"\u001b[0m,\n", " \u001b[32m\"data pipelines\"\u001b[0m,\n", " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. 
The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603793\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279607\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3667\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", - " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", - " \u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Serverless\"\u001b[0m,\n", - " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. 
Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Cloud\"\u001b[0m,\n", @@ -1951,7 +1953,15 @@ " \u001b[32m\"Apache Beam\"\u001b[0m,\n", " \u001b[32m\"serverless\"\u001b[0m,\n", " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. 
It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -1990,7 +2000,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 126, "id": "6e79ef5c-a121-4e69-9089-0991821f8745", "metadata": {}, "outputs": [], @@ -2000,7 +2010,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 127, "id": "ebbcbbe8-f63d-4ff4-9160-719a0fbe9b06", "metadata": {}, "outputs": [], @@ -2013,7 +2023,7 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 128, "id": "5314c531-14bb-4d81-92a5-fcf9cca7fa81", "metadata": {}, "outputs": [], @@ -2026,7 +2036,7 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 129, "id": "0ecf2ac6-cf90-4ce7-b17f-113af90ab950", "metadata": {}, "outputs": [], @@ -2036,7 +2046,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 130, "id": "0cd92b69-b9dc-445c-9bd7-21bb3ceb0fd3", "metadata": {}, "outputs": [], @@ -2049,7 +2059,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 131, "id": "b06ecf64-c314-4c6a-ae1a-4fdf059aeead", "metadata": {}, "outputs": [ @@ -2061,7 +2071,7 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 460150998305603791,\n", + " \"id\": 460862423920279605,\n", " \"distance\": 0.453,\n", " \"fields\": {\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", @@ -2083,7 +2093,7 @@ " }\n", " },\n", " {\n", - " \"id\": 460150998305603792,\n", + " \"id\": 460862423920279606,\n", " \"distance\": 0.4353,\n", " \"fields\": {\n", " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", @@ -2114,7 +2124,7 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " 
\u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603791\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", @@ -2136,7 +2146,7 @@ " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603792\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279606\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", @@ -2184,7 +2194,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 132, "id": "a8077395-c374-400f-abdc-fe6630eab8a4", "metadata": {}, "outputs": [], @@ -2194,7 +2204,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 133, "id": "3b712779-f283-4e37-88ed-d6b65c6c45d2", "metadata": {}, "outputs": [], @@ -2207,7 +2217,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 134, "id": "7f0924a3-8832-4138-a599-d3aef648b962", "metadata": {}, "outputs": [], @@ -2217,7 +2227,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 135, "id": "516ecbf0-9bb0-4177-829b-b79300b29bbe", "metadata": {}, "outputs": [], @@ -2230,7 +2240,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 136, "id": "db32dda5-0668-4162-80ea-b6a0c2a79063", "metadata": {}, "outputs": [ @@ -2242,11 +2252,9 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 460150998305603795,\n", + " \"id\": 460862423920279609,\n", " \"distance\": 0.3927,\n", " 
\"fields\": {\n", - " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n", - " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", " \"keywords\": [\n", " \"Google Beam\",\n", " \"Project Starline\",\n", @@ -2260,7 +2268,9 @@ " \"3D Technology\",\n", " \"Remote Work\",\n", " \"Enterprise Tech\"\n", - " ]\n", + " ],\n", + " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n", + " \"title\": \"Google Beam: 3D Communication Powered by AI\"\n", " }\n", " }\n", " ]\n", @@ -2273,11 +2283,9 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460150998305603795\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279609\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[32m\"Google Beam\"\u001b[0m,\n", " \u001b[32m\"Project Starline\"\u001b[0m,\n", @@ -2291,7 +2299,9 @@ " \u001b[32m\"3D Technology\"\u001b[0m,\n", " \u001b[32m\"Remote Work\"\u001b[0m,\n", " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. 
Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -2321,7 +2331,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 137, "id": "953e61f4-5188-45a6-b30b-d581f7471d17", "metadata": {}, "outputs": [], From 2c786334744c60baa52bcec7c9635c6d61e4380d Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Tue, 16 Sep 2025 19:27:22 +0000 Subject: [PATCH 13/25] CHANGES.md: update release notes --- CHANGES.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f12f25c15a2e..d8b0095cfb52 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -74,10 +74,10 @@ ## New Features / Improvements * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). -* Python examples added for CloudSQL enrichment handler on [Beam website](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment-cloudsql/) (Python) ([#35473](https://github.com/apache/beam/issues/36095)). 
+* Python examples added for CloudSQL enrichment handler on [Beam website](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment-cloudsql/) (Python) ([#36095](https://github.com/apache/beam/issues/36095)). * Support for batch mode execution in WriteToPubSub transform added (Python) ([#35990](https://github.com/apache/beam/issues/35990)). * Python examples added for Milvus search enrichment handler on [Beam Website](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment-milvus/) - including jupyter notebook example (Python) ([#35467](https://github.com/apache/beam/pull/35467)). + including jupyter notebook example (Python) ([#36176](https://github.com/apache/beam/issues/36176)). ## Breaking Changes From e8898826fe1710e9b0e3c35f7fba60f78ba78685 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Tue, 16 Sep 2025 19:34:41 +0000 Subject: [PATCH 14/25] sdks/python: fix linting issues --- .../examples/snippets/transforms/elementwise/enrichment_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 0e6835e7cb6f..a073a4420f41 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -228,6 +228,7 @@ class CloudSQLEnrichmentTestDataConstruct: class EnrichmentTestHelpers: + @staticmethod @contextmanager def sql_test_context(is_cloudsql: bool, db_adapter: DatabaseTypeAdapter): result: Optional[CloudSQLEnrichmentTestDataConstruct] = None @@ -239,6 +240,7 @@ def sql_test_context(is_cloudsql: bool, db_adapter: DatabaseTypeAdapter): if result: EnrichmentTestHelpers.post_sql_enrichment_test(result) + @staticmethod @contextmanager def milvus_test_context(): db: Optional[MilvusDBContainerInfo] = None From 
116a95c6a12af0c86aa96b1f8f607b50562aaec9 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Wed, 24 Sep 2025 16:37:43 +0000 Subject: [PATCH 15/25] sdks/python: properly skip milvus test on any container startup failures --- .../transforms/elementwise/enrichment_test.py | 37 +++++++++++++------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index a073a4420f41..0ba21d5f08de 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -64,6 +64,11 @@ raise unittest.SkipTest(f'Examples dependencies are not installed: {str(e)}') +class TestContainerStartupError(Exception): + """Raised when any test container fails to start.""" + pass + + def validate_enrichment_with_bigtable(): expected = '''[START enrichment_with_bigtable] Row(sale_id=1, customer_id=1, product_id=1, quantity=1, product={'product_id': '1', 'product_name': 'pixel 5', 'product_stock': '2'}) @@ -206,17 +211,19 @@ def test_enrichment_with_external_sqlserver(self, mock_stdout): self.fail(f"Test failed with unexpected error: {e}") def test_enrichment_with_milvus(self, mock_stdout): - with EnrichmentTestHelpers.milvus_test_context(): - try: - enrichment_with_milvus() - output = mock_stdout.getvalue().splitlines() - expected = validate_enrichment_with_milvus() - self.maxDiff = None - output = parse_chunk_strings(output) - expected = parse_chunk_strings(expected) - assert_chunks_equivalent(output, expected) - except Exception as e: - self.fail(f"Test failed with unexpected error: {e}") + try: + with EnrichmentTestHelpers.milvus_test_context(): + enrichment_with_milvus() + output = mock_stdout.getvalue().splitlines() + expected = validate_enrichment_with_milvus() + self.maxDiff = None + output = 
parse_chunk_strings(output) + expected = parse_chunk_strings(expected) + assert_chunks_equivalent(output, expected) + except TestContainerStartupError as e: + raise unittest.SkipTest(str(e)) + except Exception as e: + self.fail(f"Test failed with unexpected error: {e}") @dataclass @@ -236,6 +243,8 @@ def sql_test_context(is_cloudsql: bool, db_adapter: DatabaseTypeAdapter): result = EnrichmentTestHelpers.pre_sql_enrichment_test( is_cloudsql, db_adapter) yield + except Exception as e: + raise unittest.SkipTest(f"Milvus container setup failed: {str(e)}") finally: if result: EnrichmentTestHelpers.post_sql_enrichment_test(result) @@ -352,7 +361,11 @@ def post_sql_enrichment_test(res: CloudSQLEnrichmentTestDataConstruct): @staticmethod def pre_milvus_enrichment() -> MilvusDBContainerInfo: - db = MilvusEnrichmentTestHelper.start_db_container() + try: + db = MilvusEnrichmentTestHelper.start_db_container() + except Exception as e: + raise TestContainerStartupError( + f"Milvus container failed to start: {str(e)}") connection_params = MilvusConnectionParameters( uri=db.uri, From 3a3b03aa6a9a267aee2be404a50f11844b483ef6 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Wed, 24 Sep 2025 17:02:18 +0000 Subject: [PATCH 16/25] sdks/python: properly skip sql tests on any container startup failure --- .../transforms/elementwise/enrichment_test.py | 89 ++++++++++--------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 0ba21d5f08de..8c8ec28953ec 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -167,48 +167,54 @@ def test_enrichment_with_vertex_ai_legacy(self, mock_stdout): os.environ.get('ALLOYDB_PASSWORD'), "ALLOYDB_PASSWORD environment var is not 
provided") def test_enrichment_with_google_cloudsql_pg(self, mock_stdout): - db_adapter = DatabaseTypeAdapter.POSTGRESQL - with EnrichmentTestHelpers.sql_test_context(True, db_adapter): - try: + try: + db_adapter = DatabaseTypeAdapter.POSTGRESQL + with EnrichmentTestHelpers.sql_test_context(True, db_adapter): enrichment_with_google_cloudsql_pg() output = mock_stdout.getvalue().splitlines() expected = validate_enrichment_with_google_cloudsql_pg() self.assertEqual(output, expected) - except Exception as e: - self.fail(f"Test failed with unexpected error: {e}") + except Exception as e: + self.fail(f"Test failed with unexpected error: {e}") def test_enrichment_with_external_pg(self, mock_stdout): - db_adapter = DatabaseTypeAdapter.POSTGRESQL - with EnrichmentTestHelpers.sql_test_context(False, db_adapter): - try: + try: + db_adapter = DatabaseTypeAdapter.POSTGRESQL + with EnrichmentTestHelpers.sql_test_context(False, db_adapter): enrichment_with_external_pg() output = mock_stdout.getvalue().splitlines() expected = validate_enrichment_with_external_pg() self.assertEqual(output, expected) - except Exception as e: - self.fail(f"Test failed with unexpected error: {e}") + except TestContainerStartupError as e: + raise unittest.SkipTest(str(e)) + except Exception as e: + self.fail(f"Test failed with unexpected error: {e}") def test_enrichment_with_external_mysql(self, mock_stdout): - db_adapter = DatabaseTypeAdapter.MYSQL - with EnrichmentTestHelpers.sql_test_context(False, db_adapter): - try: - enrichment_with_external_mysql() - output = mock_stdout.getvalue().splitlines() - expected = validate_enrichment_with_external_mysql() - self.assertEqual(output, expected) - except Exception as e: - self.fail(f"Test failed with unexpected error: {e}") + try: + db_adapter = DatabaseTypeAdapter.MYSQL + with EnrichmentTestHelpers.sql_test_context(False, db_adapter): + enrichment_with_external_mysql() + output = mock_stdout.getvalue().splitlines() + expected = 
validate_enrichment_with_external_mysql() + self.assertEqual(output, expected) + except TestContainerStartupError as e: + raise unittest.SkipTest(str(e)) + except Exception as e: + self.fail(f"Test failed with unexpected error: {e}") def test_enrichment_with_external_sqlserver(self, mock_stdout): - db_adapter = DatabaseTypeAdapter.SQLSERVER - with EnrichmentTestHelpers.sql_test_context(False, db_adapter): - try: + try: + db_adapter = DatabaseTypeAdapter.SQLSERVER + with EnrichmentTestHelpers.sql_test_context(False, db_adapter): enrichment_with_external_sqlserver() output = mock_stdout.getvalue().splitlines() expected = validate_enrichment_with_external_sqlserver() self.assertEqual(output, expected) - except Exception as e: - self.fail(f"Test failed with unexpected error: {e}") + except TestContainerStartupError as e: + raise unittest.SkipTest(str(e)) + except Exception as e: + self.fail(f"Test failed with unexpected error: {e}") def test_enrichment_with_milvus(self, mock_stdout): try: @@ -243,8 +249,6 @@ def sql_test_context(is_cloudsql: bool, db_adapter: DatabaseTypeAdapter): result = EnrichmentTestHelpers.pre_sql_enrichment_test( is_cloudsql, db_adapter) yield - except Exception as e: - raise unittest.SkipTest(f"Milvus container setup failed: {str(e)}") finally: if result: EnrichmentTestHelpers.post_sql_enrichment_test(result) @@ -307,20 +311,25 @@ def pre_sql_enrichment_test( password=password, db_id=db_id) else: - db = SQLEnrichmentTestHelper.start_sql_db_container(db_adapter) - os.environ['EXTERNAL_SQL_DB_HOST'] = db.host - os.environ['EXTERNAL_SQL_DB_PORT'] = str(db.port) - os.environ['EXTERNAL_SQL_DB_ID'] = db.id - os.environ['EXTERNAL_SQL_DB_USER'] = db.user - os.environ['EXTERNAL_SQL_DB_PASSWORD'] = db.password - os.environ['EXTERNAL_SQL_DB_TABLE_ID'] = table_id - connection_config = ExternalSQLDBConnectionConfig( - db_adapter=db_adapter, - host=db.host, - port=db.port, - user=db.user, - password=db.password, - db_id=db.id) + try: + db = 
SQLEnrichmentTestHelper.start_sql_db_container(db_adapter) + os.environ['EXTERNAL_SQL_DB_HOST'] = db.host + os.environ['EXTERNAL_SQL_DB_PORT'] = str(db.port) + os.environ['EXTERNAL_SQL_DB_ID'] = db.id + os.environ['EXTERNAL_SQL_DB_USER'] = db.user + os.environ['EXTERNAL_SQL_DB_PASSWORD'] = db.password + os.environ['EXTERNAL_SQL_DB_TABLE_ID'] = table_id + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=db_adapter, + host=db.host, + port=db.port, + user=db.user, + password=db.password, + db_id=db.id) + except Exception as e: + db_name = db_adapter.value.lower() + raise TestContainerStartupError( + f"{db_name} container failed to start: {str(e)}") conenctor = connection_config.get_connector_handler() engine = create_engine( From 8bab6248a16f37f327325c0b1c40492daf2ee5ad Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Wed, 24 Sep 2025 17:35:18 +0000 Subject: [PATCH 17/25] sdks/python: fix linting issues --- .../transforms/elementwise/enrichment_test.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py index 8c8ec28953ec..76e5de5e6220 100644 --- a/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py +++ b/sdks/python/apache_beam/examples/snippets/transforms/elementwise/enrichment_test.py @@ -194,10 +194,10 @@ def test_enrichment_with_external_mysql(self, mock_stdout): try: db_adapter = DatabaseTypeAdapter.MYSQL with EnrichmentTestHelpers.sql_test_context(False, db_adapter): - enrichment_with_external_mysql() - output = mock_stdout.getvalue().splitlines() - expected = validate_enrichment_with_external_mysql() - self.assertEqual(output, expected) + enrichment_with_external_mysql() + output = mock_stdout.getvalue().splitlines() + expected = validate_enrichment_with_external_mysql() + self.assertEqual(output, expected) 
except TestContainerStartupError as e: raise unittest.SkipTest(str(e)) except Exception as e: @@ -219,13 +219,13 @@ def test_enrichment_with_external_sqlserver(self, mock_stdout): def test_enrichment_with_milvus(self, mock_stdout): try: with EnrichmentTestHelpers.milvus_test_context(): - enrichment_with_milvus() - output = mock_stdout.getvalue().splitlines() - expected = validate_enrichment_with_milvus() - self.maxDiff = None - output = parse_chunk_strings(output) - expected = parse_chunk_strings(expected) - assert_chunks_equivalent(output, expected) + enrichment_with_milvus() + output = mock_stdout.getvalue().splitlines() + expected = validate_enrichment_with_milvus() + self.maxDiff = None + output = parse_chunk_strings(output) + expected = parse_chunk_strings(expected) + assert_chunks_equivalent(output, expected) except TestContainerStartupError as e: raise unittest.SkipTest(str(e)) except Exception as e: @@ -329,7 +329,7 @@ def pre_sql_enrichment_test( except Exception as e: db_name = db_adapter.value.lower() raise TestContainerStartupError( - f"{db_name} container failed to start: {str(e)}") + f"{db_name} container failed to start: {str(e)}") conenctor = connection_config.get_connector_handler() engine = create_engine( @@ -374,7 +374,7 @@ def pre_milvus_enrichment() -> MilvusDBContainerInfo: db = MilvusEnrichmentTestHelper.start_db_container() except Exception as e: raise TestContainerStartupError( - f"Milvus container failed to start: {str(e)}") + f"Milvus container failed to start: {str(e)}") connection_params = MilvusConnectionParameters( uri=db.uri, From f8af0379481e8480b1f8165ceebd50c90b71cf06 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sat, 18 Oct 2025 18:41:35 +0000 Subject: [PATCH 18/25] examples: address comments on milvus jupyter notebook --- .../beam-ml/milvus_enrichment_transform.ipynb | 2276 +++++++++-------- 1 file changed, 1245 insertions(+), 1031 deletions(-) diff --git a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb 
b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb index 64085437797f..3b632892d62b 100644 --- a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb +++ b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 70, + "execution_count": 928, "id": "47053bac", "metadata": {}, "outputs": [], @@ -49,7 +49,7 @@ "id": "0611da21-d031-4b16-8301-9b76bda731e7", "metadata": {}, "source": [ - "This notebook shows how to enrich data by using the Apache Beam [enrichment transform](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment/) with [Milvus](https://milvus.io/). The enrichment transform is an Apache Beam turnkey transform that lets you enrich data by using a key-value lookup. This transform has the following features:\n", + "This notebook shows how to enrich data by using the Apache Beam [enrichment transform](https://beam.apache.org/documentation/transforms/python/elementwise/enrichment-milvus) with [Milvus](https://milvus.io/). The enrichment transform is an Apache Beam turnkey transform that lets you enrich data by using a key-value lookup. This transform has the following features:\n", "\n", "- The transform has a built-in Apache Beam handler that interacts with Milvus data during enrichment.\n", "- The enrichment transform uses client-side throttling to rate limit the requests. The default retry strategy uses exponential backoff. 
You can configure rate limiting to suit your use case.\n", @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 929, "id": "e550cd55-e91e-4d43-b1bd-b0e89bb8cbd9", "metadata": {}, "outputs": [], @@ -80,20 +80,38 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 930, "id": "31747c45-107a-49be-8885-5a6cc9dc1236", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "94580.11s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1760812661.675888 2629967 backup_poller.cc:126] Run client channel backup poller: UNKNOWN:pollset_work {created_time:\"2025-10-18T18:37:41.674930542+00:00\", children:[UNKNOWN:epoll_wait: Bad file descriptor (9) {created_time:\"2025-10-18T18:37:41.674662134+00:00\"}]}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "94591.45s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: 
\u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1760812676.675094 2630020 backup_poller.cc:126] Run client channel backup poller: UNKNOWN:pollset_work {created_time:\"2025-10-18T18:37:56.674462799+00:00\", children:[UNKNOWN:epoll_wait: Bad file descriptor (9) {created_time:\"2025-10-18T18:37:56.674318855+00:00\"}]}\n" ] } ], @@ -101,12 +119,12 @@ "# The Apache Beam test dependencies are included here for the TestContainers\n", "# Milvus standalone DB container that will be used later in the demo.\n", "!pip install rich sentence_transformers llama_index --quiet\n", - "!pip install apache_beam[milvus,gcp,test,interactive]>=2.67.0 --quiet" + "!pip install apache_beam[milvus,gcp,test,interactive] --quiet" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 931, "id": "666e0c2b-0341-4b0e-8d73-561abc39bb10", "metadata": {}, "outputs": [], @@ -170,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 932, "id": "38781cf5-e18f-40f5-827e-2d441ae7d2fa", "metadata": {}, "outputs": [], @@ -263,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 933, "id": "489e93b6-de41-4ec3-be33-a15c3cba12e8", "metadata": {}, "outputs": [ @@ -340,7 +358,7 @@ "max 312.000000" ] }, - "execution_count": 75, + "execution_count": 933, "metadata": {}, "output_type": "execute_result" } @@ -355,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 934, "id": "eb32aad0-febd-45af-b4bd-e2176b07e2dc", "metadata": {}, "outputs": [ @@ -374,239 +392,6 @@ "print(f'The mean word count for each video is about {mean_word_count} words, which corresponds to a rough token count of {approx_token_count} tokens.')" ] }, - { - "cell_type": "markdown", - "id": "42c1c159-875d-411b-a009-4361301b39f6", - "metadata": {}, - "source": [ - "## Preprocess Data" - ] - }, - { - "cell_type": 
"markdown", - "id": "d545355e-41da-4c53-ba9a-4d33b1fe376c", - "metadata": {}, - "source": [ - "### Chunking" - ] - }, - { - "cell_type": "markdown", - "id": "a034c5d0-0906-4193-80ac-736a32d7b47e", - "metadata": {}, - "source": [ - "We'll use sentence splitting as the chunking strategy for simplicity.
\n", - "Ideally, we would pass a tokenizer here — preferably the same one used by the retriever — to ensure consistency.
\n", - "However, in this example, we are not using a tokenizer." - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "id": "e7e45d70-0c23-409d-b435-b9479245c1ff", - "metadata": {}, - "outputs": [], - "source": [ - "# The `chunk_size` parameter is constrained by the embedding model we’re using.\n", - "# Since we’re using `sentence-transformers/all-MiniLM-L6-v2`, which has a maximum token limit of ~384 tokens,\n", - "# we need to ensure chunk sizes stay well within that limit.\n", - "# Given that each document in our dataset contains approximately 331 tokens,\n", - "# using a chunk size of 256 allows us to preserve nearly the most semantic meaning of each entry\n", - "# while staying safely under the model’s token limit.\n", - "chunk_size = 256\n", - "llama_txt_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=20)" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "id": "5a013b08-d7e7-4367-ad49-43ad1320158f", - "metadata": {}, - "outputs": [], - "source": [ - "def split_contents(corpus: list[dict], text_splitter: SentenceSplitter, content_field: str='content') -> list[list[str]]:\n", - " result = []\n", - " for video in corpus:\n", - " split = llama_txt_splitter.split_text(video[content_field])\n", - " result.append(split)\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "id": "2d5ea747-40b3-474e-ac36-ccb81256a36c", - "metadata": {}, - "outputs": [], - "source": [ - "content_splits = split_contents(corpus, llama_txt_splitter, \"content\")" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "id": "9917cefb-6271-4285-a75d-a6d1bfcbfd06", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[\n",
-       "  [\n",
-       "    \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n",
-       "    \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n",
-       "  ],\n",
-       "  [\n",
-       "    \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n",
-       "    \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\n",
-       "  ],\n",
-       "  [\n",
-       "    \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\n",
-       "  ]\n",
-       "]\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m[\u001b[0m\n", - " \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", - " \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. 
Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", - " \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. 
Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m\n", - " \u001b[1m]\u001b[0m\n", - "\u001b[1m]\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "print_json(data=content_splits)" - ] - }, - { - "cell_type": "markdown", - "id": "c860e558-2da3-45a6-9e54-acb8b4ffab22", - "metadata": {}, - "source": [ - "### Embedding Generation" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "aa55928d-c6ca-47c5-883d-d14eb0aa1298", - "metadata": {}, - "outputs": [], - "source": [ - "# Let's choose `sentence-transformers/all-MiniLM-L6-v2` as our embedding generator here.\n", - "# It gives a good balance between embedding generation speed, accuracy, and being free to use.\n", - "model_name = 'sentence-transformers/all-MiniLM-L6-v2'\n", - "model = SentenceTransformer(model_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "id": "26e80afa-b9dc-4778-8301-ce38264d58cd", - "metadata": {}, - "outputs": [], - "source": [ - "def get_default_device():\n", - " return \"cuda:0\" if cuda.is_available() else \"cpu\"" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "id": "68e04606-ca81-4a1f-81d2-964495295ed3", - "metadata": {}, - "outputs": [], - "source": [ - "def encode_embedding(chunk, device=get_default_device()):\n", - " return list(map(float, model.encode(chunk, device=device)))" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "id": "43c55049-fbd9-4a1c-ae74-c12b5f5a03ee", - "metadata": {}, - "outputs": [], - "source": [ - "def encode_content_splits(content_splits: list[list[str]],\n", - " model: SentenceTransformer,\n", - " device: str = get_default_device()\n", - " ) -> list[list[tuple[str,list]]]:\n", - " result = []\n", - " for split in content_splits:\n", - " sub_result = []\n", 
- " for chunk in split:\n", - " encoded = encode_embedding(chunk, device)\n", - " sub_result.append((chunk, encoded))\n", - " result.append(sub_result)\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "id": "3ec7c739-6adc-4591-b5b2-9e60d7783c3c", - "metadata": {}, - "outputs": [], - "source": [ - "text_vector_tuples = encode_content_splits(content_splits, model)" - ] - }, - { - "cell_type": "markdown", - "id": "3afe67f9-d3cb-499b-b84b-ad8b14f40362", - "metadata": {}, - "source": [ - "### Joining Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "id": "541794c7-f9a6-4d42-a522-8f4a3d1b1dfa", - "metadata": {}, - "outputs": [], - "source": [ - "def join_metadata(corpus: list[dict], \n", - " text_vector_list: list[list[tuple[str, list]]],\n", - " unique_id_field: str='id',\n", - " content_field: str='content',\n", - " embedding_field: str='content_embedding'\n", - " ) -> list[dict]:\n", - " result = []\n", - " for indx, embeddings in enumerate(text_vector_list):\n", - " for j, (chunk_text, embedding) in enumerate(embeddings):\n", - " doc = {**corpus[indx]}\n", - " doc[content_field] = chunk_text\n", - " doc[embedding_field] = embedding\n", - " doc[\"doc_id\"] = f\"{doc[unique_id_field]}_{j+1}\"\n", - " del doc[unique_id_field]\n", - " result.append(doc)\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "id": "6f2ebedc-7d72-4deb-838c-42b8f103ceb4", - "metadata": {}, - "outputs": [], - "source": [ - "docs = join_metadata(corpus, text_vector_tuples)" - ] - }, { "cell_type": "markdown", "id": "765115e1-4327-44f6-9dff-5d79121eeb02", @@ -633,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 935, "id": "5ae9bc82-9ad7-46dd-b254-19cbdcdd0e07", "metadata": {}, "outputs": [], @@ -643,47 +428,20 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 936, "id": "aff7b261-3330-4fa9-9a54-3fd87b42521f", "metadata": {}, - "outputs": [ - { 
- "name": "stderr", - "output_type": "stream", - "text": [ - "Pulling image milvusdb/milvus:v2.5.10\n", - "INFO:testcontainers.core.container:Pulling image milvusdb/milvus:v2.5.10\n", - "Container started: aefe936ace2d\n", - "INFO:testcontainers.core.container:Container started: aefe936ace2d\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - "Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n", - 
"INFO:testcontainers.core.waiting_utils:Waiting for container with image milvusdb/milvus:v2.5.10 to be ready ...\n" - ] - } - ], + "outputs": [], "source": [ - "if not db:\n", - " db = MilvusEnrichmentTestHelper.start_db_container()" + "if db:\n", + " # Stop existing Milvus DB container to prevent duplicates.\n", + " MilvusEnrichmentTestHelper.stop_db_container(db)\n", + "db = MilvusEnrichmentTestHelper.start_db_container()" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 937, "id": "31496ee0-75a2-48ad-954e-9c4ae5abbf5e", "metadata": {}, "outputs": [], @@ -693,7 +451,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 938, "id": "82627714-2425-4058-9b47-d262f015caf7", "metadata": {}, "outputs": [], @@ -703,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 939, "id": "e8a85f51-5d5f-4533-bf0f-ec825e613dc2", "metadata": {}, "outputs": [ @@ -713,7 +471,7 @@ "'2.5.10'" ] }, - "execution_count": 92, + "execution_count": 939, "metadata": {}, "output_type": "execute_result" } @@ -727,7 +485,7 @@ "id": "2344abb9-c170-4496-993e-736e2b50c2bb", "metadata": {}, "source": [ - "### Define Schema" + "### Define Vector Schema and Indices" ] }, { @@ -740,21 +498,18 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 940, "id": "c014af94-1bb7-44e4-842c-1039f4a2a11d", "metadata": {}, "outputs": [], "source": [ "fields = [\n", - " FieldSchema(name=\"id\", dtype=DataType.INT64, is_primary=True, auto_id=True),\n", - " FieldSchema(name=\"vector\", dtype=DataType.FLOAT_VECTOR, dim=model.get_sentence_embedding_dimension()),\n", - " FieldSchema(name=\"sparse_vector\", dtype=DataType.SPARSE_FLOAT_VECTOR),\n", - " FieldSchema(name=\"title\", dtype=DataType.VARCHAR, max_length=256),\n", + " FieldSchema(name=\"id\", dtype=DataType.VARCHAR, is_primary=True, max_length=100),\n", " FieldSchema(name=\"content\", dtype=DataType.VARCHAR, max_length=65279),\n", - " 
FieldSchema(name=\"combined_text\", dtype=DataType.VARCHAR, max_length=65279+256, enable_analyzer=True),\n", - " FieldSchema(name=\"doc_id\", dtype=DataType.VARCHAR, max_length=100),\n", - " FieldSchema(name=\"keywords\", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=100, max_capacity=64),\n", - " FieldSchema(name=\"tags\", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_length=100, max_capacity=32),\n", + " FieldSchema(name=\"embedding\", dtype=DataType.FLOAT_VECTOR, dim=embedding_model_config[\"token_limit\"]),\n", + " FieldSchema(name=\"sparse_embedding\", dtype=DataType.SPARSE_FLOAT_VECTOR),\n", + " FieldSchema(name=\"metadata\", dtype=DataType.JSON),\n", + " FieldSchema(name=\"title_and_content\", dtype=DataType.VARCHAR, max_length=65279+256, enable_analyzer=True),\n", "]" ] }, @@ -763,20 +518,20 @@ "id": "76535a60-87f5-48e0-9c73-38aa2c6b4d0e", "metadata": {}, "source": [ - "### Define Functions for Processing" + "#### Define Functions for Processing" ] }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 941, "id": "54fb3428-b007-4804-9d79-b3933d3256c5", "metadata": {}, "outputs": [], "source": [ "bm25_function = Function(\n", " name=\"content_bm25_emb\",\n", - " input_field_names=[\"combined_text\"],\n", - " output_field_names=[\"sparse_vector\"],\n", + " input_field_names=[\"title_and_content\"],\n", + " output_field_names=[\"sparse_embedding\"],\n", " function_type=FunctionType.BM25)\n", "\n", "functions = [bm25_function]" @@ -784,17 +539,17 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 942, "id": "4c2f123a-5949-4974-af48-a5db5b168c11", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'auto_id': True, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': , 'is_primary': True, 'auto_id': True}, {'name': 'vector', 'description': '', 'type': , 'params': {'dim': 384}}, {'name': 'sparse_vector', 'description': '', 'type': , 'is_function_output': True}, 
{'name': 'title', 'description': '', 'type': , 'params': {'max_length': 256}}, {'name': 'content', 'description': '', 'type': , 'params': {'max_length': 65279}}, {'name': 'combined_text', 'description': '', 'type': , 'params': {'max_length': 65535, 'enable_analyzer': True}}, {'name': 'doc_id', 'description': '', 'type': , 'params': {'max_length': 100}}, {'name': 'keywords', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 64}, 'element_type': }, {'name': 'tags', 'description': '', 'type': , 'params': {'max_length': 100, 'max_capacity': 32}, 'element_type': }], 'enable_dynamic_field': False, 'functions': [{'name': 'content_bm25_emb', 'description': '', 'type': , 'input_field_names': ['combined_text'], 'output_field_names': ['sparse_vector'], 'params': {}}]}" + "{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': , 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': False}, {'name': 'content', 'description': '', 'type': , 'params': {'max_length': 65279}}, {'name': 'embedding', 'description': '', 'type': , 'params': {'dim': 384}}, {'name': 'sparse_embedding', 'description': '', 'type': , 'is_function_output': True}, {'name': 'metadata', 'description': '', 'type': }, {'name': 'title_and_content', 'description': '', 'type': , 'params': {'max_length': 65535, 'enable_analyzer': True}}], 'enable_dynamic_field': False, 'functions': [{'name': 'content_bm25_emb', 'description': '', 'type': , 'input_field_names': ['title_and_content'], 'output_field_names': ['sparse_embedding'], 'params': {}}]}" ] }, - "execution_count": 95, + "execution_count": 942, "metadata": {}, "output_type": "execute_result" } @@ -809,12 +564,12 @@ "id": "04f15d4b-1192-464b-9635-cb4cbc530431", "metadata": {}, "source": [ - "### Define Indices" + "#### Define Indices" ] }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 943, "id": "671f4352-2086-4428-83be-0de48926682d", "metadata": {}, "outputs": [], @@ -832,14 
+587,14 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 944, "id": "aa8baae5-7c38-4e78-ace4-304c7dc6b127", "metadata": {}, "outputs": [], "source": [ "index_params.add_index(\n", - " field_name=\"vector\",\n", - " index_name=\"dense_vector_ivf_flat\",\n", + " field_name=\"embedding\",\n", + " index_name=\"dense_embedding_ivf_flat\",\n", " index_type=\"IVF_FLAT\",\n", " metric_type=VectorSearchMetrics.COSINE.value,\n", " params={\"nlist\": 1024})" @@ -855,13 +610,13 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 945, "id": "d970a35b-f9b2-4f8f-93ef-8de5c83c31b5", "metadata": {}, "outputs": [], "source": [ "index_params.add_index(\n", - " field_name=\"sparse_vector\",\n", + " field_name=\"sparse_embedding\",\n", " index_name=\"sparse_inverted_index\",\n", " index_type=\"SPARSE_INVERTED_INDEX\",\n", " metric_type=KeywordSearchMetrics.BM25.value,\n", @@ -870,18 +625,18 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 946, "id": "0d45a6ad-2009-4e30-b38d-73266da98a06", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[{'field_name': 'vector', 'index_type': 'IVF_FLAT', 'index_name': 'dense_vector_ivf_flat', 'nlist': 1024, 'metric_type': 'COSINE'},\n", - " {'field_name': 'sparse_vector', 'index_type': 'SPARSE_INVERTED_INDEX', 'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]" + "[{'field_name': 'embedding', 'index_type': 'IVF_FLAT', 'index_name': 'dense_embedding_ivf_flat', 'nlist': 1024, 'metric_type': 'COSINE'},\n", + " {'field_name': 'sparse_embedding', 'index_type': 'SPARSE_INVERTED_INDEX', 'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]" ] }, - "execution_count": 99, + "execution_count": 946, "metadata": {}, "output_type": "execute_result" } @@ -895,12 +650,12 @@ "id": "22a260da-8869-40bb-9cbf-28a73e8cca24", 
"metadata": {}, "source": [ - "### Create Collection" + "#### Create Collection" ] }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 947, "id": "51dd4423-240c-4271-bb8c-6270f399a25c", "metadata": {}, "outputs": [], @@ -910,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 948, "id": "9620b1f2-51fa-491c-ad3f-f0676b9b25f6", "metadata": {}, "outputs": [], @@ -920,7 +675,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 949, "id": "e6cf3a1d-265c-44db-aba8-d491fab290d5", "metadata": {}, "outputs": [], @@ -930,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 950, "id": "94497411-43d3-4300-98b3-1cb33759738e", "metadata": {}, "outputs": [ @@ -940,7 +695,7 @@ "True" ] }, - "execution_count": 103, + "execution_count": 950, "metadata": {}, "output_type": "execute_result" } @@ -951,91 +706,320 @@ }, { "cell_type": "markdown", - "id": "b10fc2bb-b17c-4d8b-85de-7a0bc10f6779", + "id": "42c1c159-875d-411b-a009-4361301b39f6", "metadata": {}, "source": [ - "### Index Data" + "## Building the Vector Index: Chunking, Embedding, and Storage" ] }, { - "cell_type": "markdown", - "id": "38b10fcf-7b07-4bf5-a3b0-581ccdd09fe3", + "cell_type": "code", + "execution_count": 951, + "id": "f4632377-3d8c-4d60-891f-52a2c3905364", "metadata": {}, + "outputs": [], "source": [ - "#### Index" + "from typing import Dict, Any\n", + "from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider\n", + "from apache_beam.ml.transforms.base import MLTransform\n", + "from apache_beam.ml.transforms.embeddings import huggingface\n", + "from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings\n", + "from dataclasses import asdict\n", + "import tempfile\n", + "import uuid\n", + "import shutil" ] }, { "cell_type": "code", - "execution_count": 104, - "id": "20fd6f92-277f-42a3-b0a1-d9e9cb030caa", + "execution_count": 952, + "id": 
"b9da8f1b-3657-4869-bdb7-5e4e607bfd7d", "metadata": {}, "outputs": [], "source": [ - "data_ready_to_index = []\n", - "for doc in docs:\n", - " item = {}\n", - " item[\"vector\"] = doc[\"content_embedding\"]\n", - " item[\"content\"] = doc[\"content\"]\n", - " item[\"doc_id\"] = doc[\"doc_id\"]\n", - " item[\"title\"] = doc[\"title\"]\n", - " item[\"keywords\"] = doc[\"keywords\"]\n", - " item[\"tags\"] = doc[\"tags\"]\n", - " item[\"combined_text\"] = f\"{doc['title']}. {doc['content']}\"\n", - " data_ready_to_index.append(item)" + "# Choosing `sentence-transformers/all-MiniLM-L6-v2` as our embedding generator here. It gives\n", + "# a good balance between embedding generation speed, accuracy, and being free to use.\n", + "embedding_model_config = {\n", + " \"name\": 'sentence-transformers/all-MiniLM-L6-v2',\n", + " \"token_limit\": 384\n", + "}" ] }, { "cell_type": "code", - "execution_count": 105, - "id": "178e59dd-d9aa-4948-a02b-f57ee919f0ff", - "metadata": {}, + "execution_count": 953, + "id": "25c5c202-abe0-4d11-82df-e731f0d6201e", + "metadata": { + "scrolled": true + }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. 
Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n", + "WARNING:root:This output type hint will be ignored and not used for type-checking purposes. Typically, output type hints for a PTransform are single (or nested) types wrapped by a PCollection, PDone, or None. Got: Union[Tuple[apache_beam.pvalue.PCollection[~MLTransformOutputT], apache_beam.pvalue.PCollection[apache_beam.pvalue.Row]], apache_beam.pvalue.PCollection[~MLTransformOutputT]] instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upserted batch of 5 documents. Result: {'upsert_count': 5, 'primary_keys': ['1_0', '1_1', '2_0', '2_1', '3_0']}\n" + ] + }, { "data": { + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... show\n", + "
\n", + " " + ], "text/plain": [ - "{'insert_count': 5, 'ids': [460862423920279605, 460862423920279606, 460862423920279607, 460862423920279608, 460862423920279609], 'cost': 0}" + "" ] }, - "execution_count": 105, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upserted batch of 5 documents. Result: {'upsert_count': 5, 'primary_keys': ['1_0', '1_1', '2_0', '2_1', '3_0']}\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_bbaa3233f446175a45515c8912457390\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_bbaa3233f446175a45515c8912457390\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "client.insert(collection_name=collection_name, data=data_ready_to_index)" - ] - }, - { - "cell_type": "markdown", - "id": "fa5c502d-2a37-4050-a846-73bebb1bf6c0", - "metadata": {}, - "source": [ - "#### Check the Indexed Data" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "id": "b01b111e-41f2-4d9f-b7f5-4fc42305fbe0", - "metadata": {}, - "outputs": [], - "source": [ - "# Search by content vector similarity.\n", - "query_embedding = model.encode(\"What is apache beam\")\n", + "class DocumentSplitterDoFn(beam.DoFn):\n", + " def setup(self):\n", + " # The `chunk_size` parameter is constrained by the embedding model we’re using.\n", + " # 
Since we’re using `sentence-transformers/all-MiniLM-L6-v2`, which has a maximum\n", + " # token limit of ~384 tokens, we need to ensure chunk sizes stay well within that limit.\n", + " # Given that each document in our dataset contains approximately 331 tokens, using a chunk\n", + " # size of 256 allows us to preserve nearly the most semantic meaning of each entry while\n", + " # staying safely under the model’s token limit.\n", + " #\n", + " # For simplicity, We'll use sentence splitting as the chunking strategy for simplicity. Ideally,\n", + " # we would pass a tokenizer here — preferably the same one used by the retriever to ensure\n", + " # consistency. However, in this example, we are not using a tokenizer.\n", + " from llama_index.core.text_splitter import SentenceSplitter\n", + " chunk_size, chunk_overlap = 256, 20\n", + " self.llama_txt_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", "\n", - "search_results = client.search(\n", - " collection_name=collection_name,\n", - " data=[query_embedding],\n", - " anns_field=\"vector\",\n", - " limit=5,\n", - " output_fields=[\"title\", \"content\", \"doc_id\", \"keywords\", \"tags\"]\n", - ")\n", + " def process(self, element: Dict[str, Any]) -> List[Chunk]:\n", + " id_field, content_field = 'id', 'content'\n", + " metadata_fields = [\"title\", \"keywords\", \"tags\"]\n", + " global_doc_id = element.get('id', str(uuid.uuid4()))\n", + " text_content = element.get('content', '')\n", + " splits = self.llama_txt_splitter.split_text(text_content)\n", + " for i, split in enumerate(splits):\n", + " local_doc_id = f\"{global_doc_id}_{i}\"\n", + " yield Chunk(id=local_doc_id, content=Content(split), metadata={f:element[f] for f in metadata_fields})\n", + "\n", + "class ChunkingTransformProvider(ChunkingTransformProvider):\n", + " def get_splitter_transform(self) -> beam.PTransform[beam.PCollection[Dict[str, Any]], beam.PCollection[Chunk]]:\n", + " return 
beam.ParDo(DocumentSplitterDoFn())\n", + "\n", + "class IndexToVectorDBDoFn(beam.DoFn):\n", + " def __init__(self, collection_name: str, batch_size: int = 100):\n", + " self.collection_name = collection_name\n", + " self.batch_size = batch_size\n", + "\n", + " def setup(self):\n", + " self._client = MilvusClient(**milvus_connection_parameters.__dict__)\n", + "\n", + " def start_bundle(self):\n", + " self._batch = []\n", + "\n", + " def process(self, doc: Chunk):\n", + " doc_to_index = {\n", + " \"id\": doc.id,\n", + " \"content\": doc.content.text,\n", + " \"title_and_content\": f\"{doc.metadata['title']}. {doc.content.text}\",\n", + " \"metadata\": doc.metadata,\n", + " \"embedding\": doc.embedding.dense_embedding,\n", + " }\n", + " self._batch.append(doc_to_index)\n", "\n", - "for hits in search_results:\n", - " for hit in hits:\n", - " print(hit)\n", - " print(\"---\")" + " if len(self._batch) >= self.batch_size:\n", + " self._flush_batch()\n", + "\n", + " yield doc_to_index\n", + "\n", + " def finish_bundle(self):\n", + " if self._batch:\n", + " self._flush_batch()\n", + "\n", + " def _flush_batch(self):\n", + " if self._batch:\n", + " # Upsert API gives us a built-in idempotency over the insert API.\n", + " result = self._client.upsert(collection_name=self.collection_name, data=self._batch)\n", + " print(f\"Upserted batch of {len(self._batch)} documents. 
Result: {result}\")\n", + " self._batch = []\n", + "\n", + "huggingface_embedder = HuggingfaceTextEmbeddings(\n", + " model_name=embedding_model_config[\"name\"],\n", + " max_seq_length=embedding_model_config[\"token_limit\"])\n", + "\n", + "with beam.Pipeline() as pipeline:\n", + " data_transformed = (\n", + " pipeline\n", + " | 'Creating Documents' >> beam.Create(corpus)\n", + " | 'Converting to Chunks' >> MLTransform(\n", + " write_artifact_location=tempfile.mkdtemp()).with_transform(ChunkingTransformProvider())\n", + " | 'Generating Embeddings' >> MLTransform(\n", + " write_artifact_location=tempfile.mkdtemp()).with_transform(huggingface_embedder)\n", + " | 'Indexing to Vector DB' >> beam.ParDo(IndexToVectorDBDoFn(collection_name=collection_name))\n", + " )\n", + "\n", + "ib.show(data_transformed)" ] }, { @@ -1056,7 +1040,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 954, "id": "4911e8cc-10f1-4d21-9251-1b756b61f2c1", "metadata": {}, "outputs": [], @@ -1105,62 +1089,55 @@ ] }, { - "cell_type": "code", - "execution_count": 108, - "id": "dcbed23b-1fc2-4f89-a6d0-e05c15d5e655", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "MilvusConnectionParameters(uri='http://localhost:60085', user='', password='', db_id='default', token='', timeout=None, kwargs={})" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "markdown", + "id": "656110c9-1360-49fd-ba17-f55f2257f127", + "metadata": {}, "source": [ - "milvus_connection_parameters" + "### Vector Search" ] }, { "cell_type": "markdown", - "id": "656110c9-1360-49fd-ba17-f55f2257f127", + "id": "2d165518-b27b-40a8-ae0a-42342df3c1eb", "metadata": {}, "source": [ - "### Vector Search" + "Let’s choose a deliberate query that illustrates the unique benefits of pure vector search, especially its ability to grasp semantic meaning:\n", + "\n", + "Query: `How do I process large datasets 
efficiently?`\n", + "\n", + "This query demonstrates vector search advantages because:\n", + "\n", + "- **Dense vector (semantic) contribution:** The semantic component understands the conceptual intent of \"processing large datasets efficiently,\" connecting it to frameworks like **Apache Beam** and **Google Cloud Dataflow**, even if those terms aren't in the query.\n", + "- **Overcoming keyword limitations:** For conversational queries like this, traditional keyword search struggles. Vector search moves beyond exact lexical matching to find documents that semantically answer the \"how-to\" aspect.\n", + "- **Vector search advantage:** Documents describing solutions like **Apache Beam** (e.g., Document #1) rank highest. Vector search understands that Beam's \"unified programming model for defining and executing data processing pipelines\" directly addresses the query's need for efficient large dataset processing, even without an exact phrase match, by prioritizing based on deep semantic alignment." 
] }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 955, "id": "74db1238-0a04-4e08-818d-5bce8f09006b", "metadata": {}, "outputs": [], "source": [ - "query = encode_embedding(\"what is beam?\")" + "query = encode_embedding(\"How do I process large datasets efficiently?\")" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 956, "id": "79e16531-8bec-4b4b-9ed3-cebd705480e0", "metadata": {}, "outputs": [], "source": [ "search_parameters = MilvusSearchParameters(\n", " collection_name=collection_name,\n", - " search_strategy=VectorSearchParameters(limit=10, anns_field=\"vector\"),\n", - " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + " search_strategy=VectorSearchParameters(limit=10, anns_field=\"embedding\"),\n", + " output_fields=[\"metadata\",\"content\"])" ] }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 957, "id": "cbef1911-6464-4ba1-8974-ed00896c7e8b", "metadata": {}, "outputs": [], @@ -1170,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 958, "id": "f0481286-3f2b-4690-a2f6-a5a00de3ff34", "metadata": {}, "outputs": [], @@ -1183,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 959, "id": "35ee37f2-60cd-4d5d-aef6-aed4fda79161", "metadata": {}, "outputs": [ @@ -1195,113 +1172,123 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 460862423920279605,\n", - " \"distance\": 0.453,\n", + " \"id\": \"1_0\",\n", + " \"distance\": 0.3657,\n", " \"fields\": {\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", - " \"keywords\": [\n", - " \"Apache Beam\",\n", - " \"stream processing\",\n", - " \"batch processing\",\n", - " \"data pipelines\",\n", - " \"SDK\"\n", - " ],\n", - " \"tags\": [\n", - " \"Data Engineering\",\n", - " \"Open Source\",\n", - " \"Streaming\",\n", - " \"Batch\",\n", - " \"Big Data\"\n", - " ],\n", - " \"content\": \"Apache Beam is an 
open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n", + " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. 
Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", + " \"metadata\": {\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", + " \"keywords\": [\n", + " \"Apache Beam\",\n", + " \"stream processing\",\n", + " \"batch processing\",\n", + " \"data pipelines\",\n", + " \"SDK\"\n", + " ],\n", + " \"tags\": [\n", + " \"Data Engineering\",\n", + " \"Open Source\",\n", + " \"Streaming\",\n", + " \"Batch\",\n", + " \"Big Data\"\n", + " ]\n", + " }\n", " }\n", " },\n", " {\n", - " \"id\": 460862423920279606,\n", - " \"distance\": 0.4353,\n", + " \"id\": \"2_1\",\n", + " \"distance\": 0.3369,\n", " \"fields\": {\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", - " \"keywords\": [\n", - " \"Apache Beam\",\n", - " \"stream processing\",\n", - " \"batch processing\",\n", - " \"data pipelines\",\n", - " \"SDK\"\n", - " ],\n", - " \"tags\": [\n", - " \"Data Engineering\",\n", - " \"Open Source\",\n", - " \"Streaming\",\n", - " \"Batch\",\n", - " \"Big Data\"\n", - " ],\n", - " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. 
The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n", + " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. 
Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n", + " \"metadata\": {\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", + " \"keywords\": [\n", + " \"Google Cloud\",\n", + " \"Dataflow\",\n", + " \"Apache Beam\",\n", + " \"serverless\",\n", + " \"stream and batch\"\n", + " ],\n", + " \"tags\": [\n", + " \"Cloud Computing\",\n", + " \"Data Pipelines\",\n", + " \"Google Cloud\",\n", + " \"Serverless\",\n", + " \"Enterprise\"\n", + " ]\n", + " }\n", " }\n", " },\n", " {\n", - " \"id\": 460862423920279609,\n", - " \"distance\": 0.3927,\n", + " \"id\": \"2_0\",\n", + " \"distance\": 0.2918,\n", " \"fields\": {\n", - " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", - " \"keywords\": [\n", - " \"Google Beam\",\n", - " \"Project Starline\",\n", - " \"3D video\",\n", - " \"AI communication\",\n", - " \"real-time meetings\"\n", - " ],\n", - " \"tags\": [\n", - " \"AI\",\n", - " \"Communication\",\n", - " \"3D Technology\",\n", - " \"Remote Work\",\n", - " \"Enterprise Tech\"\n", - " ],\n", - " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. 
Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\n", + " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n", + " \"metadata\": {\n", + " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", + " \"keywords\": [\n", + " \"Google Cloud\",\n", + " \"Dataflow\",\n", + " \"Apache Beam\",\n", + " \"serverless\",\n", + " \"stream and batch\"\n", + " ],\n", + " \"tags\": [\n", + " \"Cloud Computing\",\n", + " \"Data Pipelines\",\n", + " \"Google Cloud\",\n", + " \"Serverless\",\n", + " \"Enterprise\"\n", + " ]\n", + " }\n", " }\n", " },\n", " {\n", - " \"id\": 460862423920279608,\n", - " \"distance\": 0.2925,\n", + " \"id\": \"1_1\",\n", + " \"distance\": 0.2638,\n", " \"fields\": {\n", - " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", - " \"keywords\": [\n", - " \"Google Cloud\",\n", - " \"Dataflow\",\n", - " \"Apache Beam\",\n", - " \"serverless\",\n", - " \"stream and batch\"\n", - " ],\n", - " \"tags\": [\n", - " \"Cloud Computing\",\n", - " \"Data Pipelines\",\n", - " \"Google Cloud\",\n", - " \"Serverless\",\n", - " \"Enterprise\"\n", - " ],\n", - " \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\n", + " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. 
The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n", + " \"metadata\": {\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", + " \"keywords\": [\n", + " \"Apache Beam\",\n", + " \"stream processing\",\n", + " \"batch processing\",\n", + " \"data pipelines\",\n", + " \"SDK\"\n", + " ],\n", + " \"tags\": [\n", + " \"Data Engineering\",\n", + " \"Open Source\",\n", + " \"Streaming\",\n", + " \"Batch\",\n", + " \"Big Data\"\n", + " ]\n", + " }\n", " }\n", " },\n", " {\n", - " \"id\": 460862423920279607,\n", - " \"distance\": 0.2342,\n", + " \"id\": \"3_0\",\n", + " \"distance\": 0.031,\n", " \"fields\": {\n", - " \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n", - " \"keywords\": [\n", - " \"Google Cloud\",\n", - " \"Dataflow\",\n", - " \"Apache Beam\",\n", - " \"serverless\",\n", - " \"stream and batch\"\n", - " ],\n", - " \"tags\": [\n", - " \"Cloud Computing\",\n", - " \"Data Pipelines\",\n", - " \"Google Cloud\",\n", - " \"Serverless\",\n", - " \"Enterprise\"\n", - " ],\n", - " \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. 
It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\n", + " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. 
Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n", + " \"metadata\": {\n", + " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", + " \"keywords\": [\n", + " \"Google Beam\",\n", + " \"Project Starline\",\n", + " \"3D video\",\n", + " \"AI communication\",\n", + " \"real-time meetings\"\n", + " ],\n", + " \"tags\": [\n", + " \"AI\",\n", + " \"Communication\",\n", + " \"3D Technology\",\n", + " \"Remote Work\",\n", + " \"Enterprise Tech\"\n", + " ]\n", + " }\n", " }\n", " }\n", " ]\n", @@ -1314,113 +1301,123 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"1_0\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3657\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"stream processing\"\u001b[0m,\n", - " \u001b[32m\"batch processing\"\u001b[0m,\n", - " \u001b[32m\"data pipelines\"\u001b[0m,\n", - " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Data 
Engineering\"\u001b[0m,\n", - " \u001b[32m\"Open Source\"\u001b[0m,\n", - " \u001b[32m\"Streaming\"\u001b[0m,\n", - " \u001b[32m\"Batch\"\u001b[0m,\n", - " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. 
Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: 
\u001b[1;36m460862423920279606\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"2_1\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3369\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"stream processing\"\u001b[0m,\n", - " \u001b[32m\"batch processing\"\u001b[0m,\n", - " \u001b[32m\"data pipelines\"\u001b[0m,\n", - " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Data Engineering\"\u001b[0m,\n", - " \u001b[32m\"Open Source\"\u001b[0m,\n", - " \u001b[32m\"Streaming\"\u001b[0m,\n", - " \u001b[32m\"Batch\"\u001b[0m,\n", - " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279609\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: 
\u001b[32m\"2_0\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2918\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Beam\"\u001b[0m,\n", - " \u001b[32m\"Project Starline\"\u001b[0m,\n", - " \u001b[32m\"3D video\"\u001b[0m,\n", - " \u001b[32m\"AI communication\"\u001b[0m,\n", - " \u001b[32m\"real-time meetings\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"AI\"\u001b[0m,\n", - " \u001b[32m\"Communication\"\u001b[0m,\n", - " \u001b[32m\"3D Technology\"\u001b[0m,\n", - " \u001b[32m\"Remote Work\"\u001b[0m,\n", - " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. 
Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279608\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2925\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"1_1\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2638\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Dataflow\"\u001b[0m,\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"serverless\"\u001b[0m,\n", - " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", - " \u001b[32m\"Data 
Pipelines\"\u001b[0m,\n", - " \u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Serverless\"\u001b[0m,\n", - " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279607\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2342\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"3_0\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.031\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Dataflow\"\u001b[0m,\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"serverless\"\u001b[0m,\n", - " \u001b[32m\"stream and batch\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", - " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", - " 
\u001b[32m\"Google Cloud\"\u001b[0m,\n", - " \u001b[32m\"Serverless\"\u001b[0m,\n", - " \u001b[32m\"Enterprise\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam\"\u001b[0m,\n", + " \u001b[32m\"Project Starline\"\u001b[0m,\n", + " \u001b[32m\"3D video\"\u001b[0m,\n", + " \u001b[32m\"AI communication\"\u001b[0m,\n", + " \u001b[32m\"real-time meetings\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"AI\"\u001b[0m,\n", + " \u001b[32m\"Communication\"\u001b[0m,\n", + " \u001b[32m\"3D Technology\"\u001b[0m,\n", + " \u001b[32m\"Remote Work\"\u001b[0m,\n", + " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -1448,42 +1445,58 @@ "### Keyword Search" ] }, + { + "cell_type": "markdown", + "id": "b30b29dc-0a59-4cff-b8a3-ace6e801b4da", + "metadata": {}, + "source": [ + "Let’s choose a deliberate query 
that illustrates the unique benefits of pure keyword search, especially its ability to pinpoint exact textual matches:\n", + "\n", + "Query: `Project Starline`\n", + "\n", + "This query demonstrates keyword search advantages because:\n", + "\n", + "- **Keyword (lexical) contribution:** The query, `Project Starline`, is an exact phrase. Keyword search is designed to prioritize and precisely match such literal strings, acting as an exact textual filter for specific product names or unique identifiers.\n", + "- **Overcoming vector limitations:** For a highly specific, proper noun like \"Project Starline\", pure vector search might struggle. It could semantically relate to other \"projects\" or \"communication technologies,\" potentially diluting the precision by not inherently prioritizing the exact string match over broader semantic similarity.\n", + "- **Keyword search advantage:** Only Document 3 (\"Google Beam: 3D Communication Powered by AI\") contains the exact phrase: `Google Beam is an innovative video communication platform that builds on the research of Project Starline.` A keyword search for \"Project Starline\" will exclusively and precisely retrieve Document 3, showcasing its unparalleled accuracy for factual lookups and named entities where the exact string is paramount.\n" + ] + }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 960, "id": "f159ad87-5153-48bb-87b3-3845d3c76420", "metadata": {}, "outputs": [], "source": [ - "query = \"what is beam?\"" + "query = \"Project Starline\"" ] }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 961, "id": "8b8cad3e-8a18-464b-8de6-aa4515a653c5", "metadata": {}, "outputs": [], "source": [ "search_parameters = MilvusSearchParameters(\n", " collection_name=collection_name,\n", - " search_strategy=KeywordSearchParameters(limit=10,anns_field=\"sparse_vector\"),\n", - " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + " 
search_strategy=KeywordSearchParameters(limit=10,anns_field=\"sparse_embedding\"),\n", + " output_fields=[\"metadata\",\"content\"])" ] }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 962, "id": "47cfc650-0b34-4333-9321-19be2e8fdc85", "metadata": {}, "outputs": [], "source": [ - "collection_load_parameters = MilvusCollectionLoadParameters() " + "collection_load_parameters = MilvusCollectionLoadParameters()" ] }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 963, "id": "4754763b-66bf-4f90-9920-28cef223b536", "metadata": {}, "outputs": [], @@ -1496,7 +1509,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 964, "id": "a3db4837-01c7-42d7-b4e8-58d8d361fe93", "metadata": {}, "outputs": [ @@ -1504,117 +1517,31 @@ "data": { "text/html": [ "
{\n",
-       "  \"query\": \"what is beam?\",\n",
+       "  \"query\": \"Project Starline\",\n",
        "  \"query_embedding\": null,\n",
        "  \"results\": [\n",
        "    {\n",
-       "      \"id\": 460862423920279606,\n",
-       "      \"distance\": 0.5657,\n",
+       "      \"id\": \"3_0\",\n",
+       "      \"distance\": 2.8536,\n",
        "      \"fields\": {\n",
-       "        \"keywords\": [\n",
-       "          \"Apache Beam\",\n",
-       "          \"stream processing\",\n",
-       "          \"batch processing\",\n",
-       "          \"data pipelines\",\n",
-       "          \"SDK\"\n",
-       "        ],\n",
-       "        \"tags\": [\n",
-       "          \"Data Engineering\",\n",
-       "          \"Open Source\",\n",
-       "          \"Streaming\",\n",
-       "          \"Batch\",\n",
-       "          \"Big Data\"\n",
-       "        ],\n",
-       "        \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n",
-       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n",
-       "      }\n",
-       "    },\n",
-       "    {\n",
-       "      \"id\": 460862423920279609,\n",
-       "      \"distance\": 0.5471,\n",
-       "      \"fields\": {\n",
-       "        \"keywords\": [\n",
-       "          \"Google Beam\",\n",
-       "          \"Project Starline\",\n",
-       "          \"3D video\",\n",
-       "          \"AI communication\",\n",
-       "          \"real-time meetings\"\n",
-       "        ],\n",
-       "        \"tags\": [\n",
-       "          \"AI\",\n",
-       "          \"Communication\",\n",
-       "          \"3D Technology\",\n",
-       "          \"Remote Work\",\n",
-       "          \"Enterprise Tech\"\n",
-       "        ],\n",
        "        \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n",
-       "        \"title\": \"Google Beam: 3D Communication Powered by AI\"\n",
-       "      }\n",
-       "    },\n",
-       "    {\n",
-       "      \"id\": 460862423920279605,\n",
-       "      \"distance\": 0.53,\n",
-       "      \"fields\": {\n",
-       "        \"keywords\": [\n",
-       "          \"Apache Beam\",\n",
-       "          \"stream processing\",\n",
-       "          \"batch processing\",\n",
-       "          \"data pipelines\",\n",
-       "          \"SDK\"\n",
-       "        ],\n",
-       "        \"tags\": [\n",
-       "          \"Data Engineering\",\n",
-       "          \"Open Source\",\n",
-       "          \"Streaming\",\n",
-       "          \"Batch\",\n",
-       "          \"Big Data\"\n",
-       "        ],\n",
-       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n",
-       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\"\n",
-       "      }\n",
-       "    },\n",
-       "    {\n",
-       "      \"id\": 460862423920279607,\n",
-       "      \"distance\": 0.5055,\n",
-       "      \"fields\": {\n",
-       "        \"keywords\": [\n",
-       "          \"Google Cloud\",\n",
-       "          \"Dataflow\",\n",
-       "          \"Apache Beam\",\n",
-       "          \"serverless\",\n",
-       "          \"stream and batch\"\n",
-       "        ],\n",
-       "        \"tags\": [\n",
-       "          \"Cloud Computing\",\n",
-       "          \"Data Pipelines\",\n",
-       "          \"Google Cloud\",\n",
-       "          \"Serverless\",\n",
-       "          \"Enterprise\"\n",
-       "        ],\n",
-       "        \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n",
-       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n",
-       "      }\n",
-       "    },\n",
-       "    {\n",
-       "      \"id\": 460862423920279608,\n",
-       "      \"distance\": 0.134,\n",
-       "      \"fields\": {\n",
-       "        \"keywords\": [\n",
-       "          \"Google Cloud\",\n",
-       "          \"Dataflow\",\n",
-       "          \"Apache Beam\",\n",
-       "          \"serverless\",\n",
-       "          \"stream and batch\"\n",
-       "        ],\n",
-       "        \"tags\": [\n",
-       "          \"Cloud Computing\",\n",
-       "          \"Data Pipelines\",\n",
-       "          \"Google Cloud\",\n",
-       "          \"Serverless\",\n",
-       "          \"Enterprise\"\n",
-       "        ],\n",
-       "        \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n",
-       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Google Beam: 3D Communication Powered by AI\",\n",
+       "          \"keywords\": [\n",
+       "            \"Google Beam\",\n",
+       "            \"Project Starline\",\n",
+       "            \"3D video\",\n",
+       "            \"AI communication\",\n",
+       "            \"real-time meetings\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"AI\",\n",
+       "            \"Communication\",\n",
+       "            \"3D Technology\",\n",
+       "            \"Remote Work\",\n",
+       "            \"Enterprise Tech\"\n",
+       "          ]\n",
+       "        }\n",
        "      }\n",
        "    }\n",
        "  ]\n",
@@ -1623,117 +1550,31 @@
       ],
       "text/plain": [
        "\u001b[1m{\u001b[0m\n",
-       "  \u001b[1;34m\"query\"\u001b[0m: \u001b[32m\"what is beam?\"\u001b[0m,\n",
+       "  \u001b[1;34m\"query\"\u001b[0m: \u001b[32m\"Project Starline\"\u001b[0m,\n",
        "  \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n",
        "  \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n",
        "    \u001b[1m{\u001b[0m\n",
-       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279606\u001b[0m,\n",
-       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5657\u001b[0m,\n",
-       "      \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n",
-       "        \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Apache Beam\"\u001b[0m,\n",
-       "          \u001b[32m\"stream processing\"\u001b[0m,\n",
-       "          \u001b[32m\"batch processing\"\u001b[0m,\n",
-       "          \u001b[32m\"data pipelines\"\u001b[0m,\n",
-       "          \u001b[32m\"SDK\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Data Engineering\"\u001b[0m,\n",
-       "          \u001b[32m\"Open Source\"\u001b[0m,\n",
-       "          \u001b[32m\"Streaming\"\u001b[0m,\n",
-       "          \u001b[32m\"Batch\"\u001b[0m,\n",
-       "          \u001b[32m\"Big Data\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n",
-       "        \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n",
-       "      \u001b[1m}\u001b[0m\n",
-       "    \u001b[1m}\u001b[0m,\n",
-       "    \u001b[1m{\u001b[0m\n",
-       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279609\u001b[0m,\n",
-       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5471\u001b[0m,\n",
+       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"3_0\"\u001b[0m,\n",
+       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m2.8536\u001b[0m,\n",
        "      \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n",
-       "        \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Google Beam\"\u001b[0m,\n",
-       "          \u001b[32m\"Project Starline\"\u001b[0m,\n",
-       "          \u001b[32m\"3D video\"\u001b[0m,\n",
-       "          \u001b[32m\"AI communication\"\u001b[0m,\n",
-       "          \u001b[32m\"real-time meetings\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"AI\"\u001b[0m,\n",
-       "          \u001b[32m\"Communication\"\u001b[0m,\n",
-       "          \u001b[32m\"3D Technology\"\u001b[0m,\n",
-       "          \u001b[32m\"Remote Work\"\u001b[0m,\n",
-       "          \u001b[32m\"Enterprise Tech\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
        "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n",
-       "        \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n",
-       "      \u001b[1m}\u001b[0m\n",
-       "    \u001b[1m}\u001b[0m,\n",
-       "    \u001b[1m{\u001b[0m\n",
-       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n",
-       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.53\u001b[0m,\n",
-       "      \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n",
-       "        \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Apache Beam\"\u001b[0m,\n",
-       "          \u001b[32m\"stream processing\"\u001b[0m,\n",
-       "          \u001b[32m\"batch processing\"\u001b[0m,\n",
-       "          \u001b[32m\"data pipelines\"\u001b[0m,\n",
-       "          \u001b[32m\"SDK\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Data Engineering\"\u001b[0m,\n",
-       "          \u001b[32m\"Open Source\"\u001b[0m,\n",
-       "          \u001b[32m\"Streaming\"\u001b[0m,\n",
-       "          \u001b[32m\"Batch\"\u001b[0m,\n",
-       "          \u001b[32m\"Big Data\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n",
-       "        \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m\n",
-       "      \u001b[1m}\u001b[0m\n",
-       "    \u001b[1m}\u001b[0m,\n",
-       "    \u001b[1m{\u001b[0m\n",
-       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279607\u001b[0m,\n",
-       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5055\u001b[0m,\n",
-       "      \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n",
-       "        \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Google Cloud\"\u001b[0m,\n",
-       "          \u001b[32m\"Dataflow\"\u001b[0m,\n",
-       "          \u001b[32m\"Apache Beam\"\u001b[0m,\n",
-       "          \u001b[32m\"serverless\"\u001b[0m,\n",
-       "          \u001b[32m\"stream and batch\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Cloud Computing\"\u001b[0m,\n",
-       "          \u001b[32m\"Data Pipelines\"\u001b[0m,\n",
-       "          \u001b[32m\"Google Cloud\"\u001b[0m,\n",
-       "          \u001b[32m\"Serverless\"\u001b[0m,\n",
-       "          \u001b[32m\"Enterprise\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n",
-       "        \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n",
-       "      \u001b[1m}\u001b[0m\n",
-       "    \u001b[1m}\u001b[0m,\n",
-       "    \u001b[1m{\u001b[0m\n",
-       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279608\u001b[0m,\n",
-       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.134\u001b[0m,\n",
-       "      \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n",
-       "        \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Google Cloud\"\u001b[0m,\n",
-       "          \u001b[32m\"Dataflow\"\u001b[0m,\n",
-       "          \u001b[32m\"Apache Beam\"\u001b[0m,\n",
-       "          \u001b[32m\"serverless\"\u001b[0m,\n",
-       "          \u001b[32m\"stream and batch\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Cloud Computing\"\u001b[0m,\n",
-       "          \u001b[32m\"Data Pipelines\"\u001b[0m,\n",
-       "          \u001b[32m\"Google Cloud\"\u001b[0m,\n",
-       "          \u001b[32m\"Serverless\"\u001b[0m,\n",
-       "          \u001b[32m\"Enterprise\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n",
-       "        \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m\n",
+       "        \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n",
+       "          \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n",
+       "          \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
+       "            \u001b[32m\"Google Beam\"\u001b[0m,\n",
+       "            \u001b[32m\"Project Starline\"\u001b[0m,\n",
+       "            \u001b[32m\"3D video\"\u001b[0m,\n",
+       "            \u001b[32m\"AI communication\"\u001b[0m,\n",
+       "            \u001b[32m\"real-time meetings\"\u001b[0m\n",
+       "          \u001b[1m]\u001b[0m,\n",
+       "          \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
+       "            \u001b[32m\"AI\"\u001b[0m,\n",
+       "            \u001b[32m\"Communication\"\u001b[0m,\n",
+       "            \u001b[32m\"3D Technology\"\u001b[0m,\n",
+       "            \u001b[32m\"Remote Work\"\u001b[0m,\n",
+       "            \u001b[32m\"Enterprise Tech\"\u001b[0m\n",
+       "          \u001b[1m]\u001b[0m\n",
+       "        \u001b[1m}\u001b[0m\n",
        "      \u001b[1m}\u001b[0m\n",
        "    \u001b[1m}\u001b[0m\n",
        "  \u001b[1m]\u001b[0m\n",
@@ -1764,23 +1605,23 @@
   },
   {
    "cell_type": "markdown",
-   "id": "4afec961-71ae-49cc-85ac-2b88eff6b23b",
+   "id": "e65b2158-5dce-46d1-80de-3c8047419224",
    "metadata": {},
    "source": [
-    "Let’s choose a deliberate query that illustrates the benefits of hybrid search:\n",
+    "Let’s choose a deliberate query that illustrates the unique benefits of hybrid search:\n",
     "\n",
-    "Query: \"real-time data processing systems\"\n",
+    "Query: `real-time data processing systems`\n",
     "\n",
     "This query demonstrates hybrid search advantages because:\n",
     "\n",
-    "* Dense vector (semantic) contribution: Will understand the conceptual relationship between \"real-time processing\" and \"streaming\" (found in docs #1 and #2)\n",
-    "* Sparse vector (keyword) contribution: Will match exact terms like \"data\" and \"processing\" (found in docs #1 and #2)\n",
-    "* Hybrid advantage: Document #1 about Apache Beam should rank highest since it contains more specific technical details about real-time processing capabilities like \"event time,\" \"triggers,\" and \"stateful processing\" - even though the exact phrase \"real-time data processing\" doesn't appear in any document"
+    "* **Dense vector (semantic) contribution:** Will understand the conceptual relationship between \"real-time processing\" and \"streaming\" (found in docs #1 and #2)\n",
+    "* **Sparse vector (keyword) contribution:** Will match exact terms like \"data\" and \"processing\" (found in docs #1 and #2)\n",
+    "* **Hybrid advantage:** Document #1 about Apache Beam should rank highest since it contains more specific technical details about real-time processing capabilities like \"event time,\" \"triggers,\" and \"stateful processing\" - even though the exact phrase \"real-time data processing\" doesn't appear in any document"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 965,
    "id": "172b6c80-2a03-49d0-afc7-12bb0a4dc989",
    "metadata": {},
    "outputs": [],
@@ -1791,21 +1632,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 966,
    "id": "eb6d951c-0def-45cc-84a4-b6f7b7575f23",
    "metadata": {},
    "outputs": [],
    "source": [
     "hybrid_search_parameters = HybridSearchParameters(\n",
-    "    vector=VectorSearchParameters(limit=10,anns_field=\"vector\"),\n",
-    "    keyword=KeywordSearchParameters(limit=10,anns_field=\"sparse_vector\"),\n",
+    "    vector=VectorSearchParameters(limit=10,anns_field=\"embedding\"),\n",
+    "    keyword=KeywordSearchParameters(limit=10,anns_field=\"sparse_embedding\"),\n",
     "    ranker=RRFRanker(3),\n",
     "    limit=2)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 967,
    "id": "b339c498-d229-42e6-b439-b29eb107b533",
    "metadata": {},
    "outputs": [],
@@ -1813,12 +1654,12 @@
     "search_parameters = MilvusSearchParameters(\n",
     "    collection_name=collection_name,\n",
     "    search_strategy=hybrid_search_parameters,\n",
-    "    output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])"
+    "    output_fields=[\"metadata\", \"content\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 968,
    "id": "b346abe6-03c9-4b28-a0fb-74936b9f3a06",
    "metadata": {},
    "outputs": [],
@@ -1828,7 +1669,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 969,
    "id": "ab27810d-40a8-4b6a-bc82-441e13763ebc",
    "metadata": {},
    "outputs": [],
@@ -1841,7 +1682,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 970,
    "id": "9a37aa5b-d652-4dd3-9fe0-e277182415b9",
    "metadata": {},
    "outputs": [],
@@ -1856,7 +1697,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 971,
    "id": "ea9d84f7-d142-4afa-9a6f-6c310d9604b0",
    "metadata": {},
    "outputs": [
@@ -1868,47 +1709,51 @@
        "  \"query_embedding\": 384,\n",
        "  \"results\": [\n",
        "    {\n",
-       "      \"id\": 460862423920279605,\n",
-       "      \"distance\": 0.5,\n",
+       "      \"id\": \"1_0\",\n",
+       "      \"distance\": 0.45,\n",
        "      \"fields\": {\n",
-       "        \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
-       "        \"keywords\": [\n",
-       "          \"Apache Beam\",\n",
-       "          \"stream processing\",\n",
-       "          \"batch processing\",\n",
-       "          \"data pipelines\",\n",
-       "          \"SDK\"\n",
-       "        ],\n",
-       "        \"tags\": [\n",
-       "          \"Data Engineering\",\n",
-       "          \"Open Source\",\n",
-       "          \"Streaming\",\n",
-       "          \"Batch\",\n",
-       "          \"Big Data\"\n",
-       "        ],\n",
-       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n",
+       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
+       "          \"keywords\": [\n",
+       "            \"Apache Beam\",\n",
+       "            \"stream processing\",\n",
+       "            \"batch processing\",\n",
+       "            \"data pipelines\",\n",
+       "            \"SDK\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"Data Engineering\",\n",
+       "            \"Open Source\",\n",
+       "            \"Streaming\",\n",
+       "            \"Batch\",\n",
+       "            \"Big Data\"\n",
+       "          ]\n",
+       "        }\n",
        "      }\n",
        "    },\n",
        "    {\n",
-       "      \"id\": 460862423920279607,\n",
-       "      \"distance\": 0.3667,\n",
+       "      \"id\": \"2_1\",\n",
+       "      \"distance\": 0.3929,\n",
        "      \"fields\": {\n",
-       "        \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n",
-       "        \"keywords\": [\n",
-       "          \"Google Cloud\",\n",
-       "          \"Dataflow\",\n",
-       "          \"Apache Beam\",\n",
-       "          \"serverless\",\n",
-       "          \"stream and batch\"\n",
-       "        ],\n",
-       "        \"tags\": [\n",
-       "          \"Cloud Computing\",\n",
-       "          \"Data Pipelines\",\n",
-       "          \"Google Cloud\",\n",
-       "          \"Serverless\",\n",
-       "          \"Enterprise\"\n",
-       "        ],\n",
-       "        \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\n",
+       "        \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n",
+       "          \"keywords\": [\n",
+       "            \"Google Cloud\",\n",
+       "            \"Dataflow\",\n",
+       "            \"Apache Beam\",\n",
+       "            \"serverless\",\n",
+       "            \"stream and batch\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"Cloud Computing\",\n",
+       "            \"Data Pipelines\",\n",
+       "            \"Google Cloud\",\n",
+       "            \"Serverless\",\n",
+       "            \"Enterprise\"\n",
+       "          ]\n",
+       "        }\n",
        "      }\n",
        "    }\n",
        "  ]\n",
@@ -1921,47 +1766,51 @@
        "  \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n",
        "  \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n",
        "    \u001b[1m{\u001b[0m\n",
-       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n",
-       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.5\u001b[0m,\n",
+       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"1_0\"\u001b[0m,\n",
+       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.45\u001b[0m,\n",
        "      \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n",
-       "        \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n",
-       "        \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Apache Beam\"\u001b[0m,\n",
-       "          \u001b[32m\"stream processing\"\u001b[0m,\n",
-       "          \u001b[32m\"batch processing\"\u001b[0m,\n",
-       "          \u001b[32m\"data pipelines\"\u001b[0m,\n",
-       "          \u001b[32m\"SDK\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Data Engineering\"\u001b[0m,\n",
-       "          \u001b[32m\"Open Source\"\u001b[0m,\n",
-       "          \u001b[32m\"Streaming\"\u001b[0m,\n",
-       "          \u001b[32m\"Batch\"\u001b[0m,\n",
-       "          \u001b[32m\"Big Data\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n",
+       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n",
+       "        \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n",
+       "          \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n",
+       "          \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
+       "            \u001b[32m\"Apache Beam\"\u001b[0m,\n",
+       "            \u001b[32m\"stream processing\"\u001b[0m,\n",
+       "            \u001b[32m\"batch processing\"\u001b[0m,\n",
+       "            \u001b[32m\"data pipelines\"\u001b[0m,\n",
+       "            \u001b[32m\"SDK\"\u001b[0m\n",
+       "          \u001b[1m]\u001b[0m,\n",
+       "          \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
+       "            \u001b[32m\"Data Engineering\"\u001b[0m,\n",
+       "            \u001b[32m\"Open Source\"\u001b[0m,\n",
+       "            \u001b[32m\"Streaming\"\u001b[0m,\n",
+       "            \u001b[32m\"Batch\"\u001b[0m,\n",
+       "            \u001b[32m\"Big Data\"\u001b[0m\n",
+       "          \u001b[1m]\u001b[0m\n",
+       "        \u001b[1m}\u001b[0m\n",
        "      \u001b[1m}\u001b[0m\n",
        "    \u001b[1m}\u001b[0m,\n",
        "    \u001b[1m{\u001b[0m\n",
-       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279607\u001b[0m,\n",
-       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3667\u001b[0m,\n",
+       "      \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"2_1\"\u001b[0m,\n",
+       "      \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3929\u001b[0m,\n",
        "      \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n",
-       "        \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n",
-       "        \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Google Cloud\"\u001b[0m,\n",
-       "          \u001b[32m\"Dataflow\"\u001b[0m,\n",
-       "          \u001b[32m\"Apache Beam\"\u001b[0m,\n",
-       "          \u001b[32m\"serverless\"\u001b[0m,\n",
-       "          \u001b[32m\"stream and batch\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
-       "          \u001b[32m\"Cloud Computing\"\u001b[0m,\n",
-       "          \u001b[32m\"Data Pipelines\"\u001b[0m,\n",
-       "          \u001b[32m\"Google Cloud\"\u001b[0m,\n",
-       "          \u001b[32m\"Serverless\"\u001b[0m,\n",
-       "          \u001b[32m\"Enterprise\"\u001b[0m\n",
-       "        \u001b[1m]\u001b[0m,\n",
-       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m\n",
+       "        \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n",
+       "        \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n",
+       "          \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n",
+       "          \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n",
+       "            \u001b[32m\"Google Cloud\"\u001b[0m,\n",
+       "            \u001b[32m\"Dataflow\"\u001b[0m,\n",
+       "            \u001b[32m\"Apache Beam\"\u001b[0m,\n",
+       "            \u001b[32m\"serverless\"\u001b[0m,\n",
+       "            \u001b[32m\"stream and batch\"\u001b[0m\n",
+       "          \u001b[1m]\u001b[0m,\n",
+       "          \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n",
+       "            \u001b[32m\"Cloud Computing\"\u001b[0m,\n",
+       "            \u001b[32m\"Data Pipelines\"\u001b[0m,\n",
+       "            \u001b[32m\"Google Cloud\"\u001b[0m,\n",
+       "            \u001b[32m\"Serverless\"\u001b[0m,\n",
+       "            \u001b[32m\"Enterprise\"\u001b[0m\n",
+       "          \u001b[1m]\u001b[0m\n",
+       "        \u001b[1m}\u001b[0m\n",
        "      \u001b[1m}\u001b[0m\n",
        "    \u001b[1m}\u001b[0m\n",
        "  \u001b[1m]\u001b[0m\n",
@@ -1992,16 +1841,34 @@
   },
   {
    "cell_type": "markdown",
-   "id": "cb72f9c6-5a29-4810-9768-574aa7ea5128",
+   "id": "0fdd049f-e856-4fa8-b3df-1498b973946b",
+   "metadata": {},
+   "source": [
+    "When a user queries `what is beam?` using a **vector search strategy**, the semantic nature of **vector embeddings** can lead to ambiguity. Without additional context, the system might confuse **Google Beam** (a 3D communication platform) with **Apache Beam** (a data processing framework).\n",
+    "\n",
+    "**Metadata filtering** directly solves this by adding contextual constraints. For instance, applying a **specific metadata filter** (e.g., `{\"category\": \"computing\"}` or `{\"domain\": \"communication\"}`) before the vector search ensures that only documents relevant to the intended concept are considered. This dramatically narrows down results, enhances search precision, and overcomes the limitations of pure content-based search by disambiguating terms like \"beam\" with specific, structured criteria."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c96898d-af2d-4401-a9ca-8d230fa95e6e",
    "metadata": {},
    "source": [
-    "#### Searching for Apache Beam"
+    "#### Without Filtered Search"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2e549b22-256e-44c8-9638-eafc3a844770",
+   "metadata": {},
+   "source": [
+    "As seen in the search results below, when a user searches for `what is beam?` without applying filters, the search results include both `Apache Beam` and `Google Beam`. Filtered search can come into play here by limiting the results to only the relevant ones."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
-   "id": "6e79ef5c-a121-4e69-9089-0991821f8745",
+   "execution_count": 972,
+   "id": "1d9217d3-777e-4363-9000-b1de8a989664",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2010,33 +1877,368 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
-   "id": "ebbcbbe8-f63d-4ff4-9160-719a0fbe9b06",
+   "execution_count": 973,
+   "id": "28a45b1c-f9a5-452e-aea6-ac46f17e01bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_parameters = MilvusSearchParameters(\n",
+    "    collection_name=collection_name,\n",
+    "    search_strategy=VectorSearchParameters(\n",
+    "        limit=10,\n",
+    "        anns_field=\"embedding\",\n",
+    "    ),\n",
+    "    output_fields=[\"metadata\",\"content\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 974,
+   "id": "9ce3f0c7-fd1d-49a1-81e9-b8153cd284ea",
    "metadata": {},
    "outputs": [],
    "source": [
-    "vector_search_parameters = VectorSearchParameters(\n",
-    "    filter=\"ARRAY_CONTAINS(keywords, 'data pipelines')\",\n",
-    "    limit=10,\n",
-    "    anns_field=\"vector\")"
+    "collection_load_parameters = MilvusCollectionLoadParameters() "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 975,
+   "id": "6fad29b5-c2b0-4458-ab83-b38eb15a7505",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "milvus_handler = MilvusSearchEnrichmentHandler(\n",
+    "    connection_parameters=milvus_connection_parameters,\n",
+    "    search_parameters=search_parameters,\n",
+    "    collection_load_parameters=collection_load_parameters)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 976,
+   "id": "77add8a8-ddb8-48de-b1af-632d78c0d112",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
{\n",
+       "  \"query\": null,\n",
+       "  \"query_embedding\": 384,\n",
+       "  \"results\": [\n",
+       "    {\n",
+       "      \"id\": \"1_0\",\n",
+       "      \"distance\": 0.4598,\n",
+       "      \"fields\": {\n",
+       "        \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
+       "          \"keywords\": [\n",
+       "            \"Apache Beam\",\n",
+       "            \"stream processing\",\n",
+       "            \"batch processing\",\n",
+       "            \"data pipelines\",\n",
+       "            \"SDK\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"Data Engineering\",\n",
+       "            \"Open Source\",\n",
+       "            \"Streaming\",\n",
+       "            \"Batch\",\n",
+       "            \"Big Data\"\n",
+       "          ]\n",
+       "        }\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": \"1_1\",\n",
+       "      \"distance\": 0.4353,\n",
+       "      \"fields\": {\n",
+       "        \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n",
+       "          \"keywords\": [\n",
+       "            \"Apache Beam\",\n",
+       "            \"stream processing\",\n",
+       "            \"batch processing\",\n",
+       "            \"data pipelines\",\n",
+       "            \"SDK\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"Data Engineering\",\n",
+       "            \"Open Source\",\n",
+       "            \"Streaming\",\n",
+       "            \"Batch\",\n",
+       "            \"Big Data\"\n",
+       "          ]\n",
+       "        }\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": \"3_0\",\n",
+       "      \"distance\": 0.3927,\n",
+       "      \"fields\": {\n",
+       "        \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\",\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Google Beam: 3D Communication Powered by AI\",\n",
+       "          \"keywords\": [\n",
+       "            \"Google Beam\",\n",
+       "            \"Project Starline\",\n",
+       "            \"3D video\",\n",
+       "            \"AI communication\",\n",
+       "            \"real-time meetings\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"AI\",\n",
+       "            \"Communication\",\n",
+       "            \"3D Technology\",\n",
+       "            \"Remote Work\",\n",
+       "            \"Enterprise Tech\"\n",
+       "          ]\n",
+       "        }\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": \"2_1\",\n",
+       "      \"distance\": 0.2925,\n",
+       "      \"fields\": {\n",
+       "        \"content\": \"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\",\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n",
+       "          \"keywords\": [\n",
+       "            \"Google Cloud\",\n",
+       "            \"Dataflow\",\n",
+       "            \"Apache Beam\",\n",
+       "            \"serverless\",\n",
+       "            \"stream and batch\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"Cloud Computing\",\n",
+       "            \"Data Pipelines\",\n",
+       "            \"Google Cloud\",\n",
+       "            \"Serverless\",\n",
+       "            \"Enterprise\"\n",
+       "          ]\n",
+       "        }\n",
+       "      }\n",
+       "    },\n",
+       "    {\n",
+       "      \"id\": \"2_0\",\n",
+       "      \"distance\": 0.2342,\n",
+       "      \"fields\": {\n",
+       "        \"content\": \"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\",\n",
+       "        \"metadata\": {\n",
+       "          \"title\": \"Google Cloud Dataflow: Run Apache Beam in the Cloud\",\n",
+       "          \"keywords\": [\n",
+       "            \"Google Cloud\",\n",
+       "            \"Dataflow\",\n",
+       "            \"Apache Beam\",\n",
+       "            \"serverless\",\n",
+       "            \"stream and batch\"\n",
+       "          ],\n",
+       "          \"tags\": [\n",
+       "            \"Cloud Computing\",\n",
+       "            \"Data Pipelines\",\n",
+       "            \"Google Cloud\",\n",
+       "            \"Serverless\",\n",
+       "            \"Enterprise\"\n",
+       "          ]\n",
+       "        }\n",
+       "      }\n",
+       "    }\n",
+       "  ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"query\"\u001b[0m: \u001b[3;35mnull\u001b[0m,\n", + " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", + " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"1_0\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4598\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"1_1\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. 
Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"3_0\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. 
This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam\"\u001b[0m,\n", + " \u001b[32m\"Project Starline\"\u001b[0m,\n", + " \u001b[32m\"3D video\"\u001b[0m,\n", + " \u001b[32m\"AI communication\"\u001b[0m,\n", + " \u001b[32m\"real-time meetings\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"AI\"\u001b[0m,\n", + " \u001b[32m\"Communication\"\u001b[0m,\n", + " \u001b[32m\"3D Technology\"\u001b[0m,\n", + " \u001b[32m\"Remote Work\"\u001b[0m,\n", + " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"2_1\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2925\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: 
\u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"For developers, Dataflow provides local testing capabilities and a unified logging system through Cloud Logging. It also supports SQL-based pipeline definitions using BigQuery, which lowers the barrier to entry for analysts and data engineers. Dataflow’s streaming engine significantly improves performance and reduces costs by decoupling compute and state management. In summary, Google Cloud Dataflow not only simplifies the deployment of Apache Beam pipelines but also enhances them with cloud-native features. Its managed runtime, high availability, and integration with the broader Google Cloud ecosystem make it a powerful tool for modern data processing.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"2_0\"\u001b[0m,\n", + " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.2342\u001b[0m,\n", + " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow is a fully managed service that runs Apache Beam pipelines 
in the cloud. It abstracts away infrastructure management and handles dynamic scaling, load balancing, and fault tolerance. Developers can focus on writing data logic using the Beam SDK and deploy it easily to Google Cloud. Dataflow supports both batch and stream processing and integrates seamlessly with other Google services like BigQuery, Pub/Sub, and Cloud Storage. Its autoscaling capabilities allow it to adapt to changing data volumes, optimizing for cost and performance. Features like monitoring dashboards, job templates, and built-in logging make it suitable for both development and production use. With support for event time processing, stateful functions, and windowing, Dataflow is well-suited for real-time analytics and data transformation tasks. It’s a key component for architects building scalable, cloud-native data platforms. Dataflow also offers templates for common ETL tasks, helping teams get started quickly with minimal setup. Its integration with Cloud Functions and Cloud Composer enables event-driven and orchestrated workflows. 
Security and compliance are built-in with IAM roles, encryption at rest and in transit, and audit logging, making it suitable for enterprise environments.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Cloud Dataflow: Run Apache Beam in the Cloud\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Dataflow\"\u001b[0m,\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"serverless\"\u001b[0m,\n", + " \u001b[32m\"stream and batch\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Cloud Computing\"\u001b[0m,\n", + " \u001b[32m\"Data Pipelines\"\u001b[0m,\n", + " \u001b[32m\"Google Cloud\"\u001b[0m,\n", + " \u001b[32m\"Serverless\"\u001b[0m,\n", + " \u001b[32m\"Enterprise\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "with beam.Pipeline() as p:\n", + " _ = (\n", + " p\n", + " | \"Create\" >> beam.Create([Chunk(content=Content(),embedding=Embedding(dense_embedding=query))])\n", + " | \"Enrich W/ Milvus Vector Search\" >> Enrichment(milvus_handler)\n", + " | \"Format and Print Results\" >> FormatAndPrintResults())" + ] + }, + { + "cell_type": "markdown", + "id": "cb72f9c6-5a29-4810-9768-574aa7ea5128", + "metadata": {}, + "source": [ + "#### Searching for Apache Beam with Filtered Search" + ] + }, + { + "cell_type": "markdown", + "id": "df64b70f-bad8-469f-8419-723911f7f7cf", + "metadata": {}, + "source": [ + "To precisely target **Apache Beam** and ensure the retrieval of only relevant documents, we can leverage the power of **metadata filtering**. 
By applying a filter that specifies the document's `keywords` must contain `data pipelines`, we can instruct the undelrying search engine to exclude any documents related to `Google Beam` from the result set. This allows the vector search to operate on a pre-filtered, highly relevant subset of the corpus, guaranteeing that the retrieved information pertains exclusively to `Apache Beam`'s domain, thereby resolving the semantic ambiguity with remarkable precision." + ] + }, + { + "cell_type": "code", + "execution_count": 977, + "id": "6e79ef5c-a121-4e69-9089-0991821f8745", + "metadata": {}, + "outputs": [], + "source": [ + "query = encode_embedding(\"what is beam?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 978, "id": "5314c531-14bb-4d81-92a5-fcf9cca7fa81", "metadata": {}, "outputs": [], "source": [ "search_parameters = MilvusSearchParameters(\n", " collection_name=collection_name,\n", - " search_strategy=VectorSearchParameters(filter=\"ARRAY_CONTAINS(keywords, 'data pipelines')\",limit=10,anns_field=\"vector\"),\n", - " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + " search_strategy=VectorSearchParameters(\n", + " filter=\"ARRAY_CONTAINS(metadata['keywords'], 'data pipelines')\",\n", + " limit=10,\n", + " anns_field=\"embedding\",\n", + " ),\n", + " output_fields=[\"metadata\",\"content\"])" ] }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 979, "id": "0ecf2ac6-cf90-4ce7-b17f-113af90ab950", "metadata": {}, "outputs": [], @@ -2046,7 +2248,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 980, "id": "0cd92b69-b9dc-445c-9bd7-21bb3ceb0fd3", "metadata": {}, "outputs": [], @@ -2059,7 +2261,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 981, "id": "b06ecf64-c314-4c6a-ae1a-4fdf059aeead", "metadata": {}, "outputs": [ @@ -2071,47 +2273,51 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 460862423920279605,\n", - " \"distance\": 
0.453,\n", + " \"id\": \"1_0\",\n", + " \"distance\": 0.4598,\n", " \"fields\": {\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", - " \"keywords\": [\n", - " \"Apache Beam\",\n", - " \"stream processing\",\n", - " \"batch processing\",\n", - " \"data pipelines\",\n", - " \"SDK\"\n", - " ],\n", - " \"tags\": [\n", - " \"Data Engineering\",\n", - " \"Open Source\",\n", - " \"Streaming\",\n", - " \"Batch\",\n", - " \"Big Data\"\n", - " ],\n", - " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\n", + " \"content\": \"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. 
Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\",\n", + " \"metadata\": {\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", + " \"keywords\": [\n", + " \"Apache Beam\",\n", + " \"stream processing\",\n", + " \"batch processing\",\n", + " \"data pipelines\",\n", + " \"SDK\"\n", + " ],\n", + " \"tags\": [\n", + " \"Data Engineering\",\n", + " \"Open Source\",\n", + " \"Streaming\",\n", + " \"Batch\",\n", + " \"Big Data\"\n", + " ]\n", + " }\n", " }\n", " },\n", " {\n", - " \"id\": 460862423920279606,\n", + " \"id\": \"1_1\",\n", " \"distance\": 0.4353,\n", " \"fields\": {\n", - " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", - " \"keywords\": [\n", - " \"Apache Beam\",\n", - " \"stream processing\",\n", - " \"batch processing\",\n", - " \"data pipelines\",\n", - " \"SDK\"\n", - " ],\n", - " \"tags\": [\n", - " \"Data Engineering\",\n", - " \"Open Source\",\n", - " \"Streaming\",\n", - " \"Batch\",\n", - " \"Big Data\"\n", - " ],\n", - " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\n", + " \"content\": \"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\",\n", + " \"metadata\": {\n", + " \"title\": \"Apache Beam: Unified Model for Batch and Streaming Data\",\n", + " \"keywords\": [\n", + " \"Apache Beam\",\n", + " \"stream processing\",\n", + " \"batch processing\",\n", + " \"data pipelines\",\n", + " \"SDK\"\n", + " ],\n", + " \"tags\": [\n", + " \"Data Engineering\",\n", + " \"Open Source\",\n", + " \"Streaming\",\n", + " \"Batch\",\n", + " \"Big Data\"\n", + " ]\n", + " }\n", " }\n", " }\n", " ]\n", @@ -2124,47 +2330,51 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279605\u001b[0m,\n", - " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.453\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"1_0\"\u001b[0m,\n", + " 
\u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4598\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"stream processing\"\u001b[0m,\n", - " \u001b[32m\"batch processing\"\u001b[0m,\n", - " \u001b[32m\"data pipelines\"\u001b[0m,\n", - " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Data Engineering\"\u001b[0m,\n", - " \u001b[32m\"Open Source\"\u001b[0m,\n", - " \u001b[32m\"Streaming\"\u001b[0m,\n", - " \u001b[32m\"Batch\"\u001b[0m,\n", - " \u001b[32m\"Big Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. 
Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Apache Beam is an open-source framework that provides a consistent programming model for both batch and streaming data processing. Developed originally by Google, it allows developers to write pipelines that can run on multiple engines, such as Apache Flink, Spark, and Google Cloud Dataflow. Beam uses abstractions like PCollections (data containers) and PTransforms (operations) to define the flow of data. The framework promotes portability through its runner architecture, letting the same pipeline execute on different backends. Support for multiple SDKs, including Java and Python, makes it accessible for a broad audience. Key features include support for event time, windowing, triggers, and stateful processing, which are essential for handling real-time data effectively. Beam is ideal for building ETL jobs, real-time analytics, and machine learning data pipelines. It helps teams focus on logic rather than infrastructure, offering flexibility and scalability in handling unbounded and bounded data sources. Apache Beam also supports a wide range of connectors for both input and output, including Kafka, BigQuery, and JDBC-based systems. This makes it easy to integrate Beam into existing data ecosystems. 
Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m,\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279606\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"1_1\"\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.4353\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Apache Beam\"\u001b[0m,\n", - " \u001b[32m\"stream processing\"\u001b[0m,\n", - " \u001b[32m\"batch processing\"\u001b[0m,\n", - " \u001b[32m\"data pipelines\"\u001b[0m,\n", - " \u001b[32m\"SDK\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Data Engineering\"\u001b[0m,\n", - " \u001b[32m\"Open Source\"\u001b[0m,\n", - " \u001b[32m\"Streaming\"\u001b[0m,\n", - " \u001b[32m\"Batch\"\u001b[0m,\n", - " \u001b[32m\"Big 
Data\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m\n", + " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Developers can build reusable transforms and modularize pipeline logic, improving maintainability and testing. The concept of runners enables developers to write once and run anywhere, which is particularly appealing for organizations that want to avoid vendor lock-in. The Beam model is based on a unified programming model that decouples pipeline logic from execution. This makes it easier to reason about time and state in both batch and streaming pipelines. Advanced features like late data handling, watermarks, and session windowing allow for more accurate and meaningful processing of real-world data. Beam also integrates with orchestration tools and monitoring systems, allowing for production-grade deployments. Community support and contributions have grown significantly, making Beam a stable and evolving ecosystem. 
Many cloud providers offer native support for Beam pipelines, and it's increasingly a core component in modern data platform architectures.\"\u001b[0m,\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Apache Beam: Unified Model for Batch and Streaming Data\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Apache Beam\"\u001b[0m,\n", + " \u001b[32m\"stream processing\"\u001b[0m,\n", + " \u001b[32m\"batch processing\"\u001b[0m,\n", + " \u001b[32m\"data pipelines\"\u001b[0m,\n", + " \u001b[32m\"SDK\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Data Engineering\"\u001b[0m,\n", + " \u001b[32m\"Open Source\"\u001b[0m,\n", + " \u001b[32m\"Streaming\"\u001b[0m,\n", + " \u001b[32m\"Batch\"\u001b[0m,\n", + " \u001b[32m\"Big Data\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -2189,12 +2399,20 @@ "id": "3e61bcf4-96e7-47dd-bb37-4788e99a2b89", "metadata": {}, "source": [ - "#### Searching for Google Beam" + "#### Searching for Google Beam with Filtered Search" + ] + }, + { + "cell_type": "markdown", + "id": "a782f79b-a1a2-4474-807e-8abad62406b0", + "metadata": {}, + "source": [ + "To precisely target `Google Beam` and ensure the retrieval of only relevant documents, we can leverage the power of `metadata filtering`. By applying a filter that specifies the document's `tags` must contain `Remote Work`, we can instruct the underlying search engine to exclude any documents related to `Apache Beam` from the result set. This allows the vector search to operate on a pre-filtered, highly relevant subset of the corpus, guaranteeing that the retrieved information pertains exclusively to `Google Beam`'s domain, thereby resolving the semantic ambiguity with remarkable precision." 
] }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 982, "id": "a8077395-c374-400f-abdc-fe6630eab8a4", "metadata": {}, "outputs": [], @@ -2204,20 +2422,20 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 983, "id": "3b712779-f283-4e37-88ed-d6b65c6c45d2", "metadata": {}, "outputs": [], "source": [ "search_parameters = MilvusSearchParameters(\n", " collection_name=collection_name,\n", - " search_strategy=VectorSearchParameters(filter=\"ARRAY_CONTAINS(tags, 'Remote Work')\",limit=10,anns_field=\"vector\"),\n", - " output_fields=[\"title\",\"keywords\",\"tags\", \"content\"])" + " search_strategy=VectorSearchParameters(filter=\"ARRAY_CONTAINS(metadata['tags'], 'Remote Work')\",limit=10,anns_field=\"embedding\"),\n", + " output_fields=[\"metadata\", \"content\"])" ] }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 984, "id": "7f0924a3-8832-4138-a599-d3aef648b962", "metadata": {}, "outputs": [], @@ -2227,7 +2445,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 985, "id": "516ecbf0-9bb0-4177-829b-b79300b29bbe", "metadata": {}, "outputs": [], @@ -2240,7 +2458,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 986, "id": "db32dda5-0668-4162-80ea-b6a0c2a79063", "metadata": {}, "outputs": [ @@ -2252,25 +2470,27 @@ " \"query_embedding\": 384,\n", " \"results\": [\n", " {\n", - " \"id\": 460862423920279609,\n", + " \"id\": \"3_0\",\n", " \"distance\": 0.3927,\n", " \"fields\": {\n", - " \"keywords\": [\n", - " \"Google Beam\",\n", - " \"Project Starline\",\n", - " \"3D video\",\n", - " \"AI communication\",\n", - " \"real-time meetings\"\n", - " ],\n", - " \"tags\": [\n", - " \"AI\",\n", - " \"Communication\",\n", - " \"3D Technology\",\n", - " \"Remote Work\",\n", - " \"Enterprise Tech\"\n", - " ],\n", " \"content\": \"Google Beam is an innovative video communication platform that builds on the research of Project Starline. 
It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. 
It’s a promising step toward more human and effective remote interactions.\",\n", - " \"title\": \"Google Beam: 3D Communication Powered by AI\"\n", + " \"metadata\": {\n", + " \"title\": \"Google Beam: 3D Communication Powered by AI\",\n", + " \"keywords\": [\n", + " \"Google Beam\",\n", + " \"Project Starline\",\n", + " \"3D video\",\n", + " \"AI communication\",\n", + " \"real-time meetings\"\n", + " ],\n", + " \"tags\": [\n", + " \"AI\",\n", + " \"Communication\",\n", + " \"3D Technology\",\n", + " \"Remote Work\",\n", + " \"Enterprise Tech\"\n", + " ]\n", + " }\n", " }\n", " }\n", " ]\n", @@ -2283,25 +2503,27 @@ " \u001b[1;34m\"query_embedding\"\u001b[0m: \u001b[1;36m384\u001b[0m,\n", " \u001b[1;34m\"results\"\u001b[0m: \u001b[1m[\u001b[0m\n", " \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"id\"\u001b[0m: \u001b[1;36m460862423920279609\u001b[0m,\n", + " \u001b[1;34m\"id\"\u001b[0m: \u001b[32m\"3_0\"\u001b[0m,\n", " \u001b[1;34m\"distance\"\u001b[0m: \u001b[1;36m0.3927\u001b[0m,\n", " \u001b[1;34m\"fields\"\u001b[0m: \u001b[1m{\u001b[0m\n", - " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"Google Beam\"\u001b[0m,\n", - " \u001b[32m\"Project Starline\"\u001b[0m,\n", - " \u001b[32m\"3D video\"\u001b[0m,\n", - " \u001b[32m\"AI communication\"\u001b[0m,\n", - " \u001b[32m\"real-time meetings\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", - " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", - " \u001b[32m\"AI\"\u001b[0m,\n", - " \u001b[32m\"Communication\"\u001b[0m,\n", - " \u001b[32m\"3D Technology\"\u001b[0m,\n", - " \u001b[32m\"Remote Work\"\u001b[0m,\n", - " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", - " \u001b[1m]\u001b[0m,\n", " \u001b[1;34m\"content\"\u001b[0m: \u001b[32m\"Google Beam is an innovative video communication platform that builds on the research of Project Starline. It uses AI, 3D imaging, and light field rendering to create immersive, lifelike video calls. 
Designed to replicate in-person interaction, Beam allows users to see life-sized, three-dimensional representations of each other without the need for headsets. This breakthrough makes remote conversations feel natural—capturing facial expressions, eye contact, and subtle gestures that traditional video conferencing often misses. Beam reduces meeting fatigue and enhances engagement, making it ideal for enterprise collaboration, interviews, and virtual presence scenarios. Powered by Google AI, Beam represents a significant leap in communication technology. Major companies like Salesforce, Deloitte, and NEC are already exploring its impact on digital collaboration. Google is partnering with HP to build and distribute Beam hardware, designed to work with existing productivity and video tools. Currently in limited early access for enterprise partners, Google Beam aims to redefine virtual meetings by bridging the gap between digital and physical presence. It’s a promising step toward more human and effective remote interactions.\"\u001b[0m,\n", - " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m\n", + " \u001b[1;34m\"metadata\"\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[1;34m\"title\"\u001b[0m: \u001b[32m\"Google Beam: 3D Communication Powered by AI\"\u001b[0m,\n", + " \u001b[1;34m\"keywords\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"Google Beam\"\u001b[0m,\n", + " \u001b[32m\"Project Starline\"\u001b[0m,\n", + " \u001b[32m\"3D video\"\u001b[0m,\n", + " \u001b[32m\"AI communication\"\u001b[0m,\n", + " \u001b[32m\"real-time meetings\"\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[1;34m\"tags\"\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[32m\"AI\"\u001b[0m,\n", + " \u001b[32m\"Communication\"\u001b[0m,\n", + " \u001b[32m\"3D Technology\"\u001b[0m,\n", + " \u001b[32m\"Remote Work\"\u001b[0m,\n", + " \u001b[32m\"Enterprise Tech\"\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", " 
\u001b[1m}\u001b[0m\n", " \u001b[1m}\u001b[0m\n", " \u001b[1m]\u001b[0m\n", @@ -2331,7 +2553,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 987, "id": "953e61f4-5188-45a6-b30b-d581f7471d17", "metadata": {}, "outputs": [], @@ -2339,14 +2561,6 @@ "MilvusEnrichmentTestHelper.stop_db_container(db)\n", "db = None" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1c6b76a-4aaa-498d-af24-d5d1e5f7f21f", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -2365,7 +2579,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.22" + "version": "3.10.12" } }, "nbformat": 4, From b7c064f1220b2e1f9bd29c0c1dfde07b05a16332 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sun, 19 Oct 2025 16:23:44 +0000 Subject: [PATCH 19/25] ml/rag: enforce running etcd in milvus itests in standalone mode --- .../ml/rag/enrichment/milvus_search_it_test.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py index 78c6ae987b7a..dd4a50db9d0f 100644 --- a/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py +++ b/sdks/python/apache_beam/ml/rag/enrichment/milvus_search_it_test.py @@ -455,6 +455,13 @@ def create_user_yaml(service_port: int, max_vector_field_num=5): user_config = { 'proxy': { 'maxVectorFieldNum': max_vector_field_num, 'port': service_port + }, + 'etcd': { + 'use': { + 'embed': True + }, 'data': { + 'dir': '/var/lib/milvus/etcd' + } } } @@ -481,10 +488,11 @@ class TestMilvusSearchEnrichment(unittest.TestCase): """Tests for search functionality across all search strategies""" _db: MilvusDBContainerInfo + _version = "milvusdb/milvus:v2.5.10" @classmethod def setUpClass(cls): - cls._db = MilvusEnrichmentTestHelper.start_db_container() + cls._db = 
MilvusEnrichmentTestHelper.start_db_container(cls._version) cls._connection_params = MilvusConnectionParameters( uri=cls._db.uri, user=cls._db.user, From aa50907d0c9778ab52b877afe02ce3c1a097a815 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sun, 19 Oct 2025 17:32:55 +0000 Subject: [PATCH 20/25] examples: update jupyter notebook mainly to pin milvus db version --- .../beam-ml/milvus_enrichment_transform.ipynb | 456 ++++++++++-------- 1 file changed, 263 insertions(+), 193 deletions(-) diff --git a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb index 3b632892d62b..2dbd038f3086 100644 --- a/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb +++ b/examples/notebooks/beam-ml/milvus_enrichment_transform.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 928, + "execution_count": 1, "id": "47053bac", "metadata": {}, "outputs": [], @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 929, + "execution_count": 2, "id": "e550cd55-e91e-4d43-b1bd-b0e89bb8cbd9", "metadata": {}, "outputs": [], @@ -80,38 +80,17 @@ }, { "cell_type": "code", - "execution_count": 930, + "execution_count": 3, "id": "31747c45-107a-49be-8885-5a6cc9dc1236", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "94580.11s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", - "E0000 00:00:1760812661.675888 2629967 backup_poller.cc:126] Run client channel backup poller: UNKNOWN:pollset_work {created_time:\"2025-10-18T18:37:41.674930542+00:00\", children:[UNKNOWN:epoll_wait: Bad file descriptor (9) {created_time:\"2025-10-18T18:37:41.674662134+00:00\"}]}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "94591.45s - 
pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", - "E0000 00:00:1760812676.675094 2630020 backup_poller.cc:126] Run client channel backup poller: UNKNOWN:pollset_work {created_time:\"2025-10-18T18:37:56.674462799+00:00\", children:[UNKNOWN:epoll_wait: Bad file descriptor (9) {created_time:\"2025-10-18T18:37:56.674318855+00:00\"}]}\n" + "\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: There was an error checking the latest version of pip.\u001b[0m\u001b[33m\n", + "\u001b[0m" ] } ], @@ -124,30 +103,52 @@ }, { "cell_type": "code", - "execution_count": 931, + "execution_count": 4, "id": "666e0c2b-0341-4b0e-8d73-561abc39bb10", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/dev/beam/sdks/python/.venv/lib/python3.9/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'validate_default' attribute with value True was provided to the `Field()` function, which has no effect in the context it was used. 'validate_default' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. 
This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "# Standard library imports.\n", + "# Standard library imports\n", "from collections import defaultdict\n", + "from dataclasses import asdict\n", "from math import ceil\n", - "from typing import List\n", + "from typing import Any, Dict, List\n", + "import tempfile\n", + "import uuid\n", + "import shutil\n", "\n", - "# Third-party imports.\n", + "# Third-party imports\n", "import numpy as np\n", "import pandas as pd\n", - "from pymilvus import DataType, CollectionSchema, FieldSchema, Function, FunctionType, MilvusClient, RRFRanker\n", + "from pymilvus import (\n", + " DataType, \n", + " CollectionSchema, \n", + " FieldSchema, \n", + " Function, \n", + " FunctionType, \n", + " MilvusClient, \n", + " RRFRanker\n", + ")\n", "from pymilvus.milvus_client import IndexParams\n", "from rich import print_json\n", "from sentence_transformers import SentenceTransformer\n", "from torch import cuda\n", "from llama_index.core.text_splitter import SentenceSplitter\n", "\n", - "# Local application imports.\n", + "# Apache Beam imports\n", "import apache_beam as beam\n", "from apache_beam.ml.rag.types import Chunk, Content, Embedding\n", - "from apache_beam.transforms.enrichment import Enrichment\n", + "from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider\n", + "from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings\n", "from apache_beam.ml.rag.enrichment.milvus_search_it_test import MilvusEnrichmentTestHelper\n", "from apache_beam.ml.rag.enrichment.milvus_search import (\n", " HybridSearchParameters, \n", @@ -159,7 +160,12 @@ " MilvusSearchParameters, \n", " SearchStrategy, \n", " VectorSearchMetrics, \n", - " VectorSearchParameters)" + " VectorSearchParameters\n", + ")\n", + "from apache_beam.ml.transforms.base import 
MLTransform\n", + "from apache_beam.ml.transforms.embeddings import huggingface\n", + "from apache_beam.runners.interactive import interactive_beam as ib\n", + "from apache_beam.transforms.enrichment import Enrichment" ] }, { @@ -188,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 932, + "execution_count": 5, "id": "38781cf5-e18f-40f5-827e-2d441ae7d2fa", "metadata": {}, "outputs": [], @@ -281,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 933, + "execution_count": 6, "id": "489e93b6-de41-4ec3-be33-a15c3cba12e8", "metadata": {}, "outputs": [ @@ -358,7 +364,7 @@ "max 312.000000" ] }, - "execution_count": 933, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -373,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 934, + "execution_count": 7, "id": "eb32aad0-febd-45af-b4bd-e2176b07e2dc", "metadata": {}, "outputs": [ @@ -418,17 +424,18 @@ }, { "cell_type": "code", - "execution_count": 935, + "execution_count": 8, "id": "5ae9bc82-9ad7-46dd-b254-19cbdcdd0e07", "metadata": {}, "outputs": [], "source": [ - "db = None" + "db = None\n", + "milvus_version = \"milvusdb/milvus:v2.5.10\"" ] }, { "cell_type": "code", - "execution_count": 936, + "execution_count": 9, "id": "aff7b261-3330-4fa9-9a54-3fd87b42521f", "metadata": {}, "outputs": [], @@ -436,12 +443,12 @@ "if db:\n", " # Stop existing Milvus DB container to prevent duplicates.\n", " MilvusEnrichmentTestHelper.stop_db_container(db)\n", - "db = MilvusEnrichmentTestHelper.start_db_container()" + "db = MilvusEnrichmentTestHelper.start_db_container(milvus_version)" ] }, { "cell_type": "code", - "execution_count": 937, + "execution_count": 10, "id": "31496ee0-75a2-48ad-954e-9c4ae5abbf5e", "metadata": {}, "outputs": [], @@ -451,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 938, + "execution_count": 11, "id": "82627714-2425-4058-9b47-d262f015caf7", "metadata": {}, "outputs": [], @@ -461,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 939, + 
"execution_count": 12, "id": "e8a85f51-5d5f-4533-bf0f-ec825e613dc2", "metadata": {}, "outputs": [ @@ -471,7 +478,7 @@ "'2.5.10'" ] }, - "execution_count": 939, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -498,7 +505,22 @@ }, { "cell_type": "code", - "execution_count": 940, + "execution_count": 13, + "id": "e3847821-069c-412f-8c20-2406bcac1e55", + "metadata": {}, + "outputs": [], + "source": [ + "# Choosing `sentence-transformers/all-MiniLM-L6-v2` as our embedding generator here. It gives\n", + "# a good balance between embedding generation speed, accuracy, and being free to use.\n", + "embedding_model_config = {\n", + " \"name\": 'sentence-transformers/all-MiniLM-L6-v2',\n", + " \"token_limit\": 384\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "id": "c014af94-1bb7-44e4-842c-1039f4a2a11d", "metadata": {}, "outputs": [], @@ -523,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 941, + "execution_count": 15, "id": "54fb3428-b007-4804-9d79-b3933d3256c5", "metadata": {}, "outputs": [], @@ -539,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 942, + "execution_count": 16, "id": "4c2f123a-5949-4974-af48-a5db5b168c11", "metadata": {}, "outputs": [ @@ -549,7 +571,7 @@ "{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': , 'params': {'max_length': 100}, 'is_primary': True, 'auto_id': False}, {'name': 'content', 'description': '', 'type': , 'params': {'max_length': 65279}}, {'name': 'embedding', 'description': '', 'type': , 'params': {'dim': 384}}, {'name': 'sparse_embedding', 'description': '', 'type': , 'is_function_output': True}, {'name': 'metadata', 'description': '', 'type': }, {'name': 'title_and_content', 'description': '', 'type': , 'params': {'max_length': 65535, 'enable_analyzer': True}}], 'enable_dynamic_field': False, 'functions': [{'name': 'content_bm25_emb', 'description': '', 'type': , 'input_field_names': ['title_and_content'], 
'output_field_names': ['sparse_embedding'], 'params': {}}]}" ] }, - "execution_count": 942, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -569,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": 943, + "execution_count": 17, "id": "671f4352-2086-4428-83be-0de48926682d", "metadata": {}, "outputs": [], @@ -587,7 +609,7 @@ }, { "cell_type": "code", - "execution_count": 944, + "execution_count": 18, "id": "aa8baae5-7c38-4e78-ace4-304c7dc6b127", "metadata": {}, "outputs": [], @@ -610,7 +632,7 @@ }, { "cell_type": "code", - "execution_count": 945, + "execution_count": 19, "id": "d970a35b-f9b2-4f8f-93ef-8de5c83c31b5", "metadata": {}, "outputs": [], @@ -625,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 946, + "execution_count": 20, "id": "0d45a6ad-2009-4e30-b38d-73266da98a06", "metadata": {}, "outputs": [ @@ -636,7 +658,7 @@ " {'field_name': 'sparse_embedding', 'index_type': 'SPARSE_INVERTED_INDEX', 'index_name': 'sparse_inverted_index', 'inverted_index_algo': 'DAAT_MAXSCORE', 'bm25_k1': 1.2, 'bm25_b': 0.75, 'metric_type': 'BM25'}]" ] }, - "execution_count": 946, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -655,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 947, + "execution_count": 21, "id": "51dd4423-240c-4271-bb8c-6270f399a25c", "metadata": {}, "outputs": [], @@ -665,7 +687,7 @@ }, { "cell_type": "code", - "execution_count": 948, + "execution_count": 22, "id": "9620b1f2-51fa-491c-ad3f-f0676b9b25f6", "metadata": {}, "outputs": [], @@ -675,7 +697,7 @@ }, { "cell_type": "code", - "execution_count": 949, + "execution_count": 23, "id": "e6cf3a1d-265c-44db-aba8-d491fab290d5", "metadata": {}, "outputs": [], @@ -685,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 950, + "execution_count": 24, "id": "94497411-43d3-4300-98b3-1cb33759738e", "metadata": {}, "outputs": [ @@ -695,7 +717,7 @@ "True" ] }, - "execution_count": 950, + "execution_count": 24, "metadata": {}, 
"output_type": "execute_result" } @@ -714,45 +736,43 @@ }, { "cell_type": "code", - "execution_count": 951, - "id": "f4632377-3d8c-4d60-891f-52a2c3905364", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Dict, Any\n", - "from apache_beam.ml.rag.chunking.base import ChunkingTransformProvider\n", - "from apache_beam.ml.transforms.base import MLTransform\n", - "from apache_beam.ml.transforms.embeddings import huggingface\n", - "from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings\n", - "from dataclasses import asdict\n", - "import tempfile\n", - "import uuid\n", - "import shutil" - ] - }, - { - "cell_type": "code", - "execution_count": 952, - "id": "b9da8f1b-3657-4869-bdb7-5e4e607bfd7d", - "metadata": {}, - "outputs": [], - "source": [ - "# Choosing `sentence-transformers/all-MiniLM-L6-v2` as our embedding generator here. It gives\n", - "# a good balance between embedding generation speed, accuracy, and being free to use.\n", - "embedding_model_config = {\n", - " \"name\": 'sentence-transformers/all-MiniLM-L6-v2',\n", - " \"token_limit\": 384\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 953, + "execution_count": 25, "id": "25c5c202-abe0-4d11-82df-e731f0d6201e", "metadata": { "scrolled": true }, "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " 
window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " });\n", + " }" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "name": "stderr", "output_type": "stream", @@ -775,7 +795,7 @@ "text/html": [ "\n", " \n", - "
\n", + "
\n", "
\n", " Processing... show\n", "
\n", @@ -810,7 +830,7 @@ " }\n", " \n", " \n", - "
\n", + "
\n", "