From ec36c6012770f0313a89a1161c56c29476b82eff Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Dec 2025 23:29:13 +0000 Subject: [PATCH 1/3] Initial plan From c4c35febc66ae6cfc2773434628f54290666d1c8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Dec 2025 23:38:35 +0000 Subject: [PATCH 2/3] Add process-data quickstart with code snippets Co-authored-by: gewarren <24882762+gewarren@users.noreply.github.com> --- docs/ai/quickstarts/process-data.md | 396 ++++++++++++++++++ .../snippets/process-data/.gitignore | 5 + .../ProcessData/ProcessData.csproj | 21 + .../azure-openai/ProcessData/Program.cs | 113 +++++ .../azure-openai/ProcessData/data/sample.md | 18 + .../openai/ProcessData/ProcessData.csproj | 19 + .../openai/ProcessData/Program.cs | 113 +++++ .../openai/ProcessData/data/sample.md | 18 + docs/ai/toc.yml | 2 + 9 files changed, 705 insertions(+) create mode 100644 docs/ai/quickstarts/process-data.md create mode 100644 docs/ai/quickstarts/snippets/process-data/.gitignore create mode 100644 docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj create mode 100644 docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs create mode 100644 docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/data/sample.md create mode 100644 docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj create mode 100644 docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs create mode 100644 docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md diff --git a/docs/ai/quickstarts/process-data.md b/docs/ai/quickstarts/process-data.md new file mode 100644 index 0000000000000..04d0f99f2c214 --- /dev/null +++ b/docs/ai/quickstarts/process-data.md @@ -0,0 +1,396 @@ +--- +title: Quickstart - Process custom data for AI with .NET 
+description: Create a data ingestion pipeline to process and prepare custom data for AI applications using Microsoft.Extensions.DataIngestion +ms.date: 12/11/2025 +ms.topic: quickstart +zone_pivot_groups: openai-library +ai-usage: ai-assisted +--- + +# Process custom data for AI applications + +In this quickstart, you learn how to create a data ingestion pipeline to process and prepare custom data for AI applications. The app uses the `Microsoft.Extensions.DataIngestion` library to read documents, enrich content with AI, chunk text semantically, and store embeddings in a vector database for semantic search. + +Data ingestion is essential for Retrieval-Augmented Generation (RAG) scenarios where you need to process large amounts of unstructured data and make it searchable for AI applications. + +:::zone target="docs" pivot="openai" + +[!INCLUDE [openai-prereqs](includes/prerequisites-openai.md)] + +:::zone-end + +:::zone target="docs" pivot="azure-openai" + +[!INCLUDE [azure-openai-prereqs](includes/prerequisites-azure-openai.md)] + +:::zone-end + +## Create the app + +Complete the following steps to create a .NET console app that can: + +- Read Markdown documents from a directory +- Enrich content with AI-generated image descriptions +- Chunk text using semantic similarity +- Generate AI summaries for each chunk +- Store embeddings in a SQLite vector database +- Search the vector store using natural language queries + +1. In an empty directory on your computer, use the `dotnet new` command to create a new console app: + + ```dotnetcli + dotnet new console -o ProcessDataAI + ``` + +1. Change directory into the app folder: + + ```dotnetcli + cd ProcessDataAI + ``` + +1.
Install the required packages: + + :::zone target="docs" pivot="azure-openai" + + ```bash + dotnet add package Azure.Identity + dotnet add package Azure.AI.OpenAI + dotnet add package Microsoft.Extensions.AI.OpenAI --prerelease + dotnet add package Microsoft.Extensions.DataIngestion --prerelease + dotnet add package Microsoft.Extensions.DataIngestion.Markdig --prerelease + dotnet add package Microsoft.Extensions.Logging.Console + dotnet add package Microsoft.ML.Tokenizers.Data.Cl100kBase + dotnet add package Microsoft.SemanticKernel.Connectors.SqliteVec --prerelease + ``` + + The following list describes each package in the `ProcessDataAI` app: + + - [`Azure.Identity`](https://www.nuget.org/packages/Azure.Identity) provides [`Microsoft Entra ID`](/entra/fundamentals/whatis) token authentication support across the Azure SDK using classes such as `DefaultAzureCredential`. + - [`Azure.AI.OpenAI`](https://www.nuget.org/packages/Azure.AI.OpenAI) is the official package for using OpenAI's .NET library with the Azure OpenAI Service. + - [`Microsoft.Extensions.AI.OpenAI`](https://www.nuget.org/packages/Microsoft.Extensions.AI.OpenAI) provides AI abstractions for OpenAI-compatible models or endpoints. + - [`Microsoft.Extensions.DataIngestion`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion) provides foundational .NET building blocks for data ingestion pipelines. + - [`Microsoft.Extensions.DataIngestion.Markdig`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion.Markdig) provides a Markdown document reader for the data ingestion pipeline. + - [`Microsoft.Extensions.Logging.Console`](https://www.nuget.org/packages/Microsoft.Extensions.Logging.Console) provides logging support for the console. + - [`Microsoft.ML.Tokenizers.Data.Cl100kBase`](https://www.nuget.org/packages/Microsoft.ML.Tokenizers.Data.Cl100kBase) provides tokenizer data for the GPT-4 model. 
+ - [`Microsoft.SemanticKernel.Connectors.SqliteVec`](https://www.nuget.org/packages/Microsoft.SemanticKernel.Connectors.SqliteVec) provides an in-memory vector store using SQLite for storing and searching embeddings. + + :::zone-end + + :::zone target="docs" pivot="openai" + + ```bash + dotnet add package Microsoft.Extensions.AI.OpenAI --prerelease + dotnet add package Microsoft.Extensions.DataIngestion --prerelease + dotnet add package Microsoft.Extensions.DataIngestion.Markdig --prerelease + dotnet add package Microsoft.Extensions.Logging.Console + dotnet add package Microsoft.ML.Tokenizers.Data.Cl100kBase + dotnet add package Microsoft.SemanticKernel.Connectors.SqliteVec --prerelease + ``` + + The following list describes each package in the `ProcessDataAI` app: + + - [`Microsoft.Extensions.AI.OpenAI`](https://www.nuget.org/packages/Microsoft.Extensions.AI.OpenAI) provides AI abstractions for OpenAI-compatible models or endpoints. This library also includes the official [`OpenAI`](https://www.nuget.org/packages/OpenAI) library for the OpenAI service API as a dependency. + - [`Microsoft.Extensions.DataIngestion`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion) provides foundational .NET building blocks for data ingestion pipelines. + - [`Microsoft.Extensions.DataIngestion.Markdig`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion.Markdig) provides a Markdown document reader for the data ingestion pipeline. + - [`Microsoft.Extensions.Logging.Console`](https://www.nuget.org/packages/Microsoft.Extensions.Logging.Console) provides logging support for the console. + - [`Microsoft.ML.Tokenizers.Data.Cl100kBase`](https://www.nuget.org/packages/Microsoft.ML.Tokenizers.Data.Cl100kBase) provides tokenizer data for the GPT-4 model. 
+ - [`Microsoft.SemanticKernel.Connectors.SqliteVec`](https://www.nuget.org/packages/Microsoft.SemanticKernel.Connectors.SqliteVec) provides an in-memory vector store using SQLite for storing and searching embeddings. + + :::zone-end + +1. Open the app in Visual Studio Code (or your editor of choice). + + ```bash + code . + ``` + +:::zone target="docs" pivot="azure-openai" + +[!INCLUDE [create-ai-service](includes/create-ai-service.md)] + +:::zone-end + +:::zone target="docs" pivot="openai" + +## Configure the app + +1. Navigate to the root of your .NET project from a terminal or command prompt. + +1. Run the following commands to configure your OpenAI API key as a secret for the sample app: + + ```bash + dotnet user-secrets init + dotnet user-secrets set OpenAIKey + ``` + +:::zone-end + +## Add the app code + +The data ingestion pipeline consists of several components that work together to process documents: + +- **Document reader**: Reads Markdown files from a directory +- **Document processor**: Enriches images with AI-generated alternative text +- **Chunker**: Splits documents into semantic chunks using embeddings +- **Chunk processor**: Generates AI summaries for each chunk +- **Vector store writer**: Stores chunks with embeddings in a SQLite database + +### Configure the document reader + +1. In the `Program.cs` file, delete any existing code and add the following code to configure the document reader: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureReader"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureReader"::: + + :::zone-end + + The `MarkdownReader` class reads Markdown documents and converts them into a unified format that works well with large language models. + +### Configure logging + +1. 
Add code to configure logging for the pipeline: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureLogging"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureLogging"::: + + :::zone-end + +### Configure the AI client + +1. Add code to configure the AI client for enrichment and chat: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureChatClient"::: + + > [!NOTE] + > <xref:Azure.Identity.DefaultAzureCredential> searches for authentication credentials from your local tooling. You'll need to assign the `Azure AI Developer` role to the account you used to sign in to Visual Studio or the Azure CLI. For more information, see [Authenticate to Azure AI services with .NET](../azure-ai-services-authentication.md). + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureChatClient"::: + + :::zone-end + +### Configure the document processor + +1. Add code to configure the document processor that enriches images with AI-generated descriptions: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureDocumentProcessor"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureDocumentProcessor"::: + + :::zone-end + + The `ImageAlternativeTextEnricher` uses large language models to generate descriptive alternative text for images within documents, making them more accessible and improving their semantic meaning. + +### Configure the embedding generator + +1.
Add code to configure the embedding generator for creating vector representations: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureEmbeddingGenerator"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureEmbeddingGenerator"::: + + :::zone-end + + Embeddings are numerical representations of the semantic meaning of text, which enables vector similarity search. + +### Configure the chunker + +1. Add code to configure the chunker that splits documents into semantic chunks: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureChunker"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureChunker"::: + + :::zone-end + + The `SemanticSimilarityChunker` intelligently splits documents by analyzing the semantic similarity between sentences, ensuring that related content stays together. This produces chunks that preserve meaning and context better than simple character or token-based chunking. + +### Configure the chunk processor + +1. Add code to configure the chunk processor that generates summaries: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureChunkProcessor"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureChunkProcessor"::: + + :::zone-end + + The `SummaryEnricher` automatically generates concise summaries for each chunk, which can improve retrieval accuracy by providing a high-level overview of the content. 
+ +### Configure the vector store + +1. Add code to configure the SQLite vector store for storing embeddings: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureVectorStore"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureVectorStore"::: + + :::zone-end + + The vector store stores chunks along with their embeddings, enabling fast semantic search capabilities. + +### Compose the pipeline + +1. Add code to compose all the components into a complete pipeline: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ComposePipeline"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ComposePipeline"::: + + :::zone-end + + The `IngestionPipeline` combines all the components into a cohesive workflow that processes documents from start to finish. + +### Process documents + +1. Add code to process documents from a directory: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ProcessDocuments"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ProcessDocuments"::: + + :::zone-end + + The pipeline processes all Markdown files in the `./data` directory and reports the status of each document. + +### Search the vector store + +1. 
Add code to enable interactive search of the processed documents: + + :::zone target="docs" pivot="azure-openai" + + :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="SearchVectorStore"::: + + :::zone-end + + :::zone target="docs" pivot="openai" + + :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="SearchVectorStore"::: + + :::zone-end + + The search functionality converts user queries into embeddings and finds the most semantically similar chunks in the vector store. + +## Create sample data + +1. Create a `data` folder in your project directory: + + ```bash + mkdir data + ``` + +1. Create a sample Markdown file in the `data` folder. For example, create a file named `sample.md` with the following content: + + ```markdown + # Data Ingestion + + Data ingestion is the process of collecting and preparing data for AI applications. + + ## Key Concepts + + - Extract data from various sources + - Transform data into usable formats + - Load data into storage systems + + ## Benefits + + Data ingestion enables AI applications to work with custom data, improving accuracy and relevance. + ``` + +## Run the app + +1. Use the `dotnet run` command to run the app: + + ```dotnetcli + dotnet run + ``` + + The app processes all Markdown files in the `data` directory and displays the processing status for each document. Once processing is complete, you can enter natural language questions to search the processed content. + +1. Enter a question at the prompt to search the data: + + ```output + Enter your question (or 'exit' to quit): What is data ingestion? + ``` + + The app returns the most relevant chunks from your documents along with their similarity scores. + +1. Type `exit` to quit the application. + +:::zone target="docs" pivot="azure-openai" + +## Clean up resources + +If you no longer need them, delete the Azure OpenAI resource and model deployment. + +1. 
In the [Azure Portal](https://aka.ms/azureportal), navigate to the Azure OpenAI resource. +1. Select the Azure OpenAI resource, and then select **Delete**. + +:::zone-end + +## Next steps + +- [Data ingestion concepts](../conceptual/data-ingestion.md) +- [Implement RAG using vector search](../tutorials/tutorial-ai-vector-search.md) +- [Build a .NET AI vector search app](build-vector-search-app.md) diff --git a/docs/ai/quickstarts/snippets/process-data/.gitignore b/docs/ai/quickstarts/snippets/process-data/.gitignore new file mode 100644 index 0000000000000..1c41c29ca4c84 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/.gitignore @@ -0,0 +1,5 @@ +bin/ +obj/ +*.db +*.db-shm +*.db-wal diff --git a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj new file mode 100644 index 0000000000000..389b7ec30b7bb --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj @@ -0,0 +1,21 @@ + + + + Exe + net10.0 + enable + enable + + + + + + + + + + + + + + diff --git a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs new file mode 100644 index 0000000000000..233a8fe230e8a --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs @@ -0,0 +1,113 @@ +using Azure.AI.OpenAI; +using Azure.Identity; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.DataIngestion; +using Microsoft.Extensions.DataIngestion.Chunkers; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.VectorData; +using Microsoft.ML.Tokenizers; +using Microsoft.SemanticKernel.Connectors.SqliteVec; + +// +// Configure document reader +IngestionDocumentReader reader = new MarkdownReader(); +// + +// +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => 
builder.AddSimpleConsole()); +// + +// +// Configure IChatClient to use Azure OpenAI +AzureOpenAIClient azureClient = new( + new Uri(Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT")!), + new DefaultAzureCredential()); + +IChatClient chatClient = + azureClient.GetChatClient(Environment.GetEnvironmentVariable("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")!).AsIChatClient(); +// + +// +// Configure document processor +EnricherOptions enricherOptions = new(chatClient) +{ + // Enricher failures should not fail the whole ingestion pipeline, as they are best-effort enhancements. + // This logger factory can be used to create loggers to log such failures. + LoggerFactory = loggerFactory +}; + +IngestionDocumentProcessor imageAlternativeTextEnricher = new ImageAlternativeTextEnricher(enricherOptions); +// + +// +// Configure embedding generator +IEmbeddingGenerator> embeddingGenerator = + azureClient.GetEmbeddingClient(Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")!).AsIEmbeddingGenerator(); +// + +// +// Configure chunker to split text into semantic chunks +IngestionChunkerOptions chunkerOptions = new(TiktokenTokenizer.CreateForModel("gpt-4")) +{ + MaxTokensPerChunk = 2000, + OverlapTokens = 0 +}; + +IngestionChunker chunker = new SemanticSimilarityChunker(embeddingGenerator, chunkerOptions); +// + +// +// Configure chunk processor to generate summaries for each chunk +IngestionChunkProcessor summaryEnricher = new SummaryEnricher(enricherOptions); +// + +// +// Configure SQLite Vector Store +using SqliteVectorStore vectorStore = new( + "Data Source=vectors.db;Pooling=false", + new() + { + EmbeddingGenerator = embeddingGenerator + }); + +// The writer requires the embedding dimension count to be specified. +// For Azure OpenAI's `text-embedding-ada-002`, the dimension count is 1536. 
+using VectorStoreWriter writer = new(vectorStore, dimensionCount: 1536, new VectorStoreWriterOptions { CollectionName = "data" }); +// + +// +// Compose data ingestion pipeline +using IngestionPipeline pipeline = new(reader, chunker, writer, loggerFactory: loggerFactory) +{ + DocumentProcessors = { imageAlternativeTextEnricher }, + ChunkProcessors = { summaryEnricher } +}; +// + +// +await foreach (var result in pipeline.ProcessAsync(new DirectoryInfo("./data"), searchPattern: "*.md")) +{ + Console.WriteLine($"Completed processing '{result.DocumentId}'. Succeeded: '{result.Succeeded}'."); +} +// + +// +// Search the vector store collection and display results +var collection = writer.VectorStoreCollection; + +while (true) +{ + Console.Write("Enter your question (or 'exit' to quit): "); + string? searchValue = Console.ReadLine(); + if (string.IsNullOrEmpty(searchValue) || searchValue == "exit") + { + break; + } + + Console.WriteLine("Searching...\n"); + await foreach (var result in collection.SearchAsync(searchValue, top: 3)) + { + Console.WriteLine($"Score: {result.Score}\n\tContent: {result.Record["content"]}"); + } +} +// diff --git a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/data/sample.md b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/data/sample.md new file mode 100644 index 0000000000000..f0bbe90de1d87 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/data/sample.md @@ -0,0 +1,18 @@ +# Sample Document + +This is a sample document for testing the data ingestion pipeline. + +## Introduction + +Data ingestion is the process of collecting and preparing data for AI applications. + +## Key Features + +- Document reading +- AI-powered enrichment +- Semantic chunking +- Vector storage + +## Conclusion + +These building blocks make it easy to create data ingestion pipelines. 
diff --git a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj new file mode 100644 index 0000000000000..3f61f4efaad96 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj @@ -0,0 +1,19 @@ + + + + Exe + net10.0 + enable + enable + + + + + + + + + + + + diff --git a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs new file mode 100644 index 0000000000000..cdb68bc852867 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs @@ -0,0 +1,113 @@ +using System.ClientModel; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.DataIngestion; +using Microsoft.Extensions.DataIngestion.Chunkers; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.VectorData; +using Microsoft.ML.Tokenizers; +using Microsoft.SemanticKernel.Connectors.SqliteVec; +using OpenAI; + +// +// Configure document reader +IngestionDocumentReader reader = new MarkdownReader(); +// + +// +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole()); +// + +// +// Configure IChatClient to use GitHub Models +OpenAIClient openAIClient = new( + new ApiKeyCredential(Environment.GetEnvironmentVariable("GITHUB_TOKEN")!), + new OpenAIClientOptions { Endpoint = new Uri("https://models.github.ai/inference") }); + +IChatClient chatClient = + openAIClient.GetChatClient("gpt-4.1").AsIChatClient(); +// + +// +// Configure document processor +EnricherOptions enricherOptions = new(chatClient) +{ + // Enricher failures should not fail the whole ingestion pipeline, as they are best-effort enhancements. + // This logger factory can be used to create loggers to log such failures. 
+ LoggerFactory = loggerFactory +}; + +IngestionDocumentProcessor imageAlternativeTextEnricher = new ImageAlternativeTextEnricher(enricherOptions); +// + +// +// Configure embedding generator +IEmbeddingGenerator> embeddingGenerator = + openAIClient.GetEmbeddingClient("text-embedding-3-small").AsIEmbeddingGenerator(); +// + +// +// Configure chunker to split text into semantic chunks +IngestionChunkerOptions chunkerOptions = new(TiktokenTokenizer.CreateForModel("gpt-4")) +{ + MaxTokensPerChunk = 2000, + OverlapTokens = 0 +}; + +IngestionChunker chunker = new SemanticSimilarityChunker(embeddingGenerator, chunkerOptions); +// + +// +// Configure chunk processor to generate summaries for each chunk +IngestionChunkProcessor summaryEnricher = new SummaryEnricher(enricherOptions); +// + +// +// Configure SQLite Vector Store +using SqliteVectorStore vectorStore = new( + "Data Source=vectors.db;Pooling=false", + new() + { + EmbeddingGenerator = embeddingGenerator + }); + +// The writer requires the embedding dimension count to be specified. +// For OpenAI's `text-embedding-3-small`, the dimension count is 1536. +using VectorStoreWriter writer = new(vectorStore, dimensionCount: 1536, new VectorStoreWriterOptions { CollectionName = "data" }); +// + +// +// Compose data ingestion pipeline +using IngestionPipeline pipeline = new(reader, chunker, writer, loggerFactory: loggerFactory) +{ + DocumentProcessors = { imageAlternativeTextEnricher }, + ChunkProcessors = { summaryEnricher } +}; +// + +// +await foreach (var result in pipeline.ProcessAsync(new DirectoryInfo("./data"), searchPattern: "*.md")) +{ + Console.WriteLine($"Completed processing '{result.DocumentId}'. Succeeded: '{result.Succeeded}'."); +} +// + +// +// Search the vector store collection and display results +var collection = writer.VectorStoreCollection; + +while (true) +{ + Console.Write("Enter your question (or 'exit' to quit): "); + string? 
searchValue = Console.ReadLine(); + if (string.IsNullOrEmpty(searchValue) || searchValue == "exit") + { + break; + } + + Console.WriteLine("Searching...\n"); + await foreach (var result in collection.SearchAsync(searchValue, top: 3)) + { + Console.WriteLine($"Score: {result.Score}\n\tContent: {result.Record["content"]}"); + } +} +// diff --git a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md new file mode 100644 index 0000000000000..f0bbe90de1d87 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md @@ -0,0 +1,18 @@ +# Sample Document + +This is a sample document for testing the data ingestion pipeline. + +## Introduction + +Data ingestion is the process of collecting and preparing data for AI applications. + +## Key Features + +- Document reading +- AI-powered enrichment +- Semantic chunking +- Vector storage + +## Conclusion + +These building blocks make it easy to create data ingestion pipelines. 
diff --git a/docs/ai/toc.yml b/docs/ai/toc.yml index c18445eb66441..4e086831f136a 100644 --- a/docs/ai/toc.yml +++ b/docs/ai/toc.yml @@ -78,6 +78,8 @@ items: items: - name: Get started with the RAG sample href: get-started-app-chat-template.md + - name: Process custom data for AI + href: quickstarts/process-data.md - name: Implement RAG using vector search href: tutorials/tutorial-ai-vector-search.md - name: Scale Azure OpenAI with Azure Container Apps From 8653a211d02746fc4d99936a9fd3f81965b26347 Mon Sep 17 00:00:00 2001 From: Genevieve Warren <24882762+gewarren@users.noreply.github.com> Date: Fri, 12 Dec 2025 18:59:23 -0800 Subject: [PATCH 3/3] human updates --- .../snippets/access-data/ArgumentsExample.cs | 2 +- docs/ai/quickstarts/process-data.md | 327 ++++-------------- .../snippets/process-data/.gitignore | 5 - .../snippets/process-data/ProcessData.csproj | 29 ++ .../snippets/process-data/Program.cs | 135 ++++++++ .../ProcessData/ProcessData.csproj | 21 -- .../azure-openai/ProcessData/Program.cs | 113 ------ .../ProcessData => }/data/sample.md | 0 .../openai/ProcessData/ProcessData.csproj | 19 - .../openai/ProcessData/Program.cs | 113 ------ .../openai/ProcessData/data/sample.md | 18 - .../snippets/structured-output/Program.cs | 3 +- .../text-to-image/azure-openai/Program.cs | 2 +- docs/ai/quickstarts/structured-output.md | 1 - docs/ai/quickstarts/text-to-image.md | 1 - 15 files changed, 233 insertions(+), 556 deletions(-) delete mode 100644 docs/ai/quickstarts/snippets/process-data/.gitignore create mode 100644 docs/ai/quickstarts/snippets/process-data/ProcessData.csproj create mode 100644 docs/ai/quickstarts/snippets/process-data/Program.cs delete mode 100644 docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj delete mode 100644 docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs rename docs/ai/quickstarts/snippets/process-data/{azure-openai/ProcessData => }/data/sample.md (100%) delete mode 100644 
docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj delete mode 100644 docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs delete mode 100644 docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md diff --git a/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs b/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs index 969d18fc26814..645ac0e185431 100644 --- a/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs +++ b/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs @@ -38,7 +38,7 @@ public static async Task UseFICC() string endpoint = config["AZURE_OPENAI_ENDPOINT"]; string apiKey = config["AZURE_OPENAI_API_KEY"]; - string model = config["AZURE_OPENAI_GPT_NAME"]; + string model = "gpt-4o"; // FunctionInvokingChatClient client = new FunctionInvokingChatClient( diff --git a/docs/ai/quickstarts/process-data.md b/docs/ai/quickstarts/process-data.md index 04d0f99f2c214..3d85a2b3e0a06 100644 --- a/docs/ai/quickstarts/process-data.md +++ b/docs/ai/quickstarts/process-data.md @@ -1,9 +1,8 @@ --- -title: Quickstart - Process custom data for AI with .NET -description: Create a data ingestion pipeline to process and prepare custom data for AI applications using Microsoft.Extensions.DataIngestion +title: Quickstart - Process custom data for AI +description: Create a data ingestion pipeline to process and prepare custom data for AI applications using Microsoft.Extensions.DataIngestion. ms.date: 12/11/2025 ms.topic: quickstart -zone_pivot_groups: openai-library ai-usage: ai-assisted --- @@ -11,30 +10,13 @@ ai-usage: ai-assisted In this quickstart, you learn how to create a data ingestion pipeline to process and prepare custom data for AI applications. The app uses the `Microsoft.Extensions.DataIngestion` library to read documents, enrich content with AI, chunk text semantically, and store embeddings in a vector database for semantic search.
-Data ingestion is essential for Retrieval-Augmented Generation (RAG) scenarios where you need to process large amounts of unstructured data and make it searchable for AI applications. - -:::zone target="docs" pivot="openai" - -[!INCLUDE [openai-prereqs](includes/prerequisites-openai.md)] - -:::zone-end - -:::zone target="docs" pivot="azure-openai" +Data ingestion is essential for retrieval-augmented generation (RAG) scenarios where you need to process large amounts of unstructured data and make it searchable for AI applications. [!INCLUDE [azure-openai-prereqs](includes/prerequisites-azure-openai.md)] -:::zone-end - ## Create the app -Complete the following steps to create a .NET console app that can: - -- Read Markdown documents from a directory -- Enrich content with AI-generated image descriptions -- Chunk text using semantic similarity -- Generate AI summaries for each chunk -- Store embeddings in a SQLite vector database -- Search the vector store using natural language queries +Complete the following steps to create a .NET console app. 1. In an empty directory on your computer, use the `dotnet new` command to create a new console app: @@ -50,336 +32,159 @@ Complete the following steps to create a .NET console app that can: 1. 
Install the required packages: - :::zone target="docs" pivot="azure-openai" - ```bash - dotnet add package Azure.Identity dotnet add package Azure.AI.OpenAI dotnet add package Microsoft.Extensions.AI.OpenAI --prerelease + dotnet add package Microsoft.Extensions.Configuration + dotnet add package Microsoft.Extensions.Configuration.UserSecrets dotnet add package Microsoft.Extensions.DataIngestion --prerelease dotnet add package Microsoft.Extensions.DataIngestion.Markdig --prerelease dotnet add package Microsoft.Extensions.Logging.Console - dotnet add package Microsoft.ML.Tokenizers.Data.Cl100kBase + dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase dotnet add package Microsoft.SemanticKernel.Connectors.SqliteVec --prerelease - ``` - The following list describes each package in the `ProcessDataAI` app: +## Create the AI service - - [`Azure.Identity`](https://www.nuget.org/packages/Azure.Identity) provides [`Microsoft Entra ID`](/entra/fundamentals/whatis) token authentication support across the Azure SDK using classes such as `DefaultAzureCredential`. - - [`Azure.AI.OpenAI`](https://www.nuget.org/packages/Azure.AI.OpenAI) is the official package for using OpenAI's .NET library with the Azure OpenAI Service. - - [`Microsoft.Extensions.AI.OpenAI`](https://www.nuget.org/packages/Microsoft.Extensions.AI.OpenAI) provides AI abstractions for OpenAI-compatible models or endpoints. - - [`Microsoft.Extensions.DataIngestion`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion) provides foundational .NET building blocks for data ingestion pipelines. - - [`Microsoft.Extensions.DataIngestion.Markdig`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion.Markdig) provides a Markdown document reader for the data ingestion pipeline. - - [`Microsoft.Extensions.Logging.Console`](https://www.nuget.org/packages/Microsoft.Extensions.Logging.Console) provides logging support for the console. 
- - [`Microsoft.ML.Tokenizers.Data.Cl100kBase`](https://www.nuget.org/packages/Microsoft.ML.Tokenizers.Data.Cl100kBase) provides tokenizer data for the GPT-4 model. - - [`Microsoft.SemanticKernel.Connectors.SqliteVec`](https://www.nuget.org/packages/Microsoft.SemanticKernel.Connectors.SqliteVec) provides an in-memory vector store using SQLite for storing and searching embeddings. +1. To provision an Azure OpenAI service and model, complete the steps in the [Create and deploy an Azure OpenAI Service resource](/azure/ai-services/openai/how-to/create-resource) article. - :::zone-end +1. From a terminal or command prompt, navigate to the root of your project directory. - :::zone target="docs" pivot="openai" +1. Run the following commands to configure your Azure OpenAI endpoint and API key for the sample app: ```bash - dotnet add package Microsoft.Extensions.AI.OpenAI --prerelease - dotnet add package Microsoft.Extensions.DataIngestion --prerelease - dotnet add package Microsoft.Extensions.DataIngestion.Markdig --prerelease - dotnet add package Microsoft.Extensions.Logging.Console - dotnet add package Microsoft.ML.Tokenizers.Data.Cl100kBase - dotnet add package Microsoft.SemanticKernel.Connectors.SqliteVec --prerelease + dotnet user-secrets init + dotnet user-secrets set AZURE_OPENAI_ENDPOINT <your-Azure-OpenAI-endpoint> + dotnet user-secrets set AZURE_OPENAI_API_KEY <your-Azure-OpenAI-key> ``` - The following list describes each package in the `ProcessDataAI` app: - - - [`Microsoft.Extensions.AI.OpenAI`](https://www.nuget.org/packages/Microsoft.Extensions.AI.OpenAI) provides AI abstractions for OpenAI-compatible models or endpoints. This library also includes the official [`OpenAI`](https://www.nuget.org/packages/OpenAI) library for the OpenAI service API as a dependency. - - [`Microsoft.Extensions.DataIngestion`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion) provides foundational .NET building blocks for data ingestion pipelines. 
- - [`Microsoft.Extensions.DataIngestion.Markdig`](https://www.nuget.org/packages/Microsoft.Extensions.DataIngestion.Markdig) provides a Markdown document reader for the data ingestion pipeline. - - [`Microsoft.Extensions.Logging.Console`](https://www.nuget.org/packages/Microsoft.Extensions.Logging.Console) provides logging support for the console. - - [`Microsoft.ML.Tokenizers.Data.Cl100kBase`](https://www.nuget.org/packages/Microsoft.ML.Tokenizers.Data.Cl100kBase) provides tokenizer data for the GPT-4 model. - - [`Microsoft.SemanticKernel.Connectors.SqliteVec`](https://www.nuget.org/packages/Microsoft.SemanticKernel.Connectors.SqliteVec) provides an in-memory vector store using SQLite for storing and searching embeddings. - - :::zone-end +## Open the app in an editor 1. Open the app in Visual Studio Code (or your editor of choice). - ```bash - code . - ``` - -:::zone target="docs" pivot="azure-openai" - -[!INCLUDE [create-ai-service](includes/create-ai-service.md)] - -:::zone-end - -:::zone target="docs" pivot="openai" - -## Configure the app - -1. Navigate to the root of your .NET project from a terminal or command prompt. - -1. Run the following commands to configure your OpenAI API key as a secret for the sample app: - - ```bash - dotnet user-secrets init - dotnet user-secrets set OpenAIKey - ``` + ```bash + code . + ``` -:::zone-end +1. Copy the [sample.md](https://raw.githubusercontent.com/dotnet/docs/refs/heads/main/docs/ai/quickstarts/snippets/process-data/data/sample.md) file to a `data` folder in your project directory. Configure the project to copy this file to the output directory. If you're using Visual Studio, right-click on the file in Solution Explorer, select **Properties**, and then set **Copy to Output Directory** to **Copy if newer**. 
## Add the app code The data ingestion pipeline consists of several components that work together to process documents: -- **Document reader**: Reads Markdown files from a directory -- **Document processor**: Enriches images with AI-generated alternative text -- **Chunker**: Splits documents into semantic chunks using embeddings -- **Chunk processor**: Generates AI summaries for each chunk -- **Vector store writer**: Stores chunks with embeddings in a SQLite database - -### Configure the document reader +- **Document reader**: Reads Markdown files from a directory. +- **Document processor**: Enriches images with AI-generated alternative text. +- **Chunker**: Splits documents into semantic chunks using embeddings. +- **Chunk processor**: Generates AI summaries for each chunk. +- **Vector store writer**: Stores chunks with embeddings in a SQLite database. 1. In the `Program.cs` file, delete any existing code and add the following code to configure the document reader: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureReader"::: - - :::zone-end - - :::zone target="docs" pivot="openai" + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureReader"::: - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureReader"::: - - :::zone-end - - The `MarkdownReader` class reads Markdown documents and converts them into a unified format that works well with large language models. - -### Configure logging + The `MarkdownReader` class reads Markdown documents and converts them into a unified format that works well with large language models. 1. 
Add code to configure logging for the pipeline: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureLogging"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureLogging"::: - - :::zone-end - -### Configure the AI client + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureLogging"::: 1. Add code to configure the AI client for enrichment and chat: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureChatClient"::: - - > [!NOTE] - > searches for authentication credentials from your local tooling. You'll need to assign the `Azure AI Developer` role to the account you used to sign in to Visual Studio or the Azure CLI. For more information, see [Authenticate to Azure AI services with .NET](../azure-ai-services-authentication.md). - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureChatClient"::: - - :::zone-end - -### Configure the document processor + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureChatClient"::: 1. 
Add code to configure the document processor that enriches images with AI-generated descriptions: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureDocumentProcessor"::: - - :::zone-end - - :::zone target="docs" pivot="openai" + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureDocumentProcessor"::: - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureDocumentProcessor"::: - - :::zone-end - - The `ImageAlternativeTextEnricher` uses large language models to generate descriptive alternative text for images within documents, making them more accessible and improving their semantic meaning. - -### Configure the embedding generator + The `ImageAlternativeTextEnricher` uses large language models to generate descriptive alternative text for images within documents. That text makes the images more accessible and improves their semantic meaning. 1. Add code to configure the embedding generator for creating vector representations: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureEmbeddingGenerator"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureEmbeddingGenerator"::: + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureEmbeddingGenerator"::: - :::zone-end - - Embeddings are numerical representations of the semantic meaning of text, which enables vector similarity search. - -### Configure the chunker + Embeddings are numerical representations of the semantic meaning of text, which enables vector similarity search. 1. 
Add code to configure the chunker that splits documents into semantic chunks: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureChunker"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureChunker"::: + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureChunker"::: - :::zone-end - - The `SemanticSimilarityChunker` intelligently splits documents by analyzing the semantic similarity between sentences, ensuring that related content stays together. This produces chunks that preserve meaning and context better than simple character or token-based chunking. - -### Configure the chunk processor + The `SemanticSimilarityChunker` intelligently splits documents by analyzing the semantic similarity between sentences, ensuring that related content stays together. This process produces chunks that preserve meaning and context better than simple character or token-based chunking. 1. Add code to configure the chunk processor that generates summaries: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureChunkProcessor"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureChunkProcessor"::: - - :::zone-end + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureChunkProcessor"::: - The `SummaryEnricher` automatically generates concise summaries for each chunk, which can improve retrieval accuracy by providing a high-level overview of the content. - -### Configure the vector store + The `SummaryEnricher` automatically generates concise summaries for each chunk, which can improve retrieval accuracy by providing a high-level overview of the content. 1. 
Add code to configure the SQLite vector store for storing embeddings: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ConfigureVectorStore"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ConfigureVectorStore"::: - - :::zone-end + :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureVectorStore"::: - The vector store stores chunks along with their embeddings, enabling fast semantic search capabilities. - -### Compose the pipeline + The vector store stores chunks along with their embeddings, enabling fast semantic search capabilities. 1. Add code to compose all the components into a complete pipeline: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ComposePipeline"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ComposePipeline"::: - - :::zone-end - - The `IngestionPipeline` combines all the components into a cohesive workflow that processes documents from start to finish. + :::code language="csharp" source="snippets/process-data/Program.cs" id="ComposePipeline"::: -### Process documents + The `IngestionPipeline` combines all the components into a cohesive workflow that processes documents from start to finish. 1. 
Add code to process documents from a directory: - :::zone target="docs" pivot="azure-openai" - - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="ProcessDocuments"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="ProcessDocuments"::: - - :::zone-end - - The pipeline processes all Markdown files in the `./data` directory and reports the status of each document. + :::code language="csharp" source="snippets/process-data/Program.cs" id="ProcessDocuments"::: -### Search the vector store + The pipeline processes all Markdown files in the `./data` directory and reports the status of each document. 1. Add code to enable interactive search of the processed documents: - :::zone target="docs" pivot="azure-openai" + :::code language="csharp" source="snippets/process-data/Program.cs" id="SearchVectorStore"::: - :::code language="csharp" source="snippets/process-data/azure-openai/ProcessData/Program.cs" id="SearchVectorStore"::: - - :::zone-end - - :::zone target="docs" pivot="openai" - - :::code language="csharp" source="snippets/process-data/openai/ProcessData/Program.cs" id="SearchVectorStore"::: - - :::zone-end - - The search functionality converts user queries into embeddings and finds the most semantically similar chunks in the vector store. + The search functionality converts user queries into embeddings and finds the most semantically similar chunks in the vector store. ## Create sample data 1. Create a `data` folder in your project directory: - ```bash - mkdir data - ``` + ```bash + mkdir data + ``` 1. Create a sample Markdown file in the `data` folder. For example, create a file named `sample.md` with the following content: - ```markdown - # Data Ingestion + ```markdown + # Data Ingestion - Data ingestion is the process of collecting and preparing data for AI applications. 
+ Data ingestion is the process of collecting and preparing data for AI applications. - ## Key Concepts + ## Key Concepts - - Extract data from various sources - - Transform data into usable formats - - Load data into storage systems + - Extract data from various sources + - Transform data into usable formats + - Load data into storage systems - ## Benefits + ## Benefits - Data ingestion enables AI applications to work with custom data, improving accuracy and relevance. - ``` + Data ingestion enables AI applications to work with custom data, improving accuracy and relevance. + ``` ## Run the app 1. Use the `dotnet run` command to run the app: - ```dotnetcli - dotnet run - ``` + ```dotnetcli + dotnet run + ``` - The app processes all Markdown files in the `data` directory and displays the processing status for each document. Once processing is complete, you can enter natural language questions to search the processed content. + The app processes all Markdown files in the `data` directory and displays the processing status for each document. Once processing is complete, you can enter natural language questions to search the processed content. 1. Enter a question at the prompt to search the data: - ```output - Enter your question (or 'exit' to quit): What is data ingestion? - ``` + ```output + Enter your question (or 'exit' to quit): What is data ingestion? + ``` - The app returns the most relevant chunks from your documents along with their similarity scores. + The app returns the most relevant chunks from your documents along with their similarity scores. 1. Type `exit` to quit the application. -:::zone target="docs" pivot="azure-openai" - ## Clean up resources If you no longer need them, delete the Azure OpenAI resource and model deployment. @@ -387,8 +192,6 @@ If you no longer need them, delete the Azure OpenAI resource and model deploymen 1. In the [Azure Portal](https://aka.ms/azureportal), navigate to the Azure OpenAI resource. 1. 
Select the Azure OpenAI resource, and then select **Delete**. -:::zone-end - ## Next steps - [Data ingestion concepts](../conceptual/data-ingestion.md) diff --git a/docs/ai/quickstarts/snippets/process-data/.gitignore b/docs/ai/quickstarts/snippets/process-data/.gitignore deleted file mode 100644 index 1c41c29ca4c84..0000000000000 --- a/docs/ai/quickstarts/snippets/process-data/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -bin/ -obj/ -*.db -*.db-shm -*.db-wal diff --git a/docs/ai/quickstarts/snippets/process-data/ProcessData.csproj b/docs/ai/quickstarts/snippets/process-data/ProcessData.csproj new file mode 100644 index 0000000000000..939ba7d02bc68 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/ProcessData.csproj @@ -0,0 +1,29 @@ + + + + Exe + net10.0 + enable + enable + 2e2133d7-2b33-48e1-9938-79092b54ead4 + + + + + + + + + + + + + + + + + Always + + + + diff --git a/docs/ai/quickstarts/snippets/process-data/Program.cs b/docs/ai/quickstarts/snippets/process-data/Program.cs new file mode 100644 index 0000000000000..2dc1637312706 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/Program.cs @@ -0,0 +1,135 @@ +using Azure; +using Azure.AI.OpenAI; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DataIngestion; +using Microsoft.Extensions.DataIngestion.Chunkers; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.VectorData; +using Microsoft.ML.Tokenizers; +using Microsoft.SemanticKernel.Connectors.SqliteVec; + +class DataIngestionExample +{ + public static async Task Main() + { + // <ConfigureReader> + // Configure document reader. + IngestionDocumentReader reader = new MarkdownReader(); + // </ConfigureReader> + + // <ConfigureLogging> + using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole()); + // </ConfigureLogging> + + // <ConfigureChatClient> + // Configure IChatClient to use Azure OpenAI. 
+ IConfigurationRoot config = new ConfigurationBuilder() + .AddUserSecrets() + .Build(); + + string endpoint = config["AZURE_OPENAI_ENDPOINT"]; + string apiKey = config["AZURE_OPENAI_API_KEY"]; + string model = "gpt-4o"; + string embeddingModel = "text-embedding-3-small"; + + AzureOpenAIClient azureClient = new( + new Uri(endpoint), + new AzureKeyCredential(apiKey)); + + IChatClient chatClient = + azureClient.GetChatClient(model).AsIChatClient(); + // </ConfigureChatClient> + + // <ConfigureDocumentProcessor> + // Configure document processor. + EnricherOptions enricherOptions = new(chatClient) + { + // Enricher failures should not fail the whole ingestion pipeline, + // as they are best-effort enhancements. + // This logger factory can be used to create loggers to log such failures. + LoggerFactory = loggerFactory + }; + + IngestionDocumentProcessor imageAlternativeTextEnricher = new ImageAlternativeTextEnricher(enricherOptions); + // </ConfigureDocumentProcessor> + + // <ConfigureEmbeddingGenerator> + // Configure embedding generator. + IEmbeddingGenerator> embeddingGenerator = + azureClient.GetEmbeddingClient(embeddingModel).AsIEmbeddingGenerator(); + // </ConfigureEmbeddingGenerator> + + // <ConfigureChunker> + // Configure chunker to split text into semantic chunks. + IngestionChunkerOptions chunkerOptions = new(TiktokenTokenizer.CreateForModel(model)) + { + MaxTokensPerChunk = 2000, + OverlapTokens = 0 + }; + + IngestionChunker chunker = new SemanticSimilarityChunker(embeddingGenerator, chunkerOptions); + // </ConfigureChunker> + + // <ConfigureChunkProcessor> + // Configure chunk processor to generate summaries for each chunk. + IngestionChunkProcessor summaryEnricher = new SummaryEnricher(enricherOptions); + // </ConfigureChunkProcessor> + + // <ConfigureVectorStore> + // Configure SQLite Vector Store. + using SqliteVectorStore vectorStore = new( + "Data Source=vectors.db;Pooling=false", + new() + { + EmbeddingGenerator = embeddingGenerator + }); + + // The writer requires the embedding dimension count to be specified. + // For Azure OpenAI's `text-embedding-3-small`, the default dimension count is 1536. 
+ using VectorStoreWriter writer = new( + vectorStore, + dimensionCount: 1536, + new VectorStoreWriterOptions { CollectionName = "data" }); + // </ConfigureVectorStore> + + // <ComposePipeline> + // Compose data ingestion pipeline. + using IngestionPipeline pipeline = new(reader, chunker, writer, loggerFactory: loggerFactory) + { + DocumentProcessors = { imageAlternativeTextEnricher }, + ChunkProcessors = { summaryEnricher } + }; + // </ComposePipeline> + + // <ProcessDocuments> + await foreach (IngestionResult result in pipeline.ProcessAsync( + new DirectoryInfo("./data"), + searchPattern: "*.md")) + { + Console.WriteLine($"Completed processing '{result.DocumentId}'. Succeeded: '{result.Succeeded}'."); + } + // </ProcessDocuments> + + // <SearchVectorStore> + // Search the vector store collection and display results. + VectorStoreCollection> collection = writer.VectorStoreCollection; + + while (true) + { + Console.Write("Enter your question (or 'exit' to quit): "); + string? searchValue = Console.ReadLine(); + if (string.IsNullOrEmpty(searchValue) || searchValue == "exit") + { + break; + } + + Console.WriteLine("Searching...\n"); + await foreach (VectorSearchResult> result in collection.SearchAsync(searchValue, top: 3)) + { + Console.WriteLine($"Score: {result.Score}\n\tContent: {result.Record["content"]}"); + } + } + // </SearchVectorStore> + } +} diff --git a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj deleted file mode 100644 index 389b7ec30b7bb..0000000000000 --- a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/ProcessData.csproj +++ /dev/null @@ -1,21 +0,0 @@ - - - - Exe - net10.0 - enable - enable - - - - - - - - - - - - - - diff --git a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs b/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs deleted file mode 100644 index 233a8fe230e8a..0000000000000 --- a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/Program.cs +++ /dev/null @@ 
-1,113 +0,0 @@ -using Azure.AI.OpenAI; -using Azure.Identity; -using Microsoft.Extensions.AI; -using Microsoft.Extensions.DataIngestion; -using Microsoft.Extensions.DataIngestion.Chunkers; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.VectorData; -using Microsoft.ML.Tokenizers; -using Microsoft.SemanticKernel.Connectors.SqliteVec; - -// -// Configure document reader -IngestionDocumentReader reader = new MarkdownReader(); -// - -// -using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole()); -// - -// -// Configure IChatClient to use Azure OpenAI -AzureOpenAIClient azureClient = new( - new Uri(Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT")!), - new DefaultAzureCredential()); - -IChatClient chatClient = - azureClient.GetChatClient(Environment.GetEnvironmentVariable("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")!).AsIChatClient(); -// - -// -// Configure document processor -EnricherOptions enricherOptions = new(chatClient) -{ - // Enricher failures should not fail the whole ingestion pipeline, as they are best-effort enhancements. - // This logger factory can be used to create loggers to log such failures. 
- LoggerFactory = loggerFactory -}; - -IngestionDocumentProcessor imageAlternativeTextEnricher = new ImageAlternativeTextEnricher(enricherOptions); -// - -// -// Configure embedding generator -IEmbeddingGenerator> embeddingGenerator = - azureClient.GetEmbeddingClient(Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")!).AsIEmbeddingGenerator(); -// - -// -// Configure chunker to split text into semantic chunks -IngestionChunkerOptions chunkerOptions = new(TiktokenTokenizer.CreateForModel("gpt-4")) -{ - MaxTokensPerChunk = 2000, - OverlapTokens = 0 -}; - -IngestionChunker chunker = new SemanticSimilarityChunker(embeddingGenerator, chunkerOptions); -// - -// -// Configure chunk processor to generate summaries for each chunk -IngestionChunkProcessor summaryEnricher = new SummaryEnricher(enricherOptions); -// - -// -// Configure SQLite Vector Store -using SqliteVectorStore vectorStore = new( - "Data Source=vectors.db;Pooling=false", - new() - { - EmbeddingGenerator = embeddingGenerator - }); - -// The writer requires the embedding dimension count to be specified. -// For Azure OpenAI's `text-embedding-ada-002`, the dimension count is 1536. -using VectorStoreWriter writer = new(vectorStore, dimensionCount: 1536, new VectorStoreWriterOptions { CollectionName = "data" }); -// - -// -// Compose data ingestion pipeline -using IngestionPipeline pipeline = new(reader, chunker, writer, loggerFactory: loggerFactory) -{ - DocumentProcessors = { imageAlternativeTextEnricher }, - ChunkProcessors = { summaryEnricher } -}; -// - -// -await foreach (var result in pipeline.ProcessAsync(new DirectoryInfo("./data"), searchPattern: "*.md")) -{ - Console.WriteLine($"Completed processing '{result.DocumentId}'. Succeeded: '{result.Succeeded}'."); -} -// - -// -// Search the vector store collection and display results -var collection = writer.VectorStoreCollection; - -while (true) -{ - Console.Write("Enter your question (or 'exit' to quit): "); - string? 
searchValue = Console.ReadLine(); - if (string.IsNullOrEmpty(searchValue) || searchValue == "exit") - { - break; - } - - Console.WriteLine("Searching...\n"); - await foreach (var result in collection.SearchAsync(searchValue, top: 3)) - { - Console.WriteLine($"Score: {result.Score}\n\tContent: {result.Record["content"]}"); - } -} -// diff --git a/docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/data/sample.md b/docs/ai/quickstarts/snippets/process-data/data/sample.md similarity index 100% rename from docs/ai/quickstarts/snippets/process-data/azure-openai/ProcessData/data/sample.md rename to docs/ai/quickstarts/snippets/process-data/data/sample.md diff --git a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj deleted file mode 100644 index 3f61f4efaad96..0000000000000 --- a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/ProcessData.csproj +++ /dev/null @@ -1,19 +0,0 @@ - - - - Exe - net10.0 - enable - enable - - - - - - - - - - - - diff --git a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs deleted file mode 100644 index cdb68bc852867..0000000000000 --- a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/Program.cs +++ /dev/null @@ -1,113 +0,0 @@ -using System.ClientModel; -using Microsoft.Extensions.AI; -using Microsoft.Extensions.DataIngestion; -using Microsoft.Extensions.DataIngestion.Chunkers; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.VectorData; -using Microsoft.ML.Tokenizers; -using Microsoft.SemanticKernel.Connectors.SqliteVec; -using OpenAI; - -// -// Configure document reader -IngestionDocumentReader reader = new MarkdownReader(); -// - -// -using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole()); -// - -// -// Configure IChatClient to use GitHub 
Models -OpenAIClient openAIClient = new( - new ApiKeyCredential(Environment.GetEnvironmentVariable("GITHUB_TOKEN")!), - new OpenAIClientOptions { Endpoint = new Uri("https://models.github.ai/inference") }); - -IChatClient chatClient = - openAIClient.GetChatClient("gpt-4.1").AsIChatClient(); -// - -// -// Configure document processor -EnricherOptions enricherOptions = new(chatClient) -{ - // Enricher failures should not fail the whole ingestion pipeline, as they are best-effort enhancements. - // This logger factory can be used to create loggers to log such failures. - LoggerFactory = loggerFactory -}; - -IngestionDocumentProcessor imageAlternativeTextEnricher = new ImageAlternativeTextEnricher(enricherOptions); -// - -// -// Configure embedding generator -IEmbeddingGenerator> embeddingGenerator = - openAIClient.GetEmbeddingClient("text-embedding-3-small").AsIEmbeddingGenerator(); -// - -// -// Configure chunker to split text into semantic chunks -IngestionChunkerOptions chunkerOptions = new(TiktokenTokenizer.CreateForModel("gpt-4")) -{ - MaxTokensPerChunk = 2000, - OverlapTokens = 0 -}; - -IngestionChunker chunker = new SemanticSimilarityChunker(embeddingGenerator, chunkerOptions); -// - -// -// Configure chunk processor to generate summaries for each chunk -IngestionChunkProcessor summaryEnricher = new SummaryEnricher(enricherOptions); -// - -// -// Configure SQLite Vector Store -using SqliteVectorStore vectorStore = new( - "Data Source=vectors.db;Pooling=false", - new() - { - EmbeddingGenerator = embeddingGenerator - }); - -// The writer requires the embedding dimension count to be specified. -// For OpenAI's `text-embedding-3-small`, the dimension count is 1536. 
-using VectorStoreWriter writer = new(vectorStore, dimensionCount: 1536, new VectorStoreWriterOptions { CollectionName = "data" }); -// - -// -// Compose data ingestion pipeline -using IngestionPipeline pipeline = new(reader, chunker, writer, loggerFactory: loggerFactory) -{ - DocumentProcessors = { imageAlternativeTextEnricher }, - ChunkProcessors = { summaryEnricher } -}; -// - -// -await foreach (var result in pipeline.ProcessAsync(new DirectoryInfo("./data"), searchPattern: "*.md")) -{ - Console.WriteLine($"Completed processing '{result.DocumentId}'. Succeeded: '{result.Succeeded}'."); -} -// - -// -// Search the vector store collection and display results -var collection = writer.VectorStoreCollection; - -while (true) -{ - Console.Write("Enter your question (or 'exit' to quit): "); - string? searchValue = Console.ReadLine(); - if (string.IsNullOrEmpty(searchValue) || searchValue == "exit") - { - break; - } - - Console.WriteLine("Searching...\n"); - await foreach (var result in collection.SearchAsync(searchValue, top: 3)) - { - Console.WriteLine($"Score: {result.Score}\n\tContent: {result.Record["content"]}"); - } -} -// diff --git a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md b/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md deleted file mode 100644 index f0bbe90de1d87..0000000000000 --- a/docs/ai/quickstarts/snippets/process-data/openai/ProcessData/data/sample.md +++ /dev/null @@ -1,18 +0,0 @@ -# Sample Document - -This is a sample document for testing the data ingestion pipeline. - -## Introduction - -Data ingestion is the process of collecting and preparing data for AI applications. - -## Key Features - -- Document reading -- AI-powered enrichment -- Semantic chunking -- Vector storage - -## Conclusion - -These building blocks make it easy to create data ingestion pipelines. 
diff --git a/docs/ai/quickstarts/snippets/structured-output/Program.cs b/docs/ai/quickstarts/snippets/structured-output/Program.cs index ae5f6a125a24f..7be9627e26659 100644 --- a/docs/ai/quickstarts/snippets/structured-output/Program.cs +++ b/docs/ai/quickstarts/snippets/structured-output/Program.cs @@ -9,9 +9,10 @@ .Build(); string endpoint = config["AZURE_OPENAI_ENDPOINT"]; -string model = config["AZURE_OPENAI_GPT_NAME"]; string tenantId = config["AZURE_TENANT_ID"]; +string model = "gpt-4o"; + // Get a chat client for the Azure OpenAI endpoint. AzureOpenAIClient azureClient = new( diff --git a/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs b/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs index 118bf15e7ed55..870da58cf90d6 100644 --- a/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs +++ b/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs @@ -10,7 +10,7 @@ string endpoint = config["AZURE_OPENAI_ENDPOINT"]; string apiKey = config["AZURE_OPENAI_API_KEY"]; -string model = config["AZURE_OPENAI_GPT_NAME"]; +string model = "gpt-image-1"; // Create the Azure OpenAI client and convert to IImageGenerator. 
AzureOpenAIClient azureClient = new( diff --git a/docs/ai/quickstarts/structured-output.md b/docs/ai/quickstarts/structured-output.md index f68feff535f00..54b4caa5f68ff 100644 --- a/docs/ai/quickstarts/structured-output.md +++ b/docs/ai/quickstarts/structured-output.md @@ -44,7 +44,6 @@ Complete the following steps to create a console app that connects to the `gpt-4 ```bash dotnet user-secrets init dotnet user-secrets set AZURE_OPENAI_ENDPOINT <your-Azure-OpenAI-endpoint> - dotnet user-secrets set AZURE_OPENAI_GPT_NAME gpt-4o dotnet user-secrets set AZURE_TENANT_ID <your-tenant-ID> ``` diff --git a/docs/ai/quickstarts/text-to-image.md b/docs/ai/quickstarts/text-to-image.md index 89acb7f041ae7..b1e36e4ec3bc6 100644 --- a/docs/ai/quickstarts/text-to-image.md +++ b/docs/ai/quickstarts/text-to-image.md @@ -54,7 +54,6 @@ Complete the following steps to create a .NET console application that generates ```bash dotnet user-secrets init dotnet user-secrets set AZURE_OPENAI_ENDPOINT <your-Azure-OpenAI-endpoint> - dotnet user-secrets set AZURE_OPENAI_GPT_NAME gpt-image-1 dotnet user-secrets set AZURE_OPENAI_API_KEY <your-Azure-OpenAI-API-key> ```