diff --git a/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs b/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs index 969d18fc26814..645ac0e185431 100644 --- a/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs +++ b/docs/ai/how-to/snippets/access-data/ArgumentsExample.cs @@ -38,7 +38,7 @@ public static async Task UseFICC() string endpoint = config["AZURE_OPENAI_ENDPOINT"]; string apiKey = config["AZURE_OPENAI_API_KEY"]; - string model = config["AZURE_OPENAI_GPT_NAME"]; + string model = "gpt-4o"; // FunctionInvokingChatClient client = new FunctionInvokingChatClient( diff --git a/docs/ai/quickstarts/process-data.md b/docs/ai/quickstarts/process-data.md new file mode 100644 index 0000000000000..3d85a2b3e0a06 --- /dev/null +++ b/docs/ai/quickstarts/process-data.md @@ -0,0 +1,199 @@ +--- +title: Quickstart - Process custom data for AI +description: Create a data ingestion pipeline to process and prepare custom data for AI applications using Microsoft.Extensions.DataIngestion. +ms.date: 12/11/2025 +ms.topic: quickstart +ai-usage: ai-assisted +--- + +# Process custom data for AI applications + +In this quickstart, you learn how to create a data ingestion pipeline to process and prepare custom data for AI applications. The app uses the library to read documents, enrich content with AI, chunk text semantically, and store embeddings in a vector database for semantic search. + +Data ingestion is essential for retrieval-augmented generation (RAG) scenarios where you need to process large amounts of unstructured data and make it searchable for AI applications. + +[!INCLUDE [azure-openai-prereqs](includes/prerequisites-azure-openai.md)] + +## Create the app + +Complete the following steps to create a .NET console app. + +1. In an empty directory on your computer, use the `dotnet new` command to create a new console app: + + ```dotnetcli + dotnet new console -o ProcessDataAI + ``` + +1. 
Change directory into the app folder:

    ```dotnetcli
    cd ProcessDataAI
    ```

1. Install the required packages:

    ```bash
    dotnet add package Azure.AI.OpenAI
    dotnet add package Microsoft.Extensions.AI.OpenAI --prerelease
    dotnet add package Microsoft.Extensions.Configuration
    dotnet add package Microsoft.Extensions.Configuration.UserSecrets
    dotnet add package Microsoft.Extensions.DataIngestion --prerelease
    dotnet add package Microsoft.Extensions.DataIngestion.Markdig --prerelease
    dotnet add package Microsoft.Extensions.Logging.Console
    dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase
    dotnet add package Microsoft.SemanticKernel.Connectors.SqliteVec --prerelease
    ```

## Create the AI service

1. To provision an Azure OpenAI service and model, complete the steps in the [Create and deploy an Azure OpenAI Service resource](/azure/ai-services/openai/how-to/create-resource) article.

1. From a terminal or command prompt, navigate to the root of your project directory.

1. Run the following commands to configure your Azure OpenAI endpoint and API key for the sample app:

    ```bash
    dotnet user-secrets init
    dotnet user-secrets set AZURE_OPENAI_ENDPOINT <your-Azure-OpenAI-endpoint>
    dotnet user-secrets set AZURE_OPENAI_API_KEY <your-Azure-OpenAI-key>
    ```

## Open the app in an editor

1. Open the app in Visual Studio Code (or your editor of choice).

    ```bash
    code .
    ```

1. Copy the [sample.md](https://raw.githubusercontent.com/dotnet/docs/refs/heads/main/docs/ai/quickstarts/snippets/process-data/sample.md) file to your project directory. Configure the project to copy this file to the output directory. If you're using Visual Studio, right-click on the file in Solution Explorer, select **Properties**, and then set **Copy to Output Directory** to **Copy if newer**.

## Add the app code

The data ingestion pipeline consists of several components that work together to process documents:

- **Document reader**: Reads Markdown files from a directory.
- **Document processor**: Enriches images with AI-generated alternative text.
- **Chunker**: Splits documents into semantic chunks using embeddings.
- **Chunk processor**: Generates AI summaries for each chunk.
- **Vector store writer**: Stores chunks with embeddings in a SQLite database.

1. In the `Program.cs` file, delete any existing code and add the following code to configure the document reader:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureReader":::

    The `MarkdownReader` class reads Markdown documents and converts them into a unified format that works well with large language models.

1. Add code to configure logging for the pipeline:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureLogging":::

1. Add code to configure the AI client for enrichment and chat:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureChatClient":::

1. Add code to configure the document processor that enriches images with AI-generated descriptions:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureDocumentProcessor":::

    The `ImageAlternativeTextEnricher` uses large language models to generate descriptive alternative text for images within documents. That text makes them more accessible and improves their semantic meaning.

1. Add code to configure the embedding generator for creating vector representations:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureEmbeddingGenerator":::

    Embeddings are numerical representations of the semantic meaning of text, which enables vector similarity search.

1. Add code to configure the chunker that splits documents into semantic chunks:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureChunker":::

    The `SemanticSimilarityChunker` intelligently splits documents by analyzing the semantic similarity between sentences, ensuring that related content stays together.
This process produces chunks that preserve meaning and context better than simple character or token-based chunking.

1. Add code to configure the chunk processor that generates summaries:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureChunkProcessor":::

    The `SummaryEnricher` automatically generates concise summaries for each chunk, which can improve retrieval accuracy by providing a high-level overview of the content.

1. Add code to configure the SQLite vector store for storing embeddings:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ConfigureVectorStore":::

    The vector store stores chunks along with their embeddings, enabling fast semantic search capabilities.

1. Add code to compose all the components into a complete pipeline:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ComposePipeline":::

    The `IngestionPipeline` combines all the components into a cohesive workflow that processes documents from start to finish.

1. Add code to process documents from a directory:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="ProcessDocuments":::

    The pipeline processes all Markdown files in the `./data` directory and reports the status of each document.

1. Add code to enable interactive search of the processed documents:

    :::code language="csharp" source="snippets/process-data/Program.cs" id="SearchVectorStore":::

    The search functionality converts user queries into embeddings and finds the most semantically similar chunks in the vector store.

## Create sample data

1. Create a `data` folder in your project directory:

    ```bash
    mkdir data
    ```

1. Create a sample Markdown file in the `data` folder. For example, create a file named `sample.md` with the following content:

    ```markdown
    # Data Ingestion

    Data ingestion is the process of collecting and preparing data for AI applications.
+ + ## Key Concepts + + - Extract data from various sources + - Transform data into usable formats + - Load data into storage systems + + ## Benefits + + Data ingestion enables AI applications to work with custom data, improving accuracy and relevance. + ``` + +## Run the app + +1. Use the `dotnet run` command to run the app: + + ```dotnetcli + dotnet run + ``` + + The app processes all Markdown files in the `data` directory and displays the processing status for each document. Once processing is complete, you can enter natural language questions to search the processed content. + +1. Enter a question at the prompt to search the data: + + ```output + Enter your question (or 'exit' to quit): What is data ingestion? + ``` + + The app returns the most relevant chunks from your documents along with their similarity scores. + +1. Type `exit` to quit the application. + +## Clean up resources + +If you no longer need them, delete the Azure OpenAI resource and model deployment. + +1. In the [Azure Portal](https://aka.ms/azureportal), navigate to the Azure OpenAI resource. +1. Select the Azure OpenAI resource, and then select **Delete**. 
+ +## Next steps + +- [Data ingestion concepts](../conceptual/data-ingestion.md) +- [Implement RAG using vector search](../tutorials/tutorial-ai-vector-search.md) +- [Build a .NET AI vector search app](build-vector-search-app.md) diff --git a/docs/ai/quickstarts/snippets/process-data/ProcessData.csproj b/docs/ai/quickstarts/snippets/process-data/ProcessData.csproj new file mode 100644 index 0000000000000..939ba7d02bc68 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/ProcessData.csproj @@ -0,0 +1,29 @@ + + + + Exe + net10.0 + enable + enable + 2e2133d7-2b33-48e1-9938-79092b54ead4 + + + + + + + + + + + + + + + + + Always + + + + diff --git a/docs/ai/quickstarts/snippets/process-data/Program.cs b/docs/ai/quickstarts/snippets/process-data/Program.cs new file mode 100644 index 0000000000000..2dc1637312706 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/Program.cs @@ -0,0 +1,135 @@ +using Azure; +using Azure.AI.OpenAI; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DataIngestion; +using Microsoft.Extensions.DataIngestion.Chunkers; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.VectorData; +using Microsoft.ML.Tokenizers; +using Microsoft.SemanticKernel.Connectors.SqliteVec; + +class DataIngestionExample +{ + public static async Task Main() + { + // + // Configure document reader. + IngestionDocumentReader reader = new MarkdownReader(); + // + + // + using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => builder.AddSimpleConsole()); + // + + // + // Configure IChatClient to use Azure OpenAI. 
        IConfigurationRoot config = new ConfigurationBuilder()
            .AddUserSecrets<DataIngestionExample>()
            .Build();

        string endpoint = config["AZURE_OPENAI_ENDPOINT"];
        string apiKey = config["AZURE_OPENAI_API_KEY"];
        string model = "gpt-4o";
        string embeddingModel = "text-embedding-3-small";

        AzureOpenAIClient azureClient = new(
            new Uri(endpoint),
            new AzureKeyCredential(apiKey));

        IChatClient chatClient =
            azureClient.GetChatClient(model).AsIChatClient();
        // </ConfigureChatClient>

        // <ConfigureDocumentProcessor>
        // Configure document processor.
        EnricherOptions enricherOptions = new(chatClient)
        {
            // Enricher failures should not fail the whole ingestion pipeline,
            // as they are best-effort enhancements.
            // This logger factory can be used to create loggers to log such failures.
            LoggerFactory = loggerFactory
        };

        IngestionDocumentProcessor imageAlternativeTextEnricher = new ImageAlternativeTextEnricher(enricherOptions);
        // </ConfigureDocumentProcessor>

        // <ConfigureEmbeddingGenerator>
        // Configure embedding generator.
        IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator =
            azureClient.GetEmbeddingClient(embeddingModel).AsIEmbeddingGenerator();
        // </ConfigureEmbeddingGenerator>

        // <ConfigureChunker>
        // Configure chunker to split text into semantic chunks.
        IngestionChunkerOptions chunkerOptions = new(TiktokenTokenizer.CreateForModel(model))
        {
            MaxTokensPerChunk = 2000,
            OverlapTokens = 0
        };

        IngestionChunker<string> chunker = new SemanticSimilarityChunker(embeddingGenerator, chunkerOptions);
        // </ConfigureChunker>

        // <ConfigureChunkProcessor>
        // Configure chunk processor to generate summaries for each chunk
        IngestionChunkProcessor<string> summaryEnricher = new SummaryEnricher(enricherOptions);
        // </ConfigureChunkProcessor>

        // <ConfigureVectorStore>
        // Configure SQLite Vector Store
        using SqliteVectorStore vectorStore = new(
            "Data Source=vectors.db;Pooling=false",
            new()
            {
                EmbeddingGenerator = embeddingGenerator
            });

        // The writer requires the embedding dimension count to be specified.
        // For Azure OpenAI's `text-embedding-3-small`, the dimension count is 1536.
+ using VectorStoreWriter writer = new( + vectorStore, + dimensionCount: 1536, + new VectorStoreWriterOptions { CollectionName = "data" }); + // + + // + // Compose data ingestion pipeline + using IngestionPipeline pipeline = new(reader, chunker, writer, loggerFactory: loggerFactory) + { + DocumentProcessors = { imageAlternativeTextEnricher }, + ChunkProcessors = { summaryEnricher } + }; + // + + // + await foreach (IngestionResult result in pipeline.ProcessAsync( + new DirectoryInfo("./data"), + searchPattern: "*.md")) + { + Console.WriteLine($"Completed processing '{result.DocumentId}'. Succeeded: '{result.Succeeded}'."); + } + // + + // + // Search the vector store collection and display results + VectorStoreCollection> collection = writer.VectorStoreCollection; + + while (true) + { + Console.Write("Enter your question (or 'exit' to quit): "); + string? searchValue = Console.ReadLine(); + if (string.IsNullOrEmpty(searchValue) || searchValue == "exit") + { + break; + } + + Console.WriteLine("Searching...\n"); + await foreach (VectorSearchResult> result in collection.SearchAsync(searchValue, top: 3)) + { + Console.WriteLine($"Score: {result.Score}\n\tContent: {result.Record["content"]}"); + } + } + // + } +} diff --git a/docs/ai/quickstarts/snippets/process-data/data/sample.md b/docs/ai/quickstarts/snippets/process-data/data/sample.md new file mode 100644 index 0000000000000..f0bbe90de1d87 --- /dev/null +++ b/docs/ai/quickstarts/snippets/process-data/data/sample.md @@ -0,0 +1,18 @@ +# Sample Document + +This is a sample document for testing the data ingestion pipeline. + +## Introduction + +Data ingestion is the process of collecting and preparing data for AI applications. + +## Key Features + +- Document reading +- AI-powered enrichment +- Semantic chunking +- Vector storage + +## Conclusion + +These building blocks make it easy to create data ingestion pipelines. 
diff --git a/docs/ai/quickstarts/snippets/structured-output/Program.cs b/docs/ai/quickstarts/snippets/structured-output/Program.cs index ae5f6a125a24f..7be9627e26659 100644 --- a/docs/ai/quickstarts/snippets/structured-output/Program.cs +++ b/docs/ai/quickstarts/snippets/structured-output/Program.cs @@ -9,9 +9,10 @@ .Build(); string endpoint = config["AZURE_OPENAI_ENDPOINT"]; -string model = config["AZURE_OPENAI_GPT_NAME"]; string tenantId = config["AZURE_TENANT_ID"]; +string model = "gpt-4o"; + // Get a chat client for the Azure OpenAI endpoint. AzureOpenAIClient azureClient = new( diff --git a/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs b/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs index 118bf15e7ed55..870da58cf90d6 100644 --- a/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs +++ b/docs/ai/quickstarts/snippets/text-to-image/azure-openai/Program.cs @@ -10,7 +10,7 @@ string endpoint = config["AZURE_OPENAI_ENDPOINT"]; string apiKey = config["AZURE_OPENAI_API_KEY"]; -string model = config["AZURE_OPENAI_GPT_NAME"]; +string model = "gpt-image-1"; // Create the Azure OpenAI client and convert to IImageGenerator. 
AzureOpenAIClient azureClient = new( diff --git a/docs/ai/quickstarts/structured-output.md b/docs/ai/quickstarts/structured-output.md index f68feff535f00..54b4caa5f68ff 100644 --- a/docs/ai/quickstarts/structured-output.md +++ b/docs/ai/quickstarts/structured-output.md @@ -44,7 +44,6 @@ Complete the following steps to create a console app that connects to the `gpt-4 ```bash dotnet user-secrets init dotnet user-secrets set AZURE_OPENAI_ENDPOINT - dotnet user-secrets set AZURE_OPENAI_GPT_NAME gpt-4o dotnet user-secrets set AZURE_TENANT_ID ``` diff --git a/docs/ai/quickstarts/text-to-image.md b/docs/ai/quickstarts/text-to-image.md index 89acb7f041ae7..b1e36e4ec3bc6 100644 --- a/docs/ai/quickstarts/text-to-image.md +++ b/docs/ai/quickstarts/text-to-image.md @@ -54,7 +54,6 @@ Complete the following steps to create a .NET console application that generates ```bash dotnet user-secrets init dotnet user-secrets set AZURE_OPENAI_ENDPOINT - dotnet user-secrets set AZURE_OPENAI_GPT_NAME gpt-image-1 dotnet user-secrets set AZURE_OPENAI_API_KEY ``` diff --git a/docs/ai/toc.yml b/docs/ai/toc.yml index c18445eb66441..4e086831f136a 100644 --- a/docs/ai/toc.yml +++ b/docs/ai/toc.yml @@ -78,6 +78,8 @@ items: items: - name: Get started with the RAG sample href: get-started-app-chat-template.md + - name: Process custom data for AI + href: quickstarts/process-data.md - name: Implement RAG using vector search href: tutorials/tutorial-ai-vector-search.md - name: Scale Azure OpenAI with Azure Container Apps