diff --git a/huggingface_pipelines/hf_example.ipynb b/huggingface_pipelines/hf_example.ipynb new file mode 100644 index 0000000..6b41576 --- /dev/null +++ b/huggingface_pipelines/hf_example.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from datasets import load_dataset\n", + "ds = load_dataset(\"HuggingFaceTB/cosmopedia\", \"stories\", split='train[0:10]') # , streaming=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/home/artyomko/miniconda3/envs/sonar/lib/python3.10/site-packages/torch/cuda/__init__.py:628: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n", + "/data/home/artyomko/miniconda3/envs/sonar/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + " from huggingface_pipelines.text import (\n", + " EmbeddingToTextPipelineConfig,\n", + " HFEmbeddingToTextPipeline,\n", + " HFTextToEmbeddingPipeline,\n", + " TextToEmbeddingPipelineConfig, \n", + " TextSegmentationPipelineConfig, TextSegmentationPipeline,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:huggingface_pipelines.text:Text preprocessing model initialized.\n" + ] + } + ], + "source": [ + "# !python -m spacy download en_core_web_sm\n", + "text_to_segment_config = TextSegmentationPipelineConfig(columns=['text'], output_path='./output',\n", + " fill_value='N/A', source_lang='eng_Latn', handle_missing='fill', \n", + " output_column_suffix=\"sentences\")\n", + "pipeline_text2sent = TextSegmentationPipeline(text_to_segment_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "text_to_embedding_config = TextToEmbeddingPipelineConfig(\n", + " encoder_model=\"text_sonar_basic_encoder\",\n", + " columns=[\"text_sentences\"],\n", + " output_column_suffix=\"embedding\",\n", + " batch_size=2,\n", + " device=\"cpu\",\n", + " source_lang=\"eng_Latn\",\n", + " output_path=\"test\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:huggingface_pipelines.text:Initializing text to embedding model...\n" + ] + } + ], + "source": [ + "pipeline_sent2emb = HFTextToEmbeddingPipeline(text_to_embedding_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:huggingface_pipelines.pipeline:Starting to process dataset...\n", + "Processing dataset: 100%|██████████| 10/10 [02:40<00:00, 16.04s/ examples]\n" + ] + } + ], + "source": [ + "ds = pipeline_text2sent(ds)\n", + "ds = pipeline_sent2emb(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['text',\n", + " 'prompt',\n", + " 'text_token_length',\n", + " 'seed_data',\n", + " 'format',\n", + " 'audience',\n", + " 'text_sentences',\n", + " 'text_sentences_embedding']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.column_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | text | \n", + "prompt | \n", + "text_token_length | \n", + "seed_data | \n", + "format | \n", + "audience | \n", + "text_sentences | \n", + "text_sentences_embedding | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "Once upon a time, in a village called Kiwilan... | \n", + "Write an educational story (3-5 paragraphs) ta... | \n", + "520 | \n", + "ultrachat | \n", + "story_children | \n", + "young_children | \n", + "[Once upon a time, in a village called Kiwilan... | \n", + "[[0.009837754, 0.003806148, -0.008218781, 0.00... | \n", + "
| 1 | \n", + "In a bustling town full of curious creatures ... | \n", + "Write an educational story (3-5 paragraphs) ta... | \n", + "381 | \n", + "openhermes2.5 | \n", + "story_children | \n", + "young_children | \n", + "[In a bustling town full of curious creatures ... | \n", + "[[0.0026006007, -0.008539987, 0.009180809, 0.0... | \n", + "
| 2 | \n", + "Step 3: Embracing an Unconventional Warmup Ro... | \n", + "Write a real-life story shared by someone in a... | \n", + "580 | \n", + "openhermes2.5 | \n", + "story_reddit | \n", + "general | \n", + "[Step 3: Embracing an Unconventional Warmup Ro... | \n", + "[[-0.0052720755, 0.0026521664, 0.018520469, 0.... | \n", + "
| 3 | \n", + "Once upon a time, in a small town named Harmo... | \n", + "Write an educational story (3-5 paragraphs) ta... | \n", + "439 | \n", + "ultrachat | \n", + "story_children | \n", + "young_children | \n", + "[Once upon a time, in a small town named Harmo... | \n", + "[[-0.0006834645, 0.0034085037, -0.011185894, 0... | \n", + "
| 4 | \n", + "On a bright, sunny day, two best friends, Tim... | \n", + "Write an educational story (3-5 paragraphs) ta... | \n", + "414 | \n", + "openhermes2.5 | \n", + "story_children | \n", + "young_children | \n", + "[On a bright, sunny day, two best friends, Tim... | \n", + "[[-0.013423425, 0.007448226, -0.010567971, 0.0... | \n", + "
| 5 | \n", + "In the bustling city of New York, there was a... | \n", + "Write a story that explores a situation slight... | \n", + "677 | \n", + "openhermes2.5 | \n", + "story_life_lessons | \n", + "general | \n", + "[In the bustling city of New York, there was a... | \n", + "[[0.01869477, -0.0056158905, 0.0034837904, -0.... | \n", + "
| 6 | \n", + "Once upon a time in a small village named Pix... | \n", + "Write an educational story (3-5 paragraphs) ta... | \n", + "618 | \n", + "ultrachat | \n", + "story_children | \n", + "young_children | \n", + "[Once upon a time in a small village named Pix... | \n", + "[[0.010998854, -0.003700477, 0.010283143, 0.01... | \n", + "
| 7 | \n", + "A couple of months ago, I stumbled upon an in... | \n", + "Write a real-life story shared by someone in a... | \n", + "755 | \n", + "openhermes2.5 | \n", + "story_reddit | \n", + "general | \n", + "[A couple of months ago, I stumbled upon an in... | \n", + "[[0.0069475807, 0.005131935, 0.0044895806, 0.0... | \n", + "
| 8 | \n", + "I've always loved living in Murcia, Spain - i... | \n", + "Write a story in the style of real-life situat... | \n", + "629 | \n", + "ultrachat | \n", + "story_forums | \n", + "general | \n", + "[I've always loved living in Murcia, Spain - i... | \n", + "[[0.00025026768, 0.0066834814, 0.009650952, 0.... | \n", + "
| 9 | \n", + "Once upon a time, in a bustling city called N... | \n", + "Write an educational story (3-5 paragraphs) ta... | \n", + "509 | \n", + "ultrachat | \n", + "story_children | \n", + "young_children | \n", + "[Once upon a time, in a bustling city called N... | \n", + "[[0.0053570517, 0.0042380523, 0.0048442776, 0.... | \n", + "