diff --git a/huggingface_pipelines/hf_example.ipynb b/huggingface_pipelines/hf_example.ipynb new file mode 100644 index 0000000..6b41576 --- /dev/null +++ b/huggingface_pipelines/hf_example.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from datasets import load_dataset\n", + "ds = load_dataset(\"HuggingFaceTB/cosmopedia\", \"stories\", split='train[0:10]') # , streaming=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/home/artyomko/miniconda3/envs/sonar/lib/python3.10/site-packages/torch/cuda/__init__.py:628: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n", + "/data/home/artyomko/miniconda3/envs/sonar/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + " from huggingface_pipelines.text import (\n", + " EmbeddingToTextPipelineConfig,\n", + " HFEmbeddingToTextPipeline,\n", + " HFTextToEmbeddingPipeline,\n", + " TextToEmbeddingPipelineConfig, \n", + " TextSegmentationPipelineConfig, TextSegmentationPipeline,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:huggingface_pipelines.text:Text preprocessing model initialized.\n" + ] + } + ], + "source": [ + "# !python -m spacy download en_core_web_sm\n", + "text_to_segment_config = TextSegmentationPipelineConfig(columns=['text'], output_path='./output',\n", + " fill_value='N/A', source_lang='eng_Latn', handle_missing='fill', \n", + " output_column_suffix=\"sentences\")\n", + "pipeline_text2sent = TextSegmentationPipeline(text_to_segment_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "text_to_embedding_config = TextToEmbeddingPipelineConfig(\n", + " encoder_model=\"text_sonar_basic_encoder\",\n", + " columns=[\"text_sentences\"],\n", + " output_column_suffix=\"embedding\",\n", + " batch_size=2,\n", + " device=\"cpu\",\n", + " source_lang=\"eng_Latn\",\n", + " output_path=\"test\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:huggingface_pipelines.text:Initializing text to embedding model...\n" + ] + } + ], + "source": [ + "pipeline_sent2emb = HFTextToEmbeddingPipeline(text_to_embedding_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:huggingface_pipelines.pipeline:Starting to process dataset...\n", + "Processing dataset: 100%|██████████| 10/10 [02:40<00:00, 16.04s/ examples]\n" + ] + } + ], + "source": [ + "ds = pipeline_text2sent(ds)\n", + "ds = pipeline_sent2emb(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['text',\n", + " 'prompt',\n", + " 'text_token_length',\n", + " 'seed_data',\n", + " 'format',\n", + " 'audience',\n", + " 'text_sentences',\n", + " 'text_sentences_embedding']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.column_names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textprompttext_token_lengthseed_dataformataudiencetext_sentencestext_sentences_embedding
0Once upon a time, in a village called Kiwilan...Write an educational story (3-5 paragraphs) ta...520ultrachatstory_childrenyoung_children[Once upon a time, in a village called Kiwilan...[[0.009837754, 0.003806148, -0.008218781, 0.00...
1In a bustling town full of curious creatures ...Write an educational story (3-5 paragraphs) ta...381openhermes2.5story_childrenyoung_children[In a bustling town full of curious creatures ...[[0.0026006007, -0.008539987, 0.009180809, 0.0...
2Step 3: Embracing an Unconventional Warmup Ro...Write a real-life story shared by someone in a...580openhermes2.5story_redditgeneral[Step 3: Embracing an Unconventional Warmup Ro...[[-0.0052720755, 0.0026521664, 0.018520469, 0....
3Once upon a time, in a small town named Harmo...Write an educational story (3-5 paragraphs) ta...439ultrachatstory_childrenyoung_children[Once upon a time, in a small town named Harmo...[[-0.0006834645, 0.0034085037, -0.011185894, 0...
4On a bright, sunny day, two best friends, Tim...Write an educational story (3-5 paragraphs) ta...414openhermes2.5story_childrenyoung_children[On a bright, sunny day, two best friends, Tim...[[-0.013423425, 0.007448226, -0.010567971, 0.0...
5In the bustling city of New York, there was a...Write a story that explores a situation slight...677openhermes2.5story_life_lessonsgeneral[In the bustling city of New York, there was a...[[0.01869477, -0.0056158905, 0.0034837904, -0....
6Once upon a time in a small village named Pix...Write an educational story (3-5 paragraphs) ta...618ultrachatstory_childrenyoung_children[Once upon a time in a small village named Pix...[[0.010998854, -0.003700477, 0.010283143, 0.01...
7A couple of months ago, I stumbled upon an in...Write a real-life story shared by someone in a...755openhermes2.5story_redditgeneral[A couple of months ago, I stumbled upon an in...[[0.0069475807, 0.005131935, 0.0044895806, 0.0...
8I've always loved living in Murcia, Spain - i...Write a story in the style of real-life situat...629ultrachatstory_forumsgeneral[I've always loved living in Murcia, Spain - i...[[0.00025026768, 0.0066834814, 0.009650952, 0....
9Once upon a time, in a bustling city called N...Write an educational story (3-5 paragraphs) ta...509ultrachatstory_childrenyoung_children[Once upon a time, in a bustling city called N...[[0.0053570517, 0.0042380523, 0.0048442776, 0....
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Once upon a time, in a village called Kiwilan... \n", + "1 In a bustling town full of curious creatures ... \n", + "2 Step 3: Embracing an Unconventional Warmup Ro... \n", + "3 Once upon a time, in a small town named Harmo... \n", + "4 On a bright, sunny day, two best friends, Tim... \n", + "5 In the bustling city of New York, there was a... \n", + "6 Once upon a time in a small village named Pix... \n", + "7 A couple of months ago, I stumbled upon an in... \n", + "8 I've always loved living in Murcia, Spain - i... \n", + "9 Once upon a time, in a bustling city called N... \n", + "\n", + " prompt text_token_length \\\n", + "0 Write an educational story (3-5 paragraphs) ta... 520 \n", + "1 Write an educational story (3-5 paragraphs) ta... 381 \n", + "2 Write a real-life story shared by someone in a... 580 \n", + "3 Write an educational story (3-5 paragraphs) ta... 439 \n", + "4 Write an educational story (3-5 paragraphs) ta... 414 \n", + "5 Write a story that explores a situation slight... 677 \n", + "6 Write an educational story (3-5 paragraphs) ta... 618 \n", + "7 Write a real-life story shared by someone in a... 755 \n", + "8 Write a story in the style of real-life situat... 629 \n", + "9 Write an educational story (3-5 paragraphs) ta... 509 \n", + "\n", + " seed_data format audience \\\n", + "0 ultrachat story_children young_children \n", + "1 openhermes2.5 story_children young_children \n", + "2 openhermes2.5 story_reddit general \n", + "3 ultrachat story_children young_children \n", + "4 openhermes2.5 story_children young_children \n", + "5 openhermes2.5 story_life_lessons general \n", + "6 ultrachat story_children young_children \n", + "7 openhermes2.5 story_reddit general \n", + "8 ultrachat story_forums general \n", + "9 ultrachat story_children young_children \n", + "\n", + " text_sentences \\\n", + "0 [Once upon a time, in a village called Kiwilan... \n", + "1 [In a bustling town full of curious creatures ... \n", + "2 [Step 3: Embracing an Unconventional Warmup Ro... \n", + "3 [Once upon a time, in a small town named Harmo... \n", + "4 [On a bright, sunny day, two best friends, Tim... \n", + "5 [In the bustling city of New York, there was a... \n", + "6 [Once upon a time in a small village named Pix... \n", + "7 [A couple of months ago, I stumbled upon an in... \n", + "8 [I've always loved living in Murcia, Spain - i... \n", + "9 [Once upon a time, in a bustling city called N... \n", + "\n", + " text_sentences_embedding \n", + "0 [[0.009837754, 0.003806148, -0.008218781, 0.00... \n", + "1 [[0.0026006007, -0.008539987, 0.009180809, 0.0... \n", + "2 [[-0.0052720755, 0.0026521664, 0.018520469, 0.... \n", + "3 [[-0.0006834645, 0.0034085037, -0.011185894, 0... \n", + "4 [[-0.013423425, 0.007448226, -0.010567971, 0.0... \n", + "5 [[0.01869477, -0.0056158905, 0.0034837904, -0.... \n", + "6 [[0.010998854, -0.003700477, 0.010283143, 0.01... \n", + "7 [[0.0069475807, 0.005131935, 0.0044895806, 0.0... \n", + "8 [[0.00025026768, 0.0066834814, 0.009650952, 0.... \n", + "9 [[0.0053570517, 0.0042380523, 0.0048442776, 0.... " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# !pip install polars\n", + "# ds.to_pandas()\n", + "ds.to_polars()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sonar", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}