From 342e1668d39adf56ac29de4b1113420ba0def982 Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Thu, 27 Nov 2025 01:07:00 +0000 Subject: [PATCH 01/14] dedup working --- nemo_curator_semantic_dedup/README.md | 175 ++++++++++ .../__pycache__/helper.cpython-312.pyc | Bin 0 -> 15239 bytes nemo_curator_semantic_dedup/helper.py | 327 ++++++++++++++++++ .../image_dedup_example.py | 301 ++++++++++++++++ 4 files changed, 803 insertions(+) create mode 100644 nemo_curator_semantic_dedup/README.md create mode 100644 nemo_curator_semantic_dedup/__pycache__/helper.cpython-312.pyc create mode 100644 nemo_curator_semantic_dedup/helper.py create mode 100644 nemo_curator_semantic_dedup/image_dedup_example.py diff --git a/nemo_curator_semantic_dedup/README.md b/nemo_curator_semantic_dedup/README.md new file mode 100644 index 0000000..2cda472 --- /dev/null +++ b/nemo_curator_semantic_dedup/README.md @@ -0,0 +1,175 @@ +# Semantic Deduplication with NeMo Curator + +This example demonstrates how to perform GPU-accelerated semantic deduplication on large text datasets using [NVIDIA NeMo Curator](https://github.com/NVIDIA-NeMo/Curator) on Anyscale. + +## What is Semantic Deduplication? + +Unlike exact or fuzzy deduplication that matches text patterns, **semantic deduplication** identifies documents that are conceptually similar even if they use different words. This is achieved by: + +1. **Computing embeddings**: Converting text into dense vector representations using neural network models +2. **Clustering**: Grouping similar embeddings using GPU-accelerated k-means +3. **Similarity matching**: Identifying near-duplicates within clusters based on cosine similarity + +This approach is particularly effective for: +- Removing paraphrased content +- Identifying translated duplicates +- Cleaning datasets with rephrased information +- Improving LLM training data quality + +## Performance + +NeMo Curator leverages NVIDIA RAPIDS™ libraries (cuDF, cuML, cuGraph) for GPU acceleration: + +- **16× faster** fuzzy deduplication compared to CPU-based alternatives +- **40% lower** total cost of ownership (TCO) +- **Near-linear scaling** across multiple GPUs + +## Install the Anyscale CLI + +```bash +pip install -U anyscale +anyscale login +``` + +## Submit the Job + +Clone the example from GitHub: + +```bash +git clone https://github.com/anyscale/examples.git +cd examples/nemo_curator_semantic_dedup +``` + +Submit the job: + +```bash +anyscale job submit -f job.yaml +``` + +### Using Your Own Data + +To process your own dataset, set the `INPUT_DATA_PATH` environment variable: + +```bash +anyscale job submit -f job.yaml \ + --env INPUT_DATA_PATH=s3://your-bucket/your-data/ \ + --env OUTPUT_DATA_PATH=s3://your-bucket/output/ +``` + +Your input data should be in Parquet or JSONL format with at least two columns: +- `id`: Unique document identifier +- `text`: The text content to deduplicate + +## Configuration Options + +You can customize the pipeline behavior via environment variables: + +| Variable | Default | Description | +|----------|---------|-------------| +| `INPUT_DATA_PATH` | `/mnt/cluster_storage/semantic_dedup/input` | Path to input dataset | +| `OUTPUT_DATA_PATH` | `/mnt/cluster_storage/semantic_dedup/output` | Path for output data | +| `EMBEDDING_MODEL` | `sentence-transformers/all-MiniLM-L6-v2` | HuggingFace model for embeddings | +| `EMBEDDING_BATCH_SIZE` | `64` | Batch size per GPU for embedding computation | +| `NUM_CLUSTERS` | `1000` | Number of k-means clusters | +| `SIMILARITY_THRESHOLD` | `0.8` | Cosine similarity 
threshold (0.0-1.0) | + +### Embedding Model Options + +| Model | Quality | Speed | Use Case | +|-------|---------|-------|----------| +| `sentence-transformers/all-MiniLM-L6-v2` | Good | Fast | Quick experiments, large datasets | +| `intfloat/e5-large-v2` | Better | Medium | Production workloads | +| `BAAI/bge-large-en-v1.5` | Best | Slower | High-quality deduplication | + +### Tuning the Similarity Threshold + +- **Higher threshold (e.g., 0.9)**: Stricter matching, fewer duplicates removed +- **Lower threshold (e.g., 0.7)**: Looser matching, more duplicates removed + +Start with 0.8 and adjust based on your quality requirements. + +## Understanding the Example + +### Pipeline Architecture + +``` +┌─────────────────┐ +│ Input Data │ Parquet/JSONL files +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Embedding │ GPU-accelerated transformer inference +│ Creation │ (sentence-transformers, E5, BGE, etc.) +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Clustering │ GPU-accelerated k-means (cuML) +│ (k-means) │ Groups similar embeddings +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Duplicate │ Pairwise similarity within clusters +│ Extraction │ Identifies semantic duplicates +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Deduplicated │ Original data minus duplicates +│ Output │ +└─────────────────┘ +``` + +### Key Components + +- **EmbeddingCreator**: Computes dense vector embeddings using pre-trained models +- **ClusteringModel**: GPU-accelerated k-means clustering with cuML +- **SemanticClusterLevelDedup**: Finds duplicates within clusters using cosine similarity + +### Scaling Considerations + +- **Number of clusters**: Should be roughly √(n_documents) for balanced cluster sizes +- **Memory**: Each GPU should have enough memory for the embedding model (~4GB for MiniLM, ~8GB for larger models) +- **Batch size**: Increase for faster processing, decrease if running out of GPU memory + +## Output + +The pipeline produces: + +1. **Deduplicated dataset**: Parquet files in `{OUTPUT_DATA_PATH}/{timestamp}/deduplicated/` +2. **Cache files**: Intermediate embeddings and clusters for debugging + +Example output log: + +``` +============================================================ +Semantic Deduplication Complete! +============================================================ +Original documents: 1,000,000 +Duplicates removed: 127,543 +Final documents: 872,457 +Reduction: 12.75% +Output saved to: /mnt/cluster_storage/semantic_dedup/output/20250115T143022Z/deduplicated +============================================================ +``` + +## Monitoring + +View your job progress in the [Anyscale Console](https://console.anyscale.com/jobs). The Dask dashboard link will be printed in the logs for detailed task monitoring. + +## Cost Optimization Tips + +1. **Use spot instances**: The job.yaml is configured with `market_type: PREFER_SPOT` for cost savings +2. **Start small**: Test with a subset of your data before running on the full dataset +3. **Choose the right GPU**: A10G instances offer a good balance of cost and performance +4. 
**Tune batch size**: Larger batches = better GPU utilization = faster processing + +## Learn More + +- [NeMo Curator Documentation](https://docs.nvidia.com/nemo/curator/latest/) +- [Semantic Deduplication Guide](https://docs.nvidia.com/nemo/curator/latest/curate-text/process-data/deduplication/index.html) +- [NeMo Curator GitHub](https://github.com/NVIDIA-NeMo/Curator) +- [Anyscale Documentation](https://docs.anyscale.com/) + diff --git a/nemo_curator_semantic_dedup/__pycache__/helper.cpython-312.pyc b/nemo_curator_semantic_dedup/__pycache__/helper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0e10dfe70c3ed15c17633f6b4060378992a8e3a GIT binary patch literal 15239 zcmbt*YjhJ=nqZYwl1gvM?>B{CSb%NJBY+_R;|CbKZ4yWb4vCI%l|e?9oGKZxB6mBT zKHK&zXAslb!I?A@XFKOa*|6K*=V51GBq4Ly&g@w!5|O6j>|uBEW7so4oQ63`(|gYD z_uW#Jgi&w0cgyy@U)_7Z?>_3g-~ArfpV{pe3Z9=VZ1VrAgQEThU(`ph0Q7@PG(}yb zc#5Y3)D%5N(+KMVx-lKe^<#RHGh+sCX$2sPL+!%0Am5h~;I#Zx@s%)%`z-*v=s(h@RrUbiagLdXX#Z={3 zC8@Ins-~*PssXl+)$mn(HE;WgVXT&~;q5| zD*$TY+jtLO2~g{-&a?SGeg+RMTYbS`DC`USLqTZ--c8d!G3@sR0A!AZLYUi6oI2L; z?LE@pJ2W_Q7z&Jm(Al$s2#{fL%6C@4w&7R#Dafs)VD!wSFdl|i3wb@{3y+_>e+N1o z&wYKu1IQz9P!#>(?{pM3MA5(vVc-en;pwQ3rz7z4A=Kj2N4=b+s5!lvic*t>bd+i; zN{04hwn*JXc4AdMGWhU7?o(hEtPvhJ9 zHi{CrKr6dim(M@Ufo}je6RM0tbsV`Ge$;;7fcL^)X=+?YQ^%>Oo~O_E0sXeHQ>8lT zQlWg#8>0HRftRB+q^9B{ew(^(RA)@M`2X5}R3FuyhqadI%l|rDuJ$xpr9%0fr&X%j zEOp8(Y3g!{p}uG2=BN(FY{8LVXH-fwHB0TLF5iN`Yw))Se?v`F7e&^Ifv-c>aW`dv z@m++KItBa!y}five~P+DH&J0|qdb!(DwNN|Why*bp+fn5Lf1@*?Q^iAxwUWjo*5Lb zJsUMVzRss$mFEmUG+h2b$uH6uDP@!}r$70p`ir#3I&vQw@jiax4`3ICVF%()Gll(A zLTDx&F~8Ik4hvJ$VJ=esGB@D!2LzrAhd4fTF&GH>crN1T=8l{=ag38-@6JfvBV9Ka zVR`}qK;V1|uEb3!h!Y`^^VnsB=nI||Wc^toESr4((7AAUTDJEF{6a8%Lg`pGD)eL% z68DY{$&4uYc+ms{l?}XbX6CGHRWSNRF(d*VX*M|S56QM11*O;^1q5N*W0XyjAW0xw zWc`d7kWHcx7X5+*Xvz>}RurUZ5IurS3leTxjy#VJcAg7O37w*Ewv!hod^3S?r*O$P zH60M7&Y&=>1<`gwb!n`w!GtCnKIk@ zliT|597=B+jlGtsJeICJmN;=bTUvYVz|{lqo&CJDJ-gBKe*L@kzu1te-S(}4s(O}& zW~s_!YeuS~e(BXed1-+)_%2pnNF27W^9c~TjN?aRpxnEM;WX?9e#Z{wsXl8 zn@w=LR>oJJPq_D`nf+O-bFCH$eRr>R+jnbZK;I=Qk9`NDg5PB2vEM$lQ0}UXt10Pf zdhcYWWpA=&Z^GIAYf6VxcUCO9|LC<|n;uF35mxobrTshUKi$%|LH7|&_vzS=bb5phoBAwOS7*NMqWel6x7{X$t4!E(LsNgJ;Zt2x|7OFdElBgz%?2## zWB`8bWHQ5a$jMX%ESWo*ph$TpbLWxzj`^f^^5jdSw=RnC%xxPFgi9b z`{0~)tGM}GJ>k?b!nr&`QKw;5=FB9u%vqyW-dqhRb2fl11hV6K{x%ae^VYYS>$*8d z)E>1(9lY)5`Z*_@U#0mTR4D3H`=F^PTdYO?oQj&FcHVx&@nKFg$YWkU1}9RbO0(E` z7G-%S@AA`sOl#+vstcgLSnSl)&bx4}M!8Of@;RI+ZgstPQFHG6+WyF;jeMUz`qD4*wP-6&H>O;h1cwM7(o06Bs$$5icivP-Sc=X`}) zJ6HC5%&zjW*_9W}u1uv9E&txxm4DCdG+K|%ZihPR$(<^c&(#^Lqxhb=R{S1wt$J*( zl?8Lv)+}22y>qSnp1EqY3g;T5FP~!2;&^Olz!%=F&UkW<3gz=?UZz!rwcOMjqooopBr~F~rK{79>Cf;))DeTdSsHDo~@lY@b zmJ%pZ%!&RNPssY`pBo0EJ9&>@+=wkq}ChMDk0`yL$erSKy|$D|YrM%WH8 z`Cgq7!kz|M7m{RV+7~`2Gm{~IP()=}MA-@YUGK48ZL~S9@pw+TGYi3)DFGBRL1z77 z0W`yll5F%%PYXewXkB7G(lDL{$^s~juwFPUnF&D)+2WV{LC`ycotc`JWZg6`oBMoW z-+%~9AzQ*BuR75h=m%|5*#gS;^i0^x`$gGIio7rr*#-mjVqdt3vUy4f`*p01g7e9hnh8wWn~K)!OFe z=ih(j-B(gAyS}J>YH?_xXYo>uj_t@+R4=t&J-=``TT&hqu5MoFy;t6_B*jLTdzQzS zcPC1=KuK*weD}>A$h-%NG+BoiY7GBURV*&VjcM ztn656SqZP$(zQ>=*j27+dEgh;82hF@>ngowy=q-La^uV|CqJ0HS&`~E7_+8bJ%G8f zd3iAIdv|!HI@!E0#qG~g5gr~67^F|f2Zt?3{#P0Dq_*QO!L}t8khYyYc*8qVaUiw_zipo`2E~@^wGEQF+2%#+y9bY z2d?~aPv@|O{-}!{+F<->yB^_PhYUk)`cJy)p+@5;`xt}|SfS#!g&t~QZd+>*ZfP2N zM*pcoKz{Yhi z!)Z-y6~mkntRg0AgdZJx8*8Wu-BsOXH$%;t=AgDB0N8DPlfr?o!T1%;- zpk?zqg7BKBiJBk&pigyNKWCgXT%=I%d(3wjRdu{D9*yJyKVZw8HLO`Zlbh9=e9k+g z7Wm$-sP%FGCGq=Ne`uWry?74tyx-5`9ytN37%N=zOJQ&5e8e;iK0NT~MQS+ky`e{p zn-D`&Ty76@sCtMfdm`53(*b{&^9RBC7qN0=b4y%kf{WNSRKYiX4xESPoFot=P!UmP 
zMBK-e!`>e}o5K`CP!6I7*;8hCK^m8tnII^r`ta0rq&n|t6v4-KLExa21Ko%dk!w8_ z;xmZHPWjFYP%4SfU4VtT+rcVlO7>c@0EhEveQOUA{8t2s( zMAq>WiUfTHTN{;S1WgwL95OR86C4MfbZQ#aKoMW7S8IwWX^2)q7DmLli=A zXD&k13y5=pMoIh7Mj1>fVPlr%6L*=-a2i`|S2iqeSgK2zTjvk0GUkQDZ}u+fes(yv zH&ORg!m>Nf?0Lvi1N12US2Ky{Ur4|BQsVS0sTaKo-*js9)pWo3P_J_s|GHoN=ciXr z-J%z&V~1j|E_E&aC|TQ)DC_*3-InMV*Yw~M{Dy*t(4)b!F!Xc%(!kr+*xd5aiZju9 z@RmJc8A&svI3CB6{jRw+-n{CpNLOyXS(A45Eih1FwqHJxwYjbgE)K@{rGb>KX%)1w ziJxuBx|_4^s)uHS!}5?an62Qdv}}4SVhVKOP}kj zNcigYUC>vN@=5L3&3E%*4g`t}Qp_K_#c8Ob9NHUlBXH!YsgPJ3)faOp0?i_=Rhr=X z(;ufi#>m!Vq5uL?;8b&r6NT}R$a5Ip@C7;I>F2=I1%tZ7qnCAlUbcor|5<;~7x2Q1 zU3(!GuG1$-V2)z!Kms~oAqJ(1i#gaVOq~&U9weu1*FcaDQyw#zxFn_k6fFvbhXBf*S;2+z%Eq8@5eH}q1$fB4;~k;@i;Sb@-G1DCRfH( zn>5uXOq;T8U75E1$+rD~YD=;$q^>4us!5ny?`_(a+0>of)ScPXlibvk+SCh;tVy%y?fm8WWpV!(zEa8 zVA3;?YB>y_-26G)m^C|Ndo!giN%%Fl6t){kdU{hWeScs(I87|A3X5o;8V01Xh%qPM zi%Iv_XY%Y(R3g*GuvBIN8&C`BL^X4z(7c(nz)03TBn%G0q5QQ32={0P4unLU>k!~o zmCS0-QG-{L>A+H&agoPF^Co}Y0Tg3;mIg*?Lq<_VCTcRsCZIPF&6D#_aJ~x=I`*9j z2puCL`gKKg>5AQucr2>eLp%|Cm(8fXf)CU;D?4)$SKtpN=z16thcHn@Aw-`q_yRM6 z_ya6MTiRn3#}OuMw*-(X8>jsN*{X<1#VweVx99{b2o@{KnVYbu(i%F4p^5JUtvm21 z1t1}OsR^#H%c`%9UL9Q?%rx&wHt$K5?OiahRTNkW%*_~@{ifwz63Guil zO2)4N_`~dJP%rew;){GCqiO6{?1(3f1~IoAnuQ9CrXBlvYa5tJQ!9X(im+eQkhg4$ z-LuHkx?xo9Ltr_RhmABK!yP?7I+7Ph1w7YrA4LsVF-CuoM`a#mulO`38t098EaGu2 zIf2P@m^{v0d6p7KvGy1wDs!PY6BXtnj#R?D=djQZ$rIex_Y@v6BXA3x78%f?PM53-IF=h<{(hNU2@6Vj2 zAg0U`?@dna2fskM(kH0%P{UhpSXF1K>W1|3Bf|G<*kq zXJnqwj^2# zJ0ExypSiNAa}urSbx%}_oPx(`lUf;dMFCsIg&L1mQvw*dwTJR8TDdD)R)}$Vj6@?D z9)1eix2x?ZJ5(s2=WiCwl}F2q#be<%f#Zg=fFABb@_^@u-*eP=?L?vOS1#PPe`qelb`mK(#kkRBAQSg7DA#&JS%k*2npuSbqvD^Qa3jY{vPye73o58jrpgAfIld3 zO2kehlMCRX3D(W^`N3F%XzDC@(qIG^A=n5$taC!)5T$dWnE=n75jY8^#A}2k<#rkI zAd!K0b0afg1d2GSTqVxd9~=+N@B-uZoXcY5iVK+AjgP9h_{S$tH-wF#2H~zp9lmrpIbud%a z#?7s-AC$ZxRQM$htjd|GAR&4}lz^2Z?h%*{ep2!oaXCgJ)&id)_!x85ivO`hTL%ba zD>EQoN;tMH`B@WX4y=fO23^Q{v@xCnNG&KQ4cD*r0FW@?$$x3a1&#m5&woZ3j#oP2 zWO|!S^AV%MSP@noN5sK#?c9l+-U>G$OYr@e63!+o+0BU#oNAfn_{=y4J||`Zfm!9# zqeC@6{fH^|O(G?uBq)rIPlcPhIZrileTuUsX ztp>Ov-L;K4KS3qNFfw{b!+wnA=e?bZ{&EB<^de;|nEq2|fXl%PKn!`YAL~$$B}SHr zN(k|~iZ37p=NEWoWjmRL64HgsmDYkiA-kX8Yw}^lw*Xak`vTw@A>ItdZz7QplemJF zEMZX`32~a*a<7D6WLM5rtwqU2REvm57UI_sJtr;}mue!aP7J~uvPBsakiL{x`ia3Q zTQv-RLc)U-Ma7ekqVm@x%PQknV)vk`6?kU<643t}{7DDl6amj{qw1NpF7tPpE#Pse zu1zrIU%DF>3|Y21!*WTMTP}aU=G~gR>{hTQOE!FCq-^!8tUck_me^FP9f``$_*~-Yk;MMd#HcS}Ig@6_zpmrn8GLK-J^ua4cPA5Fhf=M*sk*)dQw`m@ z-1A3P?Ip3UH^&n-TT&HU6CDRK_Ja#>;j}I8j(35ZHo4_sqOxbf1{O%&miV4qmUQh< zf~{JuY+b%|^XQ%abmemkw$;k&YZtFxycWG0T^>qw^`t5fL194@)*3&ZD(zliq0hPo zKnH4$@++qoPcOAWjN6VeZqN3W{a<(vCS5Ht%hKT2)y>(mx@=iPc7rFo`2cu&Yf56q zhc2pV-_5lp|fZ|@uTtmE3d{kB)GjdD{kv=4W|x1n>g@X;<>SeWVemxZ&3qlv5WooxCqB9{naHlgLXcdQ!Dx>n5Hy*u$TtXEpnNSn6*|O!0d&QG z35iU1d;lRF*{Pw!T@etR%C`O`w2)8{gFQnwiTH035tHq3luH}{;1sk`&W0u5vSle6 z?^^OE?CtSq<5FBmSa&9vor(nlwubf~3`FS%+W|lqIm}KU0ns!1k92^ z;BWw+8QhBJ^*DPn>!*4Z!Il%~z95KpatLsPeG(i|aIpF~h(y6XB+icpARcgH$9;60 z8t-Oq~cVFdyOW3@@`maMqsnho$wFo%k0>5tDtm9yjk(40t~!Vb(|PSPs+-g-{9F^r_yR31s)KdbN4CKotqXz} z=8g>xE4mcwEV;{4Ii-y_%t1z@rqO{Wx1zNvM>&%>o#EaU$X7@O;XK3)--8}yn{rwx z7_#xq%*2EsKF)Ce7}|-L9Kr=gH5{yEyYtG};@Cet4a>^de>(E|NbITQt@99&>dZ10 z5-y{`Cbqige?V3iBAEDPJgmq;7AepDw?c>VuLrqnXUaV*^jRu5;`H$C@pctni;LKZ z$B2t^BcY%Gbik~kBDQJ1165UWBTTthLD0vL#uu1mAdwk@VIXmJ69+K7d<~ne>t{WdxQdPPRP!Usx73S& z4UO(26OTYl_TNDsabf7=r9o5`h_&#t`0tSrng4%4h|C`qLmaic)7YNO1KUR|{!mA5b9pw&7O!_=2Dne~dPNYI6uHr(>?h*4`Zrki3?_#m z0k0_pp2HA=mPlwELOyUw(4PxhgDF6w<`77loyHgR%#r(;vT+*3CtOB+9*N-p;?0ON z%75tSVb>FntP)!Ky%lwnGj;JsTrULhiDVsPd_NL(7PVg80md$Ra8S$wxTjy 
zzj4j_EKPT0OPkj82;6g*tr-viI#oc2fGh>2W&*-L bytes | None: + for attempt in range(1, retries + 1): + try: + async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response: + if response.status == HTTP_OK: + return await response.read() + elif attempt > 1: + logger.debug(f"[Attempt {attempt}] Failed to download {url}: HTTP status {response.status}") + except (aiohttp.ClientError, asyncio.TimeoutError) as e: + if attempt > 1: + logger.debug(f"[Attempt {attempt}] Failed to download {url}: {e}") + + if attempt < retries: + await asyncio.sleep(1) + + logger.debug(f"All {retries} attempts failed for {url}") + return None + + +async def process_batch(batch: pd.DataFrame, output_dir: str, batch_num: int) -> None: + tar_filename = os.path.join(output_dir, f"{batch_num:05d}.tar") + + metadatas = [] + # Set timeout and connection limits for the session + timeout = aiohttp.ClientTimeout(total=15) + connector = aiohttp.TCPConnector(limit=256, limit_per_host=16) + + async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session: + tasks = [] + for i, (_, row) in enumerate(batch.iterrows()): + caption = row["TEXT"] + url = row["URL"] + + key = f"{batch_num:05d}{i:04d}" + + meta = {"url": url, "caption": caption, "key": key} + metadatas.append(meta) + + tasks.append(fetch_image_bytes(session, url, retries=3)) + + results = await asyncio.gather(*tasks, return_exceptions=True) + + with tarfile.open(tar_filename, "w") as tar: + for i, result in enumerate(results): + # Only proceed for successful downloads (bytes) + if isinstance(result, bytes) and result: + key = f"{batch_num:05d}{i:04d}" + + # Add image bytes + jpg_info = tarfile.TarInfo(name=f"{key}.jpg") + jpg_info.size = len(result) + tar.addfile(jpg_info, fileobj=io.BytesIO(result)) + + # Add caption text + caption_bytes = str(metadatas[i]["caption"]).encode("utf-8") + txt_info = tarfile.TarInfo(name=f"{key}.txt") + txt_info.size = len(caption_bytes) + tar.addfile(txt_info, fileobj=io.BytesIO(caption_bytes)) + + # Add JSON metadata + json_bytes = json.dumps(metadatas[i]).encode("utf-8") + json_info = tarfile.TarInfo(name=f"{key}.json") + json_info.size = len(json_bytes) + tar.addfile(json_info, fileobj=io.BytesIO(json_bytes)) + + # Write parquet + meta_df = pd.DataFrame(metadatas) + parquet_path = os.path.join(output_dir, f"{batch_num:05d}.parquet") + meta_df.to_parquet(parquet_path) + + +def process_parquet_chunk(chunk: tuple[int, pd.DataFrame], output_dir: str) -> None: + batch_num, batch = chunk + + asyncio.run(process_batch(batch, output_dir, batch_num)) + + +def download_webdataset( + parquet_path: str, + output_dir: str, + entries_per_tar: int = 10000, + num_processes: int = 2, +) -> None: + os.makedirs(output_dir, exist_ok=True) + + # Read the parquet file + df = pd.read_parquet(parquet_path) + print(f"Loaded {len(df)} entries from parquet file") + + # Split the dataframe into chunks for multiprocessing + chunks = [ + (batch_num, df[i : i + entries_per_tar]) for batch_num, i in enumerate(range(0, len(df), entries_per_tar)) + ] + print(f"Split into {len(chunks)} chunks of {entries_per_tar} entries each") + + # Use multiprocessing to process chunks in parallel with progress tracking + with Pool(processes=num_processes) as pool: + func = partial(process_parquet_chunk, output_dir=output_dir) + + # Use tqdm to track progress of chunk processing + list(tqdm( + pool.imap(func, chunks), + total=len(chunks), + desc="Processing chunks", + unit="chunk" + )) + + # Best-effort cleanup of legacy tmp dir from previous versions 
+ tmp_dir = os.path.join(output_dir, "tmp") + try: + if os.path.isdir(tmp_dir) and not os.listdir(tmp_dir): + os.rmdir(tmp_dir) + except OSError as e: + logger.debug(f"Failed to remove tmp dir {tmp_dir}: {e}") + + +def _prepare_metadata_record( + image_obj: ImageObject, + new_id: str, + old_id_col: str | None, +) -> dict: + """Prepare metadata record for an image object.""" + metadata_record = { + "id": new_id, + "original_id": image_obj.image_id, + "original_path": image_obj.image_path, + } + + # Preserve original ID in specified column if requested + if old_id_col: + metadata_record[old_id_col] = image_obj.image_id + + # Add scores and embeddings to metadata + if image_obj.aesthetic_score is not None: + metadata_record["aesthetic_score"] = image_obj.aesthetic_score + if image_obj.nsfw_score is not None: + metadata_record["nsfw_score"] = image_obj.nsfw_score + if image_obj.embedding is not None: + # Convert embedding to list for JSON serialization + metadata_record["embedding"] = image_obj.embedding.tolist() + metadata_record["embedding_dim"] = len(image_obj.embedding) + + # Add original metadata + if image_obj.metadata: + metadata_record.update(image_obj.metadata) + + return metadata_record + + +def _add_caption_to_metadata(image_obj: ImageObject, metadata_record: dict) -> None: + """Add caption/text to metadata record.""" + if "caption" in image_obj.metadata: + metadata_record["caption"] = str(image_obj.metadata["caption"]) + elif "text" in image_obj.metadata: + metadata_record["caption"] = str(image_obj.metadata["text"]) + elif "TEXT" in image_obj.metadata: + metadata_record["caption"] = str(image_obj.metadata["TEXT"]) + + +def _add_image_to_tar(tar: tarfile.TarFile, image_obj: ImageObject, new_id: str) -> None: + """Add image data to tar file if available.""" + if image_obj.image_data is not None: + # Convert numpy array to PIL Image and save as bytes + image_pil = Image.fromarray(image_obj.image_data) + image_bytes = _image_to_bytes(image_pil) + + # Add image to tar + image_info = tarfile.TarInfo(name=f"{new_id}.jpg") + image_info.size = len(image_bytes.getvalue()) + tar.addfile(image_info, fileobj=image_bytes) + + +def _add_json_to_tar(tar: tarfile.TarFile, metadata_record: dict, new_id: str) -> None: + """Add JSON metadata to tar file.""" + json_data = json.dumps(metadata_record, indent=2) + json_bytes = json_data.encode("utf-8") + json_info = tarfile.TarInfo(name=f"{new_id}.json") + json_info.size = len(json_bytes) + tar.addfile(json_info, fileobj=io.BytesIO(json_bytes)) + + +def save_imagebatch_to_webdataset( + image_batches: list[ImageBatch], + output_path: str, + samples_per_shard: int = 10000, + max_shards: int = 5, + old_id_col: str | None = None, +) -> None: + """ + Save ImageBatch objects to WebDataset format with resharding. 
+ + Args: + image_batches: List of ImageBatch objects from pipeline output + output_path: Directory path where the WebDataset should be saved + samples_per_shard: Number of samples to include in each tar file + max_shards: Order of magnitude of max shards (for zero-padding filenames) + old_id_col: If specified, will preserve the original image_id in this column + """ + os.makedirs(output_path, exist_ok=True) + + # Flatten all ImageObjects from all batches + all_image_objects = [] + for batch in image_batches: + all_image_objects.extend(batch.data) + + if not all_image_objects: + print("No images to save") + return + + print(f"Processing {len(all_image_objects)} images into {samples_per_shard} samples per shard") + + max_samples_per_shard = math.ceil(math.log10(samples_per_shard)) + + # Process images in shards + shard_id = 0 + for i in range(0, len(all_image_objects), samples_per_shard): + shard_images = all_image_objects[i:i + samples_per_shard] + + # Create output file paths + parquet_filename = _name_partition(shard_id, max_shards=max_shards) + tar_filename = _name_partition(shard_id, max_shards=max_shards, ext="tar") + parquet_path = os.path.join(output_path, parquet_filename) + tar_path = os.path.join(output_path, tar_filename) + + # Prepare metadata for parquet + metadata_records = [] + + # Create tar file with images and metadata + with tarfile.open(tar_path, "w") as tar: + for sample_idx, image_obj in enumerate(shard_images): + # Generate new ID combining shard and sample indices + new_id = _combine_id( + shard_id, + sample_idx, + max_shards=max_shards, + max_samples_per_shard=max_samples_per_shard + ) + + # Prepare metadata record for parquet + metadata_record = _prepare_metadata_record(image_obj, new_id, old_id_col) + metadata_records.append(metadata_record) + + # Save image data if available and requested + _add_image_to_tar(tar, image_obj, new_id) + + # Store caption/text in metadata (no separate .txt file) + _add_caption_to_metadata(image_obj, metadata_record) + + # Add JSON metadata to tar + _add_json_to_tar(tar, metadata_record, new_id) + + # Save metadata to parquet + metadata_df = pd.DataFrame(metadata_records) + metadata_df.to_parquet(parquet_path, index=False) + + print(f"✓ Saved shard {shard_id:0{max_shards}d} with {len(shard_images)} samples") + print(f" - Tar file: {tar_filename}") + print(f" - Parquet file: {parquet_filename}") + + shard_id += 1 + + print(f"\nSuccessfully saved {len(all_image_objects)} images to {shard_id} shards") + print(f"Output directory: {output_path}") + + +def _name_partition( + partition_index: int, + max_shards: int = 5, + ext: str = "parquet", +) -> str: + """Generate partition filename with proper zero-padding.""" + return f"{partition_index:0{max_shards}d}.{ext}" + + +def _combine_id(shard_id: int, sample_id: int, max_shards: int = 5, max_samples_per_shard: int = 4) -> str: + """Combine shard and sample IDs into a unique identifier.""" + int_id = sample_id + (10**max_samples_per_shard) * shard_id + n_digits = max_samples_per_shard + max_shards + return f"{int_id:0{n_digits}d}" + + +def _image_to_bytes(image_pil: Image.Image, image_format: str = "JPEG") -> io.BytesIO: + """Convert PIL Image to BytesIO object for tarfile.""" + buffer = io.BytesIO() + image_pil.save(buffer, format=image_format) + buffer.seek(0) + return buffer \ No newline at end of file diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py new file mode 100644 index 0000000..474f465 --- /dev/null +++ 
b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -0,0 +1,301 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import time + +from helper import download_webdataset + +from nemo_curator.core.client import RayClient +from nemo_curator.pipeline import Pipeline +from nemo_curator.stages.deduplication.semantic import SemanticDeduplicationWorkflow +from nemo_curator.stages.file_partitioning import FilePartitioningStage +from nemo_curator.stages.image.deduplication.removal import ImageDuplicatesRemovalStage +from nemo_curator.stages.image.embedders.clip_embedder import ImageEmbeddingStage +from nemo_curator.stages.image.io.convert import ConvertImageBatchToDocumentBatchStage +from nemo_curator.stages.image.io.image_reader import ImageReaderStage +from nemo_curator.stages.image.io.image_writer import ImageWriterStage +from nemo_curator.stages.text.io.writer.parquet import ParquetWriter + + +def create_image_embedding_pipeline(args: argparse.Namespace) -> Pipeline: + """Create image curation pipeline with file partitioning, image reading, embedding, deduplication.""" + + # Define pipeline + pipeline = Pipeline(name="image_curation", description="Curate images with embeddings and quality scoring") + + # Stage 0: Partition tar files for parallel processing + pipeline.add_stage(FilePartitioningStage( + file_paths=args.input_wds_dataset_dir, + files_per_partition=args.tar_files_per_partition, + file_extensions=[".tar"], + )) + + # Stage 1: Read images from webdataset tar files (now runs in parallel) + pipeline.add_stage(ImageReaderStage( + task_batch_size=args.batch_size, + verbose=args.verbose, + num_threads=16, # More threads for I/O + num_gpus_per_worker=0.25, + )) + + # Stage 2: Generate CLIP embeddings for images + pipeline.add_stage(ImageEmbeddingStage( + model_dir=args.model_dir, + num_gpus_per_worker=args.embedding_gpus_per_worker, + model_inference_batch_size=args.embedding_batch_size, + verbose=args.verbose, + )) + + # Stage 3: Convert embeddings to document batch + pipeline.add_stage(ConvertImageBatchToDocumentBatchStage(fields=["image_id", "embedding"])) + + # Stage 4: Save embeddings to parquet file + pipeline.add_stage(ParquetWriter( + path=args.embeddings_dir, + )) + + return pipeline + +def create_embedding_deduplication_workflow(args: argparse.Namespace) -> Pipeline: + """Create image deduplication pipeline with embedding deduplication.""" + return SemanticDeduplicationWorkflow( + input_path=args.embeddings_dir, + output_path=args.removal_parquets_dir, + id_field="image_id", + embedding_field="embedding", + n_clusters=100, + eps=0.01, + read_kwargs={"storage_options": {}}, + write_kwargs={"storage_options": {}}, + verbose=args.verbose, + ) + +def create_image_deduplication_pipeline(args: argparse.Namespace) -> Pipeline: + """Create image deduplication pipeline with image deduplication.""" + # Define pipeline + pipeline = Pipeline(name="image_deduplication", description="Deduplicate 
images with image deduplication") + + # Stage 0: Partition tar files for parallel processing + pipeline.add_stage(FilePartitioningStage( + file_paths=args.input_wds_dataset_dir, + files_per_partition=args.tar_files_per_partition, + file_extensions=[".tar"], + )) + + # Stage 1: Read images from webdataset tar files (now runs in parallel) + pipeline.add_stage(ImageReaderStage( + task_batch_size=args.batch_size, + verbose=args.verbose, + num_threads=16, # More threads for I/O + num_gpus_per_worker=0.25, + )) + + # Stage 2: Read removal list from parquet file and filter images + pipeline.add_stage(ImageDuplicatesRemovalStage( + removal_parquets_dir=args.removal_parquets_dir + "/duplicates", + duplicate_id_field="id", + verbose=args.verbose, + )) + + # Stage 3: Write filtered images to disk + pipeline.add_stage(ImageWriterStage( + output_dir=args.output_dataset_dir, + remove_image_data=True, + verbose=args.verbose, + )) + + return pipeline + + +def main(args: argparse.Namespace) -> None: + """Main execution function for image curation pipeline.""" + + ray_client = RayClient() + ray_client.start() + + print("Starting image curation pipeline...") + print(f"Input parquet file: {args.input_parquet}") + print(f"Input webdataset directory: {args.input_wds_dataset_dir}") + print(f"Output webdataset directory: {args.output_dataset_dir}") + print(f"Model directory: {args.model_dir}") + print(f"Tar files per partition: {args.tar_files_per_partition}") + print(f"Task batch size: {args.batch_size}") + print("\n" + "=" * 50 + "\n") + + # Step 1: Download and prepare webdataset from parquet file + if not args.skip_download: + print("Step 1: Downloading webdataset from parquet file...") + download_start = time.time() + + # Create output directory if it doesn't exist + os.makedirs(args.input_wds_dataset_dir, exist_ok=True) + + # Download webdataset using helper function + download_webdataset( + parquet_path=args.input_parquet, + output_dir=args.input_wds_dataset_dir, + num_processes=args.download_processes, + entries_per_tar=args.entries_per_tar, + ) + + download_time = time.time() - download_start + print(f"✓ Dataset download completed in {download_time:.2f} seconds") + print(f"✓ Webdataset saved to: {args.input_wds_dataset_dir}") + print("\n" + "=" * 50 + "\n") + else: + print("Step 1: Skipping download (using existing dataset)") + print(f"Using existing dataset at: {args.input_wds_dataset_dir}") + print("\n" + "=" * 50 + "\n") + + # Step 2: Create and run curation pipelines + # Step 2.1: Create image embedding pipeline + print("Step 2.1: Running image embedding pipeline...") + start_time = time.time() + pipeline = create_image_embedding_pipeline(args) + print(pipeline.describe()) + print("\n" + "=" * 50 + "\n") + pipeline.run() + + # Step 2.2: Create image deduplication pipeline (pairwise executor is XennaExecutor by default) + print("Step 2.2: Running image deduplication pipeline...") + start_time = time.time() + pipeline = create_embedding_deduplication_workflow(args) + print("\n" + "=" * 50 + "\n") + pipeline.run() + + # Step 2.3: Create image deduplication pipeline + print("Step 2.3: Running image deduplication pipeline...") + start_time = time.time() + pipeline = create_image_deduplication_pipeline(args) + print(pipeline.describe()) + print("\n" + "=" * 50 + "\n") + pipeline.run() + + end_time = time.time() + + # Calculate and print execution time + execution_time = end_time - start_time + hours, remainder = divmod(execution_time, 3600) + minutes, seconds = divmod(remainder, 60) + + print("\nImage 
curation pipeline completed!") + print(f"Total execution time: {int(hours):02d}:{int(minutes):02d}:{seconds:.2f}") + print(f"Total execution time: {execution_time:.2f} seconds") + print(f"\nProcessed dataset available at: {args.output_dataset_dir}") + + ray_client.stop() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Image curation pipeline with embedding generation and quality scoring" + ) + + # Dataset arguments + parser.add_argument( + "--input-parquet", + type=str, + required=False, + default=None, + help="Path to input parquet file containing image URLs and metadata" + ) + parser.add_argument( + "--input-wds-dataset-dir", + type=str, + required=True, + help="Directory to save the downloaded webdataset" + ) + parser.add_argument( + "--output-dataset-dir", + type=str, + required=True, + help="Directory to save the resulting webdataset" + ) + parser.add_argument( + "--embeddings-dir", + type=str, + required=True, + help="Directory to save the embeddings" + ) + parser.add_argument( + "--removal-parquets-dir", + type=str, + required=True, + help="Directory to save the remove parquets" + ) + parser.add_argument( + "--download-processes", + type=int, + default=8, + help="Number of parallel processes for downloading images" + ) + parser.add_argument( + "--entries-per-tar", + type=int, + default=1000, + help="Number of entries per tar shard during download" + ) + parser.add_argument( + "--skip-download", + action="store_true", + default=False, + help="Skip dataset download and use existing webdataset" + ) + + # Image reader arguments + parser.add_argument( + "--tar-files-per-partition", + type=int, + default=1, + help="Number of tar files to process per partition (controls parallelism) for FilePartitioningStage" + ) + parser.add_argument( + "--batch-size", + type=int, + default=100, + help="Number of images per ImageBatch for the reader stage" + ) + + # General arguments + parser.add_argument( + "--model-dir", + type=str, + required=True, + help="Path to model directory containing all model weights" + ) + parser.add_argument( + "--verbose", + action="store_true", + default=False, + help="Enable verbose logging for all stages" + ) + + # Embedding stage arguments + parser.add_argument( + "--embedding-batch-size", + type=int, + default=32, + help="Batch size for embedding generation" + ) + parser.add_argument( + "--embedding-gpus-per-worker", + type=float, + default=0.25, + help="GPU allocation per worker for embedding generation" + ) + + args = parser.parse_args() + main(args) \ No newline at end of file From a812e2cad5f740126343b673f6e3c0b0b2158f5f Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Sun, 30 Nov 2025 00:52:08 +0000 Subject: [PATCH 02/14] working job submit version --- nemo_curator_semantic_dedup/Dockerfile | 64 +++++++ nemo_curator_semantic_dedup/README.md | 168 +++--------------- .../image_dedup_example.py | 119 ++++++++++--- nemo_curator_semantic_dedup/job.yaml | 66 +++++++ 4 files changed, 248 insertions(+), 169 deletions(-) create mode 100644 nemo_curator_semantic_dedup/Dockerfile create mode 100644 nemo_curator_semantic_dedup/job.yaml diff --git a/nemo_curator_semantic_dedup/Dockerfile b/nemo_curator_semantic_dedup/Dockerfile new file mode 100644 index 0000000..200ef69 --- /dev/null +++ b/nemo_curator_semantic_dedup/Dockerfile @@ -0,0 +1,64 @@ +# NeMo Curator Image Deduplication Example +# Uses CUDA 12.8 for GPU-accelerated processing +FROM anyscale/ray:2.52.0-slim-py312-cu128 + +# Install system dependencies +RUN sudo apt-get update && \ + 
sudo apt-get install -y --no-install-recommends \ + build-essential \ + unzip \ + wget \ + curl && \ + sudo apt-get clean && \ + sudo rm -rf /var/lib/apt/lists/* + +# Install uv for fast package management +RUN curl -LsSf https://astral.sh/uv/install.sh | sh + +# Install Python dependencies +# NeMo Curator with CUDA 12 support for image processing +RUN uv pip install --system "nemo-curator[image_cuda12]" + +# Additional dependencies for image downloading and processing +RUN uv pip install --system \ + loguru \ + Pillow \ + aiohttp \ + tqdm \ + pandas \ + pyarrow \ + huggingface_hub \ + transformers + +# Pre-download CLIP model weights to avoid runtime downloads +# This makes job startup faster and more reliable +RUN python -c "\ +from huggingface_hub import snapshot_download; \ +import os; \ +model_dir = '/home/ray/model_weights/openai/clip-vit-large-patch14'; \ +os.makedirs(model_dir, exist_ok=True); \ +snapshot_download('openai/clip-vit-large-patch14', local_dir=model_dir)" + +# Set environment variable for model directory +ENV MODEL_DIR=/home/ray/model_weights + +# Download and prepare the example dataset from HuggingFace +# Downloads MS COCO parquet, deduplicates URLs, and truncates to 100k rows +RUN mkdir -p /home/ray/data && \ + curl -L https://huggingface.co/datasets/ChristophSchuhmann/MS_COCO_2017_URL_TEXT/resolve/main/mscoco.parquet \ + -o /home/ray/data/mscoco.parquet && \ + python -c "\ +import pandas as pd; \ +df = pd.read_parquet('/home/ray/data/mscoco.parquet'); \ +deduped = df[~df['URL'].duplicated()]; \ +truncated = deduped[:100000]; \ +truncated.to_parquet('/home/ray/data/truncated_100k_mscoco.parquet'); \ +print(f'Created truncated dataset with {len(truncated)} rows')" && \ + rm /home/ray/data/mscoco.parquet + +# Create output directories +RUN mkdir -p /home/ray/data/webdataset \ + /home/ray/data/results \ + /home/ray/data/embeddings \ + /home/ray/data/removal_ids + diff --git a/nemo_curator_semantic_dedup/README.md b/nemo_curator_semantic_dedup/README.md index 2cda472..84b872e 100644 --- a/nemo_curator_semantic_dedup/README.md +++ b/nemo_curator_semantic_dedup/README.md @@ -1,28 +1,8 @@ -# Semantic Deduplication with NeMo Curator +# Image Semantic Deduplication with NeMo Curator -This example demonstrates how to perform GPU-accelerated semantic deduplication on large text datasets using [NVIDIA NeMo Curator](https://github.com/NVIDIA-NeMo/Curator) on Anyscale. +This example uses [NVIDIA NeMo Curator](https://github.com/NVIDIA-NeMo/Curator) to perform GPU-accelerated semantic deduplication on image datasets. -## What is Semantic Deduplication? - -Unlike exact or fuzzy deduplication that matches text patterns, **semantic deduplication** identifies documents that are conceptually similar even if they use different words. This is achieved by: - -1. **Computing embeddings**: Converting text into dense vector representations using neural network models -2. **Clustering**: Grouping similar embeddings using GPU-accelerated k-means -3. 
**Similarity matching**: Identifying near-duplicates within clusters based on cosine similarity - -This approach is particularly effective for: -- Removing paraphrased content -- Identifying translated duplicates -- Cleaning datasets with rephrased information -- Improving LLM training data quality - -## Performance - -NeMo Curator leverages NVIDIA RAPIDS™ libraries (cuDF, cuML, cuGraph) for GPU acceleration: - -- **16× faster** fuzzy deduplication compared to CPU-based alternatives -- **40% lower** total cost of ownership (TCO) -- **Near-linear scaling** across multiple GPUs +NeMo Curator is a scalable data curation library that leverages NVIDIA RAPIDS™ for GPU acceleration. This example downloads images from a parquet file, generates CLIP embeddings, and removes near-duplicate images based on semantic similarity. ## Install the Anyscale CLI @@ -31,145 +11,47 @@ pip install -U anyscale anyscale login ``` -## Submit the Job +## Run the job -Clone the example from GitHub: +Clone the example from GitHub. ```bash git clone https://github.com/anyscale/examples.git cd examples/nemo_curator_semantic_dedup ``` -Submit the job: +Submit the job. ```bash anyscale job submit -f job.yaml ``` -### Using Your Own Data - -To process your own dataset, set the `INPUT_DATA_PATH` environment variable: +## Understanding the example -```bash -anyscale job submit -f job.yaml \ - --env INPUT_DATA_PATH=s3://your-bucket/your-data/ \ - --env OUTPUT_DATA_PATH=s3://your-bucket/output/ -``` +- The [Dockerfile](./Dockerfile) builds a custom image with NeMo Curator CUDA dependencies (`nemo-curator[image_cuda12]`), downloads the MS COCO sample dataset from HuggingFace, and pre-downloads the CLIP model weights to speed up job startup. -Your input data should be in Parquet or JSONL format with at least two columns: -- `id`: Unique document identifier -- `text`: The text content to deduplicate +- The entrypoint defined in [job.yaml](./job.yaml) runs `image_dedup_example.py` which executes a 3-step pipeline: + 1. **Download WebDataset**: Fetches images from URLs in the parquet file and saves them as WebDataset tar files to `/mnt/cluster_storage/nemo_curator/webdataset` + 2. **Generate CLIP embeddings**: Uses OpenAI's CLIP ViT-L/14 model to create 768-dimensional embeddings for each image + 3. **Semantic deduplication**: Clusters embeddings with k-means and removes near-duplicates based on cosine similarity -## Configuration Options +- The `/mnt/cluster_storage/` directory is an ephemeral shared filesystem attached to the cluster for the duration of the job. All outputs (embeddings, duplicate IDs, and deduplicated images) are saved here. 
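+- To make the deduplication step concrete, here is a minimal sketch of the same idea — cluster L2-normalized embeddings, then flag items within a cluster whose cosine similarity to an earlier item exceeds a threshold. This is an illustrative approximation using NumPy and scikit-learn, not NeMo Curator's GPU implementation; the random `embeddings` array, the cluster count, and the `0.9` threshold are assumptions for the example (the workflow's `eps=0.01` plays a similar role, roughly corresponding to keeping only pairs with similarity above `1 - eps`).
+
+  ```python
+  import numpy as np
+  from sklearn.cluster import KMeans
+
+  # Toy stand-in for CLIP image embeddings (N x D); the real pipeline produces these on GPU.
+  rng = np.random.default_rng(0)
+  embeddings = rng.normal(size=(1000, 768)).astype(np.float32)
+
+  # L2-normalize so that dot products equal cosine similarity.
+  embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)
+
+  # Step 1: cluster embeddings (assumed cluster count for this toy example).
+  n_clusters = 10
+  labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(embeddings)
+
+  # Step 2: within each cluster, flag items too similar to an earlier item.
+  threshold = 0.9  # assumption; tune for your data
+  duplicate_ids = set()
+  for c in range(n_clusters):
+      idx = np.where(labels == c)[0]
+      sims = embeddings[idx] @ embeddings[idx].T  # pairwise cosine similarities
+      for i in range(1, len(idx)):
+          if sims[i, :i].max() >= threshold:
+              duplicate_ids.add(int(idx[i]))
+
+  print(f"Flagged {len(duplicate_ids)} of {len(embeddings)} items as near-duplicates")
+  ```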
-You can customize the pipeline behavior via environment variables: +- To use your own data, prepare a parquet file with `URL` and `TEXT` columns, upload it to cluster storage, and override the `INPUT_PARQUET` environment variable: + ```bash + anyscale job submit -f job.yaml \ + --env INPUT_PARQUET=/mnt/cluster_storage/your_data.parquet \ + --env OUTPUT_DIR=/mnt/cluster_storage/your_results + ``` -| Variable | Default | Description | -|----------|---------|-------------| -| `INPUT_DATA_PATH` | `/mnt/cluster_storage/semantic_dedup/input` | Path to input dataset | -| `OUTPUT_DATA_PATH` | `/mnt/cluster_storage/semantic_dedup/output` | Path for output data | -| `EMBEDDING_MODEL` | `sentence-transformers/all-MiniLM-L6-v2` | HuggingFace model for embeddings | -| `EMBEDDING_BATCH_SIZE` | `64` | Batch size per GPU for embedding computation | -| `NUM_CLUSTERS` | `1000` | Number of k-means clusters | -| `SIMILARITY_THRESHOLD` | `0.8` | Cosine similarity threshold (0.0-1.0) | +- The [helper.py](./helper.py) module provides utilities for downloading images in parallel and converting them to [WebDataset](https://github.com/webdataset/webdataset) format, which is optimized for streaming large-scale image datasets. -### Embedding Model Options +## View the job -| Model | Quality | Speed | Use Case | -|-------|---------|-------|----------| -| `sentence-transformers/all-MiniLM-L6-v2` | Good | Fast | Quick experiments, large datasets | -| `intfloat/e5-large-v2` | Better | Medium | Production workloads | -| `BAAI/bge-large-en-v1.5` | Best | Slower | High-quality deduplication | - -### Tuning the Similarity Threshold - -- **Higher threshold (e.g., 0.9)**: Stricter matching, fewer duplicates removed -- **Lower threshold (e.g., 0.7)**: Looser matching, more duplicates removed - -Start with 0.8 and adjust based on your quality requirements. - -## Understanding the Example - -### Pipeline Architecture - -``` -┌─────────────────┐ -│ Input Data │ Parquet/JSONL files -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Embedding │ GPU-accelerated transformer inference -│ Creation │ (sentence-transformers, E5, BGE, etc.) -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Clustering │ GPU-accelerated k-means (cuML) -│ (k-means) │ Groups similar embeddings -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Duplicate │ Pairwise similarity within clusters -│ Extraction │ Identifies semantic duplicates -└────────┬────────┘ - │ - ▼ -┌─────────────────┐ -│ Deduplicated │ Original data minus duplicates -│ Output │ -└─────────────────┘ -``` +View the job in the [jobs tab](https://console.anyscale.com/jobs) of the Anyscale console. -### Key Components - -- **EmbeddingCreator**: Computes dense vector embeddings using pre-trained models -- **ClusteringModel**: GPU-accelerated k-means clustering with cuML -- **SemanticClusterLevelDedup**: Finds duplicates within clusters using cosine similarity - -### Scaling Considerations - -- **Number of clusters**: Should be roughly √(n_documents) for balanced cluster sizes -- **Memory**: Each GPU should have enough memory for the embedding model (~4GB for MiniLM, ~8GB for larger models) -- **Batch size**: Increase for faster processing, decrease if running out of GPU memory - -## Output - -The pipeline produces: - -1. **Deduplicated dataset**: Parquet files in `{OUTPUT_DATA_PATH}/{timestamp}/deduplicated/` -2. 
**Cache files**: Intermediate embeddings and clusters for debugging - -Example output log: - -``` -============================================================ -Semantic Deduplication Complete! -============================================================ -Original documents: 1,000,000 -Duplicates removed: 127,543 -Final documents: 872,457 -Reduction: 12.75% -Output saved to: /mnt/cluster_storage/semantic_dedup/output/20250115T143022Z/deduplicated -============================================================ -``` - -## Monitoring - -View your job progress in the [Anyscale Console](https://console.anyscale.com/jobs). The Dask dashboard link will be printed in the logs for detailed task monitoring. - -## Cost Optimization Tips - -1. **Use spot instances**: The job.yaml is configured with `market_type: PREFER_SPOT` for cost savings -2. **Start small**: Test with a subset of your data before running on the full dataset -3. **Choose the right GPU**: A10G instances offer a good balance of cost and performance -4. **Tune batch size**: Larger batches = better GPU utilization = faster processing - -## Learn More +## Learn more - [NeMo Curator Documentation](https://docs.nvidia.com/nemo/curator/latest/) -- [Semantic Deduplication Guide](https://docs.nvidia.com/nemo/curator/latest/curate-text/process-data/deduplication/index.html) -- [NeMo Curator GitHub](https://github.com/NVIDIA-NeMo/Curator) -- [Anyscale Documentation](https://docs.anyscale.com/) - +- [NeMo Curator Image Tutorials](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/image/getting-started) +- [Anyscale Jobs Documentation](https://docs.anyscale.com/platform/jobs/) diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py index 474f465..077b94e 100644 --- a/nemo_curator_semantic_dedup/image_dedup_example.py +++ b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -16,6 +16,7 @@ import os import time +import ray from helper import download_webdataset from nemo_curator.core.client import RayClient @@ -199,9 +200,44 @@ def main(args: argparse.Namespace) -> None: ray_client.stop() +def get_env_or_arg(env_var: str, arg_value, default=None): + """Get value from environment variable or command-line argument.""" + env_value = os.environ.get(env_var) + if env_value is not None: + return env_value + if arg_value is not None: + return arg_value + return default + + +def get_env_bool(env_var: str, arg_value: bool, default: bool = False) -> bool: + """Get boolean value from environment variable or command-line argument.""" + env_value = os.environ.get(env_var) + if env_value is not None: + return env_value.lower() in ("true", "1", "yes") + return arg_value if arg_value is not None else default + + +def get_env_int(env_var: str, arg_value: int, default: int) -> int: + """Get integer value from environment variable or command-line argument.""" + env_value = os.environ.get(env_var) + if env_value is not None: + return int(env_value) + return arg_value if arg_value is not None else default + + +def get_env_float(env_var: str, arg_value: float, default: float) -> float: + """Get float value from environment variable or command-line argument.""" + env_value = os.environ.get(env_var) + if env_value is not None: + return float(env_value) + return arg_value if arg_value is not None else default + + if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Image curation pipeline with embedding generation and quality scoring" + description="Image curation pipeline with 
embedding generation and quality scoring. " + "Arguments can also be set via environment variables (see job.yaml)." ) # Dataset arguments @@ -210,71 +246,76 @@ def main(args: argparse.Namespace) -> None: type=str, required=False, default=None, - help="Path to input parquet file containing image URLs and metadata" + help="Path to input parquet file containing image URLs and metadata (env: INPUT_PARQUET)" ) parser.add_argument( "--input-wds-dataset-dir", type=str, - required=True, - help="Directory to save the downloaded webdataset" + required=False, + default=None, + help="Directory to save the downloaded webdataset (env: INPUT_WDS_DIR)" ) parser.add_argument( "--output-dataset-dir", type=str, - required=True, - help="Directory to save the resulting webdataset" + required=False, + default=None, + help="Directory to save the resulting webdataset (env: OUTPUT_DIR)" ) parser.add_argument( "--embeddings-dir", type=str, - required=True, - help="Directory to save the embeddings" + required=False, + default=None, + help="Directory to save the embeddings (env: EMBEDDINGS_DIR)" ) parser.add_argument( "--removal-parquets-dir", type=str, - required=True, - help="Directory to save the remove parquets" + required=False, + default=None, + help="Directory to save the remove parquets (env: REMOVAL_DIR)" ) parser.add_argument( "--download-processes", type=int, - default=8, - help="Number of parallel processes for downloading images" + default=None, + help="Number of parallel processes for downloading images (env: DOWNLOAD_PROCESSES)" ) parser.add_argument( "--entries-per-tar", type=int, - default=1000, - help="Number of entries per tar shard during download" + default=None, + help="Number of entries per tar shard during download (env: ENTRIES_PER_TAR)" ) parser.add_argument( "--skip-download", action="store_true", - default=False, - help="Skip dataset download and use existing webdataset" + default=None, + help="Skip dataset download and use existing webdataset (env: SKIP_DOWNLOAD)" ) # Image reader arguments parser.add_argument( "--tar-files-per-partition", type=int, - default=1, - help="Number of tar files to process per partition (controls parallelism) for FilePartitioningStage" + default=None, + help="Number of tar files to process per partition (env: TAR_FILES_PER_PARTITION)" ) parser.add_argument( "--batch-size", type=int, - default=100, - help="Number of images per ImageBatch for the reader stage" + default=None, + help="Number of images per ImageBatch for the reader stage (env: BATCH_SIZE)" ) # General arguments parser.add_argument( "--model-dir", type=str, - required=True, - help="Path to model directory containing all model weights" + required=False, + default=None, + help="Path to model directory containing all model weights (env: MODEL_DIR)" ) parser.add_argument( "--verbose", @@ -287,15 +328,41 @@ def main(args: argparse.Namespace) -> None: parser.add_argument( "--embedding-batch-size", type=int, - default=32, - help="Batch size for embedding generation" + default=None, + help="Batch size for embedding generation (env: EMBEDDING_BATCH_SIZE)" ) parser.add_argument( "--embedding-gpus-per-worker", type=float, - default=0.25, + default=None, help="GPU allocation per worker for embedding generation" ) - args = parser.parse_args() + cli_args = parser.parse_args() + + # Resolve arguments from environment variables or command-line args + args = argparse.Namespace( + input_parquet=get_env_or_arg("INPUT_PARQUET", cli_args.input_parquet), + input_wds_dataset_dir=get_env_or_arg("INPUT_WDS_DIR", 
cli_args.input_wds_dataset_dir), + output_dataset_dir=get_env_or_arg("OUTPUT_DIR", cli_args.output_dataset_dir), + embeddings_dir=get_env_or_arg("EMBEDDINGS_DIR", cli_args.embeddings_dir), + removal_parquets_dir=get_env_or_arg("REMOVAL_DIR", cli_args.removal_parquets_dir), + model_dir=get_env_or_arg("MODEL_DIR", cli_args.model_dir, "/home/ray/model_weights"), + download_processes=get_env_int("DOWNLOAD_PROCESSES", cli_args.download_processes, 8), + entries_per_tar=get_env_int("ENTRIES_PER_TAR", cli_args.entries_per_tar, 1000), + skip_download=get_env_bool("SKIP_DOWNLOAD", cli_args.skip_download, False), + tar_files_per_partition=get_env_int("TAR_FILES_PER_PARTITION", cli_args.tar_files_per_partition, 1), + batch_size=get_env_int("BATCH_SIZE", cli_args.batch_size, 100), + embedding_batch_size=get_env_int("EMBEDDING_BATCH_SIZE", cli_args.embedding_batch_size, 32), + embedding_gpus_per_worker=get_env_float("EMBEDDING_GPUS_PER_WORKER", cli_args.embedding_gpus_per_worker, 0.25), + verbose=cli_args.verbose, + ) + + # Validate required arguments + required_args = ["input_wds_dataset_dir", "output_dataset_dir", "embeddings_dir", "removal_parquets_dir"] + missing = [arg for arg in required_args if getattr(args, arg) is None] + if missing: + parser.error(f"Missing required arguments: {', '.join(missing)}. " + "Set them via command-line or environment variables.") + main(args) \ No newline at end of file diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml new file mode 100644 index 0000000..78fbd5c --- /dev/null +++ b/nemo_curator_semantic_dedup/job.yaml @@ -0,0 +1,66 @@ +# NeMo Curator Image Semantic Deduplication Job +# View the docs: https://docs.anyscale.com/reference/job-api#jobconfig + +name: nemo-curator-image-dedup + +# Build custom image with NeMo Curator CUDA dependencies +containerfile: ./Dockerfile + +# Use named compute config with L40S GPU +compute_config: "nemo-compute-config" + +# Working directory - upload only the example code, not data +working_dir: . 
+ +# Environment variables for job configuration +# Override these when submitting to use your own data paths +env_vars: + # Input parquet file with image URLs (TEXT and URL columns) + # This file is copied into the Docker image during build + INPUT_PARQUET: "/home/ray/data/truncated_100k_mscoco.parquet" + + # Directory for WebDataset tar files (created from parquet) + # Use /mnt/cluster_storage for persistence, or /home/ray/data for ephemeral + INPUT_WDS_DIR: "/mnt/cluster_storage/nemo_curator/webdataset" + + # Output directory for deduplicated images + OUTPUT_DIR: "/mnt/cluster_storage/nemo_curator/results" + + # Directory to store CLIP embeddings + EMBEDDINGS_DIR: "/mnt/cluster_storage/nemo_curator/embeddings" + + # Directory for duplicate removal parquets + REMOVAL_DIR: "/mnt/cluster_storage/nemo_curator/removal_ids" + + # Model weights directory (pre-downloaded in Docker image) + MODEL_DIR: "/home/ray/model_weights" + + # Processing settings + BATCH_SIZE: "32" + EMBEDDING_BATCH_SIZE: "32" + TAR_FILES_PER_PARTITION: "10" + DOWNLOAD_PROCESSES: "8" + ENTRIES_PER_TAR: "1000" + + # Set to "true" to skip downloading (use existing WebDataset) + # WebDataset already exists from previous run + SKIP_DOWNLOAD: "false" + + # Ray memory settings to avoid OOM + RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION: "0.5" + + # Increase Ray API server limit for cosmos_xenna monitoring + RAY_MAX_LIMIT_FROM_API_SERVER: "100000" + +# When empty, uses the default Anyscale Cloud +cloud: + +# The entrypoint script +entrypoint: python image_dedup_example.py + +# Don't retry on failure - easier to debug +max_retries: 0 + +# Kill after 4 hours to control costs (adjust based on dataset size) +timeout_s: 14400 + From a5cd5b37d1c77b4fc76fd11f3c13119fe7087ad2 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Sun, 30 Nov 2025 15:34:16 -0800 Subject: [PATCH 03/14] remove unnecessary file --- .../__pycache__/helper.cpython-312.pyc | Bin 15239 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 nemo_curator_semantic_dedup/__pycache__/helper.cpython-312.pyc diff --git a/nemo_curator_semantic_dedup/__pycache__/helper.cpython-312.pyc b/nemo_curator_semantic_dedup/__pycache__/helper.cpython-312.pyc deleted file mode 100644 index b0e10dfe70c3ed15c17633f6b4060378992a8e3a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15239 zcmbt*YjhJ=nqZYwl1gvM?>B{CSb%NJBY+_R;|CbKZ4yWb4vCI%l|e?9oGKZxB6mBT zKHK&zXAslb!I?A@XFKOa*|6K*=V51GBq4Ly&g@w!5|O6j>|uBEW7so4oQ63`(|gYD z_uW#Jgi&w0cgyy@U)_7Z?>_3g-~ArfpV{pe3Z9=VZ1VrAgQEThU(`ph0Q7@PG(}yb zc#5Y3)D%5N(+KMVx-lKe^<#RHGh+sCX$2sPL+!%0Am5h~;I#Zx@s%)%`z-*v=s(h@RrUbiagLdXX#Z={3 zC8@Ins-~*PssXl+)$mn(HE;WgVXT&~;q5| zD*$TY+jtLO2~g{-&a?SGeg+RMTYbS`DC`USLqTZ--c8d!G3@sR0A!AZLYUi6oI2L; z?LE@pJ2W_Q7z&Jm(Al$s2#{fL%6C@4w&7R#Dafs)VD!wSFdl|i3wb@{3y+_>e+N1o z&wYKu1IQz9P!#>(?{pM3MA5(vVc-en;pwQ3rz7z4A=Kj2N4=b+s5!lvic*t>bd+i; zN{04hwn*JXc4AdMGWhU7?o(hEtPvhJ9 zHi{CrKr6dim(M@Ufo}je6RM0tbsV`Ge$;;7fcL^)X=+?YQ^%>Oo~O_E0sXeHQ>8lT zQlWg#8>0HRftRB+q^9B{ew(^(RA)@M`2X5}R3FuyhqadI%l|rDuJ$xpr9%0fr&X%j zEOp8(Y3g!{p}uG2=BN(FY{8LVXH-fwHB0TLF5iN`Yw))Se?v`F7e&^Ifv-c>aW`dv z@m++KItBa!y}five~P+DH&J0|qdb!(DwNN|Why*bp+fn5Lf1@*?Q^iAxwUWjo*5Lb zJsUMVzRss$mFEmUG+h2b$uH6uDP@!}r$70p`ir#3I&vQw@jiax4`3ICVF%()Gll(A zLTDx&F~8Ik4hvJ$VJ=esGB@D!2LzrAhd4fTF&GH>crN1T=8l{=ag38-@6JfvBV9Ka zVR`}qK;V1|uEb3!h!Y`^^VnsB=nI||Wc^toESr4((7AAUTDJEF{6a8%Lg`pGD)eL% z68DY{$&4uYc+ms{l?}XbX6CGHRWSNRF(d*VX*M|S56QM11*O;^1q5N*W0XyjAW0xw zWc`d7kWHcx7X5+*Xvz>}RurUZ5IurS3leTxjy#VJcAg7O37w*Ewv!hod^3S?r*O$P 
zH60M7&Y&=>1<`gwb!n`w!GtCnKIk@ zliT|597=B+jlGtsJeICJmN;=bTUvYVz|{lqo&CJDJ-gBKe*L@kzu1te-S(}4s(O}& zW~s_!YeuS~e(BXed1-+)_%2pnNF27W^9c~TjN?aRpxnEM;WX?9e#Z{wsXl8 zn@w=LR>oJJPq_D`nf+O-bFCH$eRr>R+jnbZK;I=Qk9`NDg5PB2vEM$lQ0}UXt10Pf zdhcYWWpA=&Z^GIAYf6VxcUCO9|LC<|n;uF35mxobrTshUKi$%|LH7|&_vzS=bb5phoBAwOS7*NMqWel6x7{X$t4!E(LsNgJ;Zt2x|7OFdElBgz%?2## zWB`8bWHQ5a$jMX%ESWo*ph$TpbLWxzj`^f^^5jdSw=RnC%xxPFgi9b z`{0~)tGM}GJ>k?b!nr&`QKw;5=FB9u%vqyW-dqhRb2fl11hV6K{x%ae^VYYS>$*8d z)E>1(9lY)5`Z*_@U#0mTR4D3H`=F^PTdYO?oQj&FcHVx&@nKFg$YWkU1}9RbO0(E` z7G-%S@AA`sOl#+vstcgLSnSl)&bx4}M!8Of@;RI+ZgstPQFHG6+WyF;jeMUz`qD4*wP-6&H>O;h1cwM7(o06Bs$$5icivP-Sc=X`}) zJ6HC5%&zjW*_9W}u1uv9E&txxm4DCdG+K|%ZihPR$(<^c&(#^Lqxhb=R{S1wt$J*( zl?8Lv)+}22y>qSnp1EqY3g;T5FP~!2;&^Olz!%=F&UkW<3gz=?UZz!rwcOMjqooopBr~F~rK{79>Cf;))DeTdSsHDo~@lY@b zmJ%pZ%!&RNPssY`pBo0EJ9&>@+=wkq}ChMDk0`yL$erSKy|$D|YrM%WH8 z`Cgq7!kz|M7m{RV+7~`2Gm{~IP()=}MA-@YUGK48ZL~S9@pw+TGYi3)DFGBRL1z77 z0W`yll5F%%PYXewXkB7G(lDL{$^s~juwFPUnF&D)+2WV{LC`ycotc`JWZg6`oBMoW z-+%~9AzQ*BuR75h=m%|5*#gS;^i0^x`$gGIio7rr*#-mjVqdt3vUy4f`*p01g7e9hnh8wWn~K)!OFe z=ih(j-B(gAyS}J>YH?_xXYo>uj_t@+R4=t&J-=``TT&hqu5MoFy;t6_B*jLTdzQzS zcPC1=KuK*weD}>A$h-%NG+BoiY7GBURV*&VjcM ztn656SqZP$(zQ>=*j27+dEgh;82hF@>ngowy=q-La^uV|CqJ0HS&`~E7_+8bJ%G8f zd3iAIdv|!HI@!E0#qG~g5gr~67^F|f2Zt?3{#P0Dq_*QO!L}t8khYyYc*8qVaUiw_zipo`2E~@^wGEQF+2%#+y9bY z2d?~aPv@|O{-}!{+F<->yB^_PhYUk)`cJy)p+@5;`xt}|SfS#!g&t~QZd+>*ZfP2N zM*pcoKz{Yhi z!)Z-y6~mkntRg0AgdZJx8*8Wu-BsOXH$%;t=AgDB0N8DPlfr?o!T1%;- zpk?zqg7BKBiJBk&pigyNKWCgXT%=I%d(3wjRdu{D9*yJyKVZw8HLO`Zlbh9=e9k+g z7Wm$-sP%FGCGq=Ne`uWry?74tyx-5`9ytN37%N=zOJQ&5e8e;iK0NT~MQS+ky`e{p zn-D`&Ty76@sCtMfdm`53(*b{&^9RBC7qN0=b4y%kf{WNSRKYiX4xESPoFot=P!UmP zMBK-e!`>e}o5K`CP!6I7*;8hCK^m8tnII^r`ta0rq&n|t6v4-KLExa21Ko%dk!w8_ z;xmZHPWjFYP%4SfU4VtT+rcVlO7>c@0EhEveQOUA{8t2s( zMAq>WiUfTHTN{;S1WgwL95OR86C4MfbZQ#aKoMW7S8IwWX^2)q7DmLli=A zXD&k13y5=pMoIh7Mj1>fVPlr%6L*=-a2i`|S2iqeSgK2zTjvk0GUkQDZ}u+fes(yv zH&ORg!m>Nf?0Lvi1N12US2Ky{Ur4|BQsVS0sTaKo-*js9)pWo3P_J_s|GHoN=ciXr z-J%z&V~1j|E_E&aC|TQ)DC_*3-InMV*Yw~M{Dy*t(4)b!F!Xc%(!kr+*xd5aiZju9 z@RmJc8A&svI3CB6{jRw+-n{CpNLOyXS(A45Eih1FwqHJxwYjbgE)K@{rGb>KX%)1w ziJxuBx|_4^s)uHS!}5?an62Qdv}}4SVhVKOP}kj zNcigYUC>vN@=5L3&3E%*4g`t}Qp_K_#c8Ob9NHUlBXH!YsgPJ3)faOp0?i_=Rhr=X z(;ufi#>m!Vq5uL?;8b&r6NT}R$a5Ip@C7;I>F2=I1%tZ7qnCAlUbcor|5<;~7x2Q1 zU3(!GuG1$-V2)z!Kms~oAqJ(1i#gaVOq~&U9weu1*FcaDQyw#zxFn_k6fFvbhXBf*S;2+z%Eq8@5eH}q1$fB4;~k;@i;Sb@-G1DCRfH( zn>5uXOq;T8U75E1$+rD~YD=;$q^>4us!5ny?`_(a+0>of)ScPXlibvk+SCh;tVy%y?fm8WWpV!(zEa8 zVA3;?YB>y_-26G)m^C|Ndo!giN%%Fl6t){kdU{hWeScs(I87|A3X5o;8V01Xh%qPM zi%Iv_XY%Y(R3g*GuvBIN8&C`BL^X4z(7c(nz)03TBn%G0q5QQ32={0P4unLU>k!~o zmCS0-QG-{L>A+H&agoPF^Co}Y0Tg3;mIg*?Lq<_VCTcRsCZIPF&6D#_aJ~x=I`*9j z2puCL`gKKg>5AQucr2>eLp%|Cm(8fXf)CU;D?4)$SKtpN=z16thcHn@Aw-`q_yRM6 z_ya6MTiRn3#}OuMw*-(X8>jsN*{X<1#VweVx99{b2o@{KnVYbu(i%F4p^5JUtvm21 z1t1}OsR^#H%c`%9UL9Q?%rx&wHt$K5?OiahRTNkW%*_~@{ifwz63Guil zO2)4N_`~dJP%rew;){GCqiO6{?1(3f1~IoAnuQ9CrXBlvYa5tJQ!9X(im+eQkhg4$ z-LuHkx?xo9Ltr_RhmABK!yP?7I+7Ph1w7YrA4LsVF-CuoM`a#mulO`38t098EaGu2 zIf2P@m^{v0d6p7KvGy1wDs!PY6BXtnj#R?D=djQZ$rIex_Y@v6BXA3x78%f?PM53-IF=h<{(hNU2@6Vj2 zAg0U`?@dna2fskM(kH0%P{UhpSXF1K>W1|3Bf|G<*kq zXJnqwj^2# zJ0ExypSiNAa}urSbx%}_oPx(`lUf;dMFCsIg&L1mQvw*dwTJR8TDdD)R)}$Vj6@?D z9)1eix2x?ZJ5(s2=WiCwl}F2q#be<%f#Zg=fFABb@_^@u-*eP=?L?vOS1#PPe`qelb`mK(#kkRBAQSg7DA#&JS%k*2npuSbqvD^Qa3jY{vPye73o58jrpgAfIld3 zO2kehlMCRX3D(W^`N3F%XzDC@(qIG^A=n5$taC!)5T$dWnE=n75jY8^#A}2k<#rkI zAd!K0b0afg1d2GSTqVxd9~=+N@B-uZoXcY5iVK+AjgP9h_{S$tH-wF#2H~zp9lmrpIbud%a z#?7s-AC$ZxRQM$htjd|GAR&4}lz^2Z?h%*{ep2!oaXCgJ)&id)_!x85ivO`hTL%ba 
zD>EQoN;tMH`B@WX4y=fO23^Q{v@xCnNG&KQ4cD*r0FW@?$$x3a1&#m5&woZ3j#oP2 zWO|!S^AV%MSP@noN5sK#?c9l+-U>G$OYr@e63!+o+0BU#oNAfn_{=y4J||`Zfm!9# zqeC@6{fH^|O(G?uBq)rIPlcPhIZrileTuUsX ztp>Ov-L;K4KS3qNFfw{b!+wnA=e?bZ{&EB<^de;|nEq2|fXl%PKn!`YAL~$$B}SHr zN(k|~iZ37p=NEWoWjmRL64HgsmDYkiA-kX8Yw}^lw*Xak`vTw@A>ItdZz7QplemJF zEMZX`32~a*a<7D6WLM5rtwqU2REvm57UI_sJtr;}mue!aP7J~uvPBsakiL{x`ia3Q zTQv-RLc)U-Ma7ekqVm@x%PQknV)vk`6?kU<643t}{7DDl6amj{qw1NpF7tPpE#Pse zu1zrIU%DF>3|Y21!*WTMTP}aU=G~gR>{hTQOE!FCq-^!8tUck_me^FP9f``$_*~-Yk;MMd#HcS}Ig@6_zpmrn8GLK-J^ua4cPA5Fhf=M*sk*)dQw`m@ z-1A3P?Ip3UH^&n-TT&HU6CDRK_Ja#>;j}I8j(35ZHo4_sqOxbf1{O%&miV4qmUQh< zf~{JuY+b%|^XQ%abmemkw$;k&YZtFxycWG0T^>qw^`t5fL194@)*3&ZD(zliq0hPo zKnH4$@++qoPcOAWjN6VeZqN3W{a<(vCS5Ht%hKT2)y>(mx@=iPc7rFo`2cu&Yf56q zhc2pV-_5lp|fZ|@uTtmE3d{kB)GjdD{kv=4W|x1n>g@X;<>SeWVemxZ&3qlv5WooxCqB9{naHlgLXcdQ!Dx>n5Hy*u$TtXEpnNSn6*|O!0d&QG z35iU1d;lRF*{Pw!T@etR%C`O`w2)8{gFQnwiTH035tHq3luH}{;1sk`&W0u5vSle6 z?^^OE?CtSq<5FBmSa&9vor(nlwubf~3`FS%+W|lqIm}KU0ns!1k92^ z;BWw+8QhBJ^*DPn>!*4Z!Il%~z95KpatLsPeG(i|aIpF~h(y6XB+icpARcgH$9;60 z8t-Oq~cVFdyOW3@@`maMqsnho$wFo%k0>5tDtm9yjk(40t~!Vb(|PSPs+-g-{9F^r_yR31s)KdbN4CKotqXz} z=8g>xE4mcwEV;{4Ii-y_%t1z@rqO{Wx1zNvM>&%>o#EaU$X7@O;XK3)--8}yn{rwx z7_#xq%*2EsKF)Ce7}|-L9Kr=gH5{yEyYtG};@Cet4a>^de>(E|NbITQt@99&>dZ10 z5-y{`Cbqige?V3iBAEDPJgmq;7AepDw?c>VuLrqnXUaV*^jRu5;`H$C@pctni;LKZ z$B2t^BcY%Gbik~kBDQJ1165UWBTTthLD0vL#uu1mAdwk@VIXmJ69+K7d<~ne>t{WdxQdPPRP!Usx73S& z4UO(26OTYl_TNDsabf7=r9o5`h_&#t`0tSrng4%4h|C`qLmaic)7YNO1KUR|{!mA5b9pw&7O!_=2Dne~dPNYI6uHr(>?h*4`Zrki3?_#m z0k0_pp2HA=mPlwELOyUw(4PxhgDF6w<`77loyHgR%#r(;vT+*3CtOB+9*N-p;?0ON z%75tSVb>FntP)!Ky%lwnGj;JsTrULhiDVsPd_NL(7PVg80md$Ra8S$wxTjy zzj4j_EKPT0OPkj82;6g*tr-viI#oc2fGh>2W&*-L Date: Mon, 1 Dec 2025 09:07:24 +0000 Subject: [PATCH 04/14] changed custom compute config --- nemo_curator_semantic_dedup/job.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index 78fbd5c..23b3ab6 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -6,8 +6,12 @@ name: nemo-curator-image-dedup # Build custom image with NeMo Curator CUDA dependencies containerfile: ./Dockerfile -# Use named compute config with L40S GPU -compute_config: "nemo-compute-config" +# Compute configuration with L40S GPU for CUDA-accelerated image processing +compute_config: + head_node: + instance_type: g6.8xlarge # 1x L40S GPU, 32 vCPUs, 128GB RAM (AWS) + # For GCP, use: g2-standard-32 (1x L4 GPU) + worker_nodes: [] # Run entirely on head node # Working directory - upload only the example code, not data working_dir: . 
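
The `env_vars` block in this job spec is only configuration plumbing: Anyscale exports each key into the job's process environment, and the entrypoint script is expected to read the values back at runtime (later patches add exactly this kind of lookup). A minimal sketch of that pattern is below; the `env_or` helper and the flag names are illustrative, not code from this patch series — only the environment variable names and defaults come from the `env_vars` block above.

```python
# Sketch only: read job.yaml env_vars with CLI-flag fallbacks.
# `env_or` is a hypothetical helper; the key names mirror env_vars above.
import argparse
import os


def env_or(name: str, default: str) -> str:
    """Return the environment value for `name`, or `default` when unset/empty."""
    value = os.environ.get(name, "").strip()
    return value or default


parser = argparse.ArgumentParser()
parser.add_argument("--input-parquet",
                    default=env_or("INPUT_PARQUET", "/home/ray/data/truncated_100k_mscoco.parquet"))
parser.add_argument("--output-dir",
                    default=env_or("OUTPUT_DIR", "/mnt/cluster_storage/nemo_curator/results"))
parser.add_argument("--batch-size", type=int, default=int(env_or("BATCH_SIZE", "32")))
parser.add_argument("--skip-download", action=argparse.BooleanOptionalAction,
                    default=env_or("SKIP_DOWNLOAD", "false").lower() == "true")
args = parser.parse_args()
print(f"input={args.input_parquet} output={args.output_dir} batch_size={args.batch_size}")
```
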
@@ -52,9 +56,6 @@ env_vars: # Increase Ray API server limit for cosmos_xenna monitoring RAY_MAX_LIMIT_FROM_API_SERVER: "100000" -# When empty, uses the default Anyscale Cloud -cloud: - # The entrypoint script entrypoint: python image_dedup_example.py From 89019bad6ee49c47db3c3bf68b0e16b25a084940 Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Tue, 9 Dec 2025 11:02:37 +0000 Subject: [PATCH 05/14] working version --- nemo_curator_semantic_dedup/Dockerfile | 4 + .../image_dedup_example.py | 101 ++++++++++++++++++ nemo_curator_semantic_dedup/job.yaml | 28 +++-- 3 files changed, 126 insertions(+), 7 deletions(-) diff --git a/nemo_curator_semantic_dedup/Dockerfile b/nemo_curator_semantic_dedup/Dockerfile index 200ef69..4d4b88c 100644 --- a/nemo_curator_semantic_dedup/Dockerfile +++ b/nemo_curator_semantic_dedup/Dockerfile @@ -42,6 +42,10 @@ snapshot_download('openai/clip-vit-large-patch14', local_dir=model_dir)" # Set environment variable for model directory ENV MODEL_DIR=/home/ray/model_weights +# Required by cosmos_xenna (NeMo Curator backend) - must be set before Ray cluster starts +# This allows cosmos_xenna to manage GPU allocation instead of Ray's default CUDA_VISIBLE_DEVICES handling +ENV RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=0 + # Download and prepare the example dataset from HuggingFace # Downloads MS COCO parquet, deduplicates URLs, and truncates to 100k rows RUN mkdir -p /home/ray/data && \ diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py index 077b94e..c5850c0 100644 --- a/nemo_curator_semantic_dedup/image_dedup_example.py +++ b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -19,6 +19,62 @@ import ray from helper import download_webdataset + +def wait_for_workers(min_cpus: int = 1, min_gpus: int = 1, timeout: int = 600, poll_interval: int = 10, stability_checks: int = 3): + """Wait for Ray cluster to have minimum required resources with stability verification. + + Args: + min_cpus: Minimum CPUs required + min_gpus: Minimum GPUs required + timeout: Maximum seconds to wait + poll_interval: Seconds between polls + stability_checks: Number of consecutive successful checks required + """ + # Connect to Ray cluster + if not ray.is_initialized(): + ray.init(address="auto", ignore_reinit_error=True) + + print(f"Waiting for cluster resources (min {min_cpus} CPUs, {min_gpus} GPUs)...") + start_time = time.time() + consecutive_success = 0 + + while True: + elapsed = time.time() - start_time + if elapsed > timeout: + raise TimeoutError(f"Cluster did not reach required resources within {timeout}s") + + try: + resources = ray.cluster_resources() + cpus = resources.get("CPU", 0) + gpus = resources.get("GPU", 0) + + print(f" [{elapsed:.0f}s] Available: {cpus:.0f} CPUs, {gpus:.0f} GPUs (check {consecutive_success + 1}/{stability_checks})") + + if cpus >= min_cpus and gpus >= min_gpus: + consecutive_success += 1 + if consecutive_success >= stability_checks: + print(f"✓ Cluster stable with {cpus:.0f} CPUs and {gpus:.0f} GPUs") + # Log full resources for debugging + print(f" Full resources: {resources}") + # Add delay to ensure Ray GCS state is fully propagated + print(" Waiting 5s for resource state to stabilize...") + time.sleep(5) + # Verify one more time + final_resources = ray.cluster_resources() + if "CPU" not in final_resources: + print(f" WARNING: CPU key missing after delay! 
Resources: {final_resources}") + consecutive_success = 0 + continue + print(f" Final verification: {final_resources.get('CPU', 0):.0f} CPUs, {final_resources.get('GPU', 0):.0f} GPUs") + return + else: + consecutive_success = 0 + except Exception as e: + print(f" [{elapsed:.0f}s] Waiting for cluster... ({e})") + consecutive_success = 0 + + time.sleep(poll_interval) + from nemo_curator.core.client import RayClient from nemo_curator.pipeline import Pipeline from nemo_curator.stages.deduplication.semantic import SemanticDeduplicationWorkflow @@ -127,6 +183,12 @@ def main(args: argparse.Namespace) -> None: ray_client = RayClient() ray_client.start() + # Wait for all cluster nodes to be ready (head + workers) + # Read expected resources from environment or use defaults + expected_cpus = int(os.environ.get("EXPECTED_CPUS", "4")) + expected_gpus = int(os.environ.get("EXPECTED_GPUS", "1")) + wait_for_workers(min_cpus=expected_cpus, min_gpus=expected_gpus, timeout=600, poll_interval=5, stability_checks=3) + print("Starting image curation pipeline...") print(f"Input parquet file: {args.input_parquet}") print(f"Input webdataset directory: {args.input_wds_dataset_dir}") @@ -164,6 +226,45 @@ def main(args: argparse.Namespace) -> None: # Step 2: Create and run curation pipelines # Step 2.1: Create image embedding pipeline print("Step 2.1: Running image embedding pipeline...") + + # Re-check cluster resources before running GPU pipeline + # This ensures workers are still connected after the download phase + # Use aggressive checking: 1s intervals, 5 consecutive checks required + print("Verifying cluster resources before GPU pipeline...") + wait_for_workers(min_cpus=expected_cpus, min_gpus=expected_gpus, timeout=300, poll_interval=1, stability_checks=5) + + # Extra verification: Query ray.nodes() to ensure node info is available + print("Verifying Ray nodes...") + nodes = ray.nodes() + print(f" Found {len(nodes)} Ray nodes:") + for node in nodes: + node_resources = node.get("Resources", {}) + alive = node.get("Alive", False) + print(f" - Node {node.get('NodeID', 'unknown')[:8]}: Alive={alive}, CPUs={node_resources.get('CPU', 0)}, GPUs={node_resources.get('GPU', 0)}") + + # Check both cluster_resources and available_resources + # cosmos_xenna might use available_resources which could be different + print("\nResource comparison:") + cluster_res = ray.cluster_resources() + avail_res = ray.available_resources() + print(f" cluster_resources(): CPU={cluster_res.get('CPU', 'MISSING')}, GPU={cluster_res.get('GPU', 'MISSING')}") + print(f" available_resources(): CPU={avail_res.get('CPU', 'MISSING')}, GPU={avail_res.get('GPU', 'MISSING')}") + + # Wait for resources to stabilize before cosmos_xenna runs + print("\nWaiting 10s for Ray state to fully stabilize before cosmos_xenna...") + time.sleep(10) + + # Final check right before pipeline + print("Final resource check:") + cluster_res = ray.cluster_resources() + avail_res = ray.available_resources() + print(f" cluster_resources(): CPU={cluster_res.get('CPU', 'MISSING')}, GPU={cluster_res.get('GPU', 'MISSING')}") + print(f" available_resources(): CPU={avail_res.get('CPU', 'MISSING')}, GPU={avail_res.get('GPU', 'MISSING')}") + + if 'CPU' not in cluster_res or 'GPU' not in cluster_res: + print("WARNING: cluster_resources missing CPU or GPU key!") + print(f" Full cluster_resources: {cluster_res}") + start_time = time.time() pipeline = create_image_embedding_pipeline(args) print(pipeline.describe()) diff --git a/nemo_curator_semantic_dedup/job.yaml 
b/nemo_curator_semantic_dedup/job.yaml index 23b3ab6..80d5e85 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -6,12 +6,18 @@ name: nemo-curator-image-dedup # Build custom image with NeMo Curator CUDA dependencies containerfile: ./Dockerfile -# Compute configuration with L40S GPU for CUDA-accelerated image processing +# Compute configuration with L4 GPU for CUDA-accelerated image processing +# Head + worker nodes for distributed processing compute_config: head_node: - instance_type: g6.8xlarge # 1x L40S GPU, 32 vCPUs, 128GB RAM (AWS) - # For GCP, use: g2-standard-32 (1x L4 GPU) - worker_nodes: [] # Run entirely on head node + instance_type: g6.8xlarge # 1x L4 GPU, 32 vCPUs, 128GB RAM + # Ensure Ray reports CPU resources on the head node for cosmos_xenna + resources: + CPU: 32 + worker_nodes: + - instance_type: g6.8xlarge # 1x L4 GPU per worker + min_nodes: 2 + max_nodes: 2 # Working directory - upload only the example code, not data working_dir: . @@ -46,15 +52,23 @@ env_vars: DOWNLOAD_PROCESSES: "8" ENTRIES_PER_TAR: "1000" - # Set to "true" to skip downloading (use existing WebDataset) - # WebDataset already exists from previous run - SKIP_DOWNLOAD: "false" + + SKIP_DOWNLOAD: "false" # Always keep false # Ray memory settings to avoid OOM RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION: "0.5" # Increase Ray API server limit for cosmos_xenna monitoring RAY_MAX_LIMIT_FROM_API_SERVER: "100000" + + # Required by cosmos_xenna (NeMo Curator backend) - must be set before Ray starts + # This allows cosmos_xenna to manage GPU allocation instead of Ray + RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES: "0" + + # Expected cluster resources (head + 1 worker) + # With 1 head (32 CPUs, 1 GPU) + 1 worker (32 CPUs, 1 GPU) = 64 CPUs, 2 GPUs + EXPECTED_CPUS: "60" + EXPECTED_GPUS: "2" # The entrypoint script entrypoint: python image_dedup_example.py From c22c1da6641a4b1236d33e352da0309f8ee96142 Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Tue, 9 Dec 2025 19:59:05 +0000 Subject: [PATCH 06/14] minimal working version --- nemo_curator_semantic_dedup/Dockerfile | 4 - .../image_dedup_example.py | 101 ------------------ nemo_curator_semantic_dedup/job.yaml | 9 -- 3 files changed, 114 deletions(-) diff --git a/nemo_curator_semantic_dedup/Dockerfile b/nemo_curator_semantic_dedup/Dockerfile index 4d4b88c..200ef69 100644 --- a/nemo_curator_semantic_dedup/Dockerfile +++ b/nemo_curator_semantic_dedup/Dockerfile @@ -42,10 +42,6 @@ snapshot_download('openai/clip-vit-large-patch14', local_dir=model_dir)" # Set environment variable for model directory ENV MODEL_DIR=/home/ray/model_weights -# Required by cosmos_xenna (NeMo Curator backend) - must be set before Ray cluster starts -# This allows cosmos_xenna to manage GPU allocation instead of Ray's default CUDA_VISIBLE_DEVICES handling -ENV RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES=0 - # Download and prepare the example dataset from HuggingFace # Downloads MS COCO parquet, deduplicates URLs, and truncates to 100k rows RUN mkdir -p /home/ray/data && \ diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py index c5850c0..077b94e 100644 --- a/nemo_curator_semantic_dedup/image_dedup_example.py +++ b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -19,62 +19,6 @@ import ray from helper import download_webdataset - -def wait_for_workers(min_cpus: int = 1, min_gpus: int = 1, timeout: int = 600, poll_interval: int = 10, stability_checks: int = 3): - """Wait for Ray cluster 
to have minimum required resources with stability verification. - - Args: - min_cpus: Minimum CPUs required - min_gpus: Minimum GPUs required - timeout: Maximum seconds to wait - poll_interval: Seconds between polls - stability_checks: Number of consecutive successful checks required - """ - # Connect to Ray cluster - if not ray.is_initialized(): - ray.init(address="auto", ignore_reinit_error=True) - - print(f"Waiting for cluster resources (min {min_cpus} CPUs, {min_gpus} GPUs)...") - start_time = time.time() - consecutive_success = 0 - - while True: - elapsed = time.time() - start_time - if elapsed > timeout: - raise TimeoutError(f"Cluster did not reach required resources within {timeout}s") - - try: - resources = ray.cluster_resources() - cpus = resources.get("CPU", 0) - gpus = resources.get("GPU", 0) - - print(f" [{elapsed:.0f}s] Available: {cpus:.0f} CPUs, {gpus:.0f} GPUs (check {consecutive_success + 1}/{stability_checks})") - - if cpus >= min_cpus and gpus >= min_gpus: - consecutive_success += 1 - if consecutive_success >= stability_checks: - print(f"✓ Cluster stable with {cpus:.0f} CPUs and {gpus:.0f} GPUs") - # Log full resources for debugging - print(f" Full resources: {resources}") - # Add delay to ensure Ray GCS state is fully propagated - print(" Waiting 5s for resource state to stabilize...") - time.sleep(5) - # Verify one more time - final_resources = ray.cluster_resources() - if "CPU" not in final_resources: - print(f" WARNING: CPU key missing after delay! Resources: {final_resources}") - consecutive_success = 0 - continue - print(f" Final verification: {final_resources.get('CPU', 0):.0f} CPUs, {final_resources.get('GPU', 0):.0f} GPUs") - return - else: - consecutive_success = 0 - except Exception as e: - print(f" [{elapsed:.0f}s] Waiting for cluster... 
({e})") - consecutive_success = 0 - - time.sleep(poll_interval) - from nemo_curator.core.client import RayClient from nemo_curator.pipeline import Pipeline from nemo_curator.stages.deduplication.semantic import SemanticDeduplicationWorkflow @@ -183,12 +127,6 @@ def main(args: argparse.Namespace) -> None: ray_client = RayClient() ray_client.start() - # Wait for all cluster nodes to be ready (head + workers) - # Read expected resources from environment or use defaults - expected_cpus = int(os.environ.get("EXPECTED_CPUS", "4")) - expected_gpus = int(os.environ.get("EXPECTED_GPUS", "1")) - wait_for_workers(min_cpus=expected_cpus, min_gpus=expected_gpus, timeout=600, poll_interval=5, stability_checks=3) - print("Starting image curation pipeline...") print(f"Input parquet file: {args.input_parquet}") print(f"Input webdataset directory: {args.input_wds_dataset_dir}") @@ -226,45 +164,6 @@ def main(args: argparse.Namespace) -> None: # Step 2: Create and run curation pipelines # Step 2.1: Create image embedding pipeline print("Step 2.1: Running image embedding pipeline...") - - # Re-check cluster resources before running GPU pipeline - # This ensures workers are still connected after the download phase - # Use aggressive checking: 1s intervals, 5 consecutive checks required - print("Verifying cluster resources before GPU pipeline...") - wait_for_workers(min_cpus=expected_cpus, min_gpus=expected_gpus, timeout=300, poll_interval=1, stability_checks=5) - - # Extra verification: Query ray.nodes() to ensure node info is available - print("Verifying Ray nodes...") - nodes = ray.nodes() - print(f" Found {len(nodes)} Ray nodes:") - for node in nodes: - node_resources = node.get("Resources", {}) - alive = node.get("Alive", False) - print(f" - Node {node.get('NodeID', 'unknown')[:8]}: Alive={alive}, CPUs={node_resources.get('CPU', 0)}, GPUs={node_resources.get('GPU', 0)}") - - # Check both cluster_resources and available_resources - # cosmos_xenna might use available_resources which could be different - print("\nResource comparison:") - cluster_res = ray.cluster_resources() - avail_res = ray.available_resources() - print(f" cluster_resources(): CPU={cluster_res.get('CPU', 'MISSING')}, GPU={cluster_res.get('GPU', 'MISSING')}") - print(f" available_resources(): CPU={avail_res.get('CPU', 'MISSING')}, GPU={avail_res.get('GPU', 'MISSING')}") - - # Wait for resources to stabilize before cosmos_xenna runs - print("\nWaiting 10s for Ray state to fully stabilize before cosmos_xenna...") - time.sleep(10) - - # Final check right before pipeline - print("Final resource check:") - cluster_res = ray.cluster_resources() - avail_res = ray.available_resources() - print(f" cluster_resources(): CPU={cluster_res.get('CPU', 'MISSING')}, GPU={cluster_res.get('GPU', 'MISSING')}") - print(f" available_resources(): CPU={avail_res.get('CPU', 'MISSING')}, GPU={avail_res.get('GPU', 'MISSING')}") - - if 'CPU' not in cluster_res or 'GPU' not in cluster_res: - print("WARNING: cluster_resources missing CPU or GPU key!") - print(f" Full cluster_resources: {cluster_res}") - start_time = time.time() pipeline = create_image_embedding_pipeline(args) print(pipeline.describe()) diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index 80d5e85..1cca312 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -60,15 +60,6 @@ env_vars: # Increase Ray API server limit for cosmos_xenna monitoring RAY_MAX_LIMIT_FROM_API_SERVER: "100000" - - # Required by cosmos_xenna (NeMo 
Curator backend) - must be set before Ray starts - # This allows cosmos_xenna to manage GPU allocation instead of Ray - RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES: "0" - - # Expected cluster resources (head + 1 worker) - # With 1 head (32 CPUs, 1 GPU) + 1 worker (32 CPUs, 1 GPU) = 64 CPUs, 2 GPUs - EXPECTED_CPUS: "60" - EXPECTED_GPUS: "2" # The entrypoint script entrypoint: python image_dedup_example.py From 4a8a73ff1c047e4b7daafc1691aebae227c7584a Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Sat, 27 Dec 2025 23:07:50 +0000 Subject: [PATCH 07/14] working --- nemo_curator_semantic_dedup/Dockerfile | 35 +++++++++++++++++-- .../image_dedup_example.py | 8 ++--- nemo_curator_semantic_dedup/job.yaml | 25 ++++++++----- 3 files changed, 50 insertions(+), 18 deletions(-) diff --git a/nemo_curator_semantic_dedup/Dockerfile b/nemo_curator_semantic_dedup/Dockerfile index 200ef69..5ca4caf 100644 --- a/nemo_curator_semantic_dedup/Dockerfile +++ b/nemo_curator_semantic_dedup/Dockerfile @@ -2,13 +2,17 @@ # Uses CUDA 12.8 for GPU-accelerated processing FROM anyscale/ray:2.52.0-slim-py312-cu128 +# Cache buster - change this to force rebuild +ARG CACHE_BUST=2025-12-27-v5 + # Install system dependencies RUN sudo apt-get update && \ sudo apt-get install -y --no-install-recommends \ build-essential \ unzip \ wget \ - curl && \ + curl \ + git && \ sudo apt-get clean && \ sudo rm -rf /var/lib/apt/lists/* @@ -16,8 +20,33 @@ RUN sudo apt-get update && \ RUN curl -LsSf https://astral.sh/uv/install.sh | sh # Install Python dependencies -# NeMo Curator with CUDA 12 support for image processing -RUN uv pip install --system "nemo-curator[image_cuda12]" +# Use uv pip install --system to install into the base anaconda environment +# so all Ray workers (not just the driver) have these packages +RUN python -m pip install --upgrade pip setuptools wheel + +# IMPORTANT: Uninstall any pre-existing RAPIDS/cuML packages from the base image +# The base image may have incompatible versions that conflict with scikit-learn +RUN python -m pip uninstall -y cuml-cu12 cudf-cu12 cugraph-cu12 pylibraft-cu12 raft-dask-cu12 rmm-cu12 || true && \ + echo "Cleaned up pre-existing RAPIDS packages" + +# Upgrade scikit-learn FIRST in the system environment for cuML compatibility +# cuML 25.6.* requires sklearn with _get_default_requests (added in sklearn 1.5) +# This MUST be in the base anaconda env so workers have it +RUN uv pip install --system "scikit-learn>=1.5,<1.6" && \ + python -c "import sklearn; print(f'scikit-learn version: {sklearn.__version__}')" + +# Clone NeMo-Curator from fork and install in editable mode +# This ensures all Ray workers have the same code with your local edits +ARG CURATOR_REPO=https://github.com/avigyabb/Curator.git +ARG CURATOR_REF=avi-test +# ARG CURATOR_REF=main +RUN git clone --depth 1 -b ${CURATOR_REF} ${CURATOR_REPO} /home/ray/NeMo-Curator && \ + uv pip install --system -e /home/ray/NeMo-Curator[image_cuda12] + +# Re-upgrade scikit-learn AFTER nemo-curator in case it was downgraded +# cuML 25.6.* needs sklearn >= 1.5 (has _get_default_requests) +RUN uv pip install --system "scikit-learn>=1.5,<1.6" && \ + python -c "import sklearn; print(f'Final scikit-learn version: {sklearn.__version__}')" # Additional dependencies for image downloading and processing RUN uv pip install --system \ diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py index 077b94e..5256c6b 100644 --- a/nemo_curator_semantic_dedup/image_dedup_example.py +++ 
b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -46,10 +46,8 @@ def create_image_embedding_pipeline(args: argparse.Namespace) -> Pipeline: # Stage 1: Read images from webdataset tar files (now runs in parallel) pipeline.add_stage(ImageReaderStage( - task_batch_size=args.batch_size, + batch_size=args.batch_size, verbose=args.verbose, - num_threads=16, # More threads for I/O - num_gpus_per_worker=0.25, )) # Stage 2: Generate CLIP embeddings for images @@ -98,10 +96,8 @@ def create_image_deduplication_pipeline(args: argparse.Namespace) -> Pipeline: # Stage 1: Read images from webdataset tar files (now runs in parallel) pipeline.add_stage(ImageReaderStage( - task_batch_size=args.batch_size, + batch_size=args.batch_size, verbose=args.verbose, - num_threads=16, # More threads for I/O - num_gpus_per_worker=0.25, )) # Stage 2: Read removal list from parquet file and filter images diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index 1cca312..c8227d1 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -19,8 +19,8 @@ compute_config: min_nodes: 2 max_nodes: 2 -# Working directory - upload only the example code, not data -working_dir: . +# Working directory - use the repo root (absolute) so Curator/ is included +working_dir: /home/ray/default # Environment variables for job configuration # Override these when submitting to use your own data paths @@ -45,24 +45,31 @@ env_vars: # Model weights directory (pre-downloaded in Docker image) MODEL_DIR: "/home/ray/model_weights" - # Processing settings - BATCH_SIZE: "32" - EMBEDDING_BATCH_SIZE: "32" - TAR_FILES_PER_PARTITION: "10" + # Processing settings (reduced to prevent OOM) + BATCH_SIZE: "4" + EMBEDDING_BATCH_SIZE: "8" + TAR_FILES_PER_PARTITION: "1" DOWNLOAD_PROCESSES: "8" - ENTRIES_PER_TAR: "1000" + ENTRIES_PER_TAR: "500" SKIP_DOWNLOAD: "false" # Always keep false # Ray memory settings to avoid OOM - RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION: "0.5" + RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION: "0.3" + # Spill objects to disk when memory is low instead of crashing + RAY_OBJECT_SPILLING_CONFIG: '{"type":"filesystem","params":{"directory_path":"/tmp/ray_spill"}}' + # Kill tasks that use too much memory before they OOM the node + RAY_memory_monitor_refresh_ms: "100" + # Force garbage collection more frequently + RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING: "1" # Increase Ray API server limit for cosmos_xenna monitoring RAY_MAX_LIMIT_FROM_API_SERVER: "100000" # The entrypoint script -entrypoint: python image_dedup_example.py +# Install local Curator (uploaded via working_dir) so image uses your current code +entrypoint: python examples/nemo_curator_semantic_dedup/image_dedup_example.py # Don't retry on failure - easier to debug max_retries: 0 From 507db13716c60c75d11b6d6b6b760b01b7492863 Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Tue, 30 Dec 2025 01:22:50 +0000 Subject: [PATCH 08/14] working --- nemo_curator_semantic_dedup/Dockerfile | 8 ++++-- .../image_dedup_example.py | 20 +++++++++---- nemo_curator_semantic_dedup/job.yaml | 28 +++++++++++++++---- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/nemo_curator_semantic_dedup/Dockerfile b/nemo_curator_semantic_dedup/Dockerfile index 5ca4caf..e2b6845 100644 --- a/nemo_curator_semantic_dedup/Dockerfile +++ b/nemo_curator_semantic_dedup/Dockerfile @@ -2,8 +2,7 @@ # Uses CUDA 12.8 for GPU-accelerated processing FROM anyscale/ray:2.52.0-slim-py312-cu128 -# Cache buster - change this to force rebuild -ARG 
CACHE_BUST=2025-12-27-v5 +# Note: Cache busting for git clone is done via CURATOR_CACHE_BUST arg below # Install system dependencies RUN sudo apt-get update && \ @@ -40,7 +39,10 @@ RUN uv pip install --system "scikit-learn>=1.5,<1.6" && \ ARG CURATOR_REPO=https://github.com/avigyabb/Curator.git ARG CURATOR_REF=avi-test # ARG CURATOR_REF=main -RUN git clone --depth 1 -b ${CURATOR_REF} ${CURATOR_REPO} /home/ray/NeMo-Curator && \ +# Cache bust for git clone - change this value to force re-clone after pushing to branch +ARG CURATOR_CACHE_BUST=2025-12-29-v3 +RUN echo "Cache bust: ${CURATOR_CACHE_BUST}" && \ + git clone --depth 1 -b ${CURATOR_REF} ${CURATOR_REPO} /home/ray/NeMo-Curator && \ uv pip install --system -e /home/ray/NeMo-Curator[image_cuda12] # Re-upgrade scikit-learn AFTER nemo-curator in case it was downgraded diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py index 5256c6b..d4e47f6 100644 --- a/nemo_curator_semantic_dedup/image_dedup_example.py +++ b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -19,6 +19,8 @@ import ray from helper import download_webdataset +from nemo_curator.backends.experimental.ray_actor_pool import RayActorPoolExecutor +from nemo_curator.backends.experimental.ray_data import RayDataExecutor from nemo_curator.core.client import RayClient from nemo_curator.pipeline import Pipeline from nemo_curator.stages.deduplication.semantic import SemanticDeduplicationWorkflow @@ -158,20 +160,28 @@ def main(args: argparse.Namespace) -> None: print("\n" + "=" * 50 + "\n") # Step 2: Create and run curation pipelines + # Use experimental executors with ignore_head_node=True to avoid scheduling on head node + # This allows using a CPU-only head node while GPU tasks run on workers + streaming_executor = RayDataExecutor(ignore_head_node=True) + batch_executor = RayActorPoolExecutor(ignore_head_node=True) + # Step 2.1: Create image embedding pipeline print("Step 2.1: Running image embedding pipeline...") start_time = time.time() pipeline = create_image_embedding_pipeline(args) print(pipeline.describe()) print("\n" + "=" * 50 + "\n") - pipeline.run() + pipeline.run(executor=streaming_executor) - # Step 2.2: Create image deduplication pipeline (pairwise executor is XennaExecutor by default) + # Step 2.2: Create image deduplication pipeline (semantic dedup workflow) print("Step 2.2: Running image deduplication pipeline...") start_time = time.time() - pipeline = create_embedding_deduplication_workflow(args) + workflow = create_embedding_deduplication_workflow(args) print("\n" + "=" * 50 + "\n") - pipeline.run() + workflow.run( + kmeans_executor=RayActorPoolExecutor(ignore_head_node=True), + pairwise_executor=RayActorPoolExecutor(ignore_head_node=True), + ) # Step 2.3: Create image deduplication pipeline print("Step 2.3: Running image deduplication pipeline...") @@ -179,7 +189,7 @@ def main(args: argparse.Namespace) -> None: pipeline = create_image_deduplication_pipeline(args) print(pipeline.describe()) print("\n" + "=" * 50 + "\n") - pipeline.run() + pipeline.run(executor=streaming_executor) end_time = time.time() diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index c8227d1..f6d08fe 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -7,15 +7,15 @@ name: nemo-curator-image-dedup containerfile: ./Dockerfile # Compute configuration with L4 GPU for CUDA-accelerated image processing -# Head + worker nodes for distributed processing 
+# CPU-only head node + GPU worker nodes (using ignore_head_node=True in executors) compute_config: head_node: - instance_type: g6.8xlarge # 1x L4 GPU, 32 vCPUs, 128GB RAM - # Ensure Ray reports CPU resources on the head node for cosmos_xenna + instance_type: m6i.2xlarge # CPU-only, 8 vCPUs, 32GB RAM + # No tasks scheduled here - using RayDataExecutor/RayActorPoolExecutor with ignore_head_node=True resources: - CPU: 32 + CPU: 0 # Prevent any task scheduling on head node worker_nodes: - - instance_type: g6.8xlarge # 1x L4 GPU per worker + - instance_type: g5.12xlarge # 4x A10G GPUs per worker, 48 vCPUs, 192GB RAM min_nodes: 2 max_nodes: 2 @@ -52,9 +52,26 @@ env_vars: DOWNLOAD_PROCESSES: "8" ENTRIES_PER_TAR: "500" + # GPU allocation per worker - 2.0 = one worker per 2 GPUs (very strict limit) + # This prevents OOM even with Ray Data pipelining overlap between stages + EMBEDDING_GPUS_PER_WORKER: "2.0" + SKIP_DOWNLOAD: "false" # Always keep false + # Don't hide GPUs from tasks that request num_gpus=0 (needed for DALI) + RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0" + + # NCCL settings for multi-GPU K-means across nodes + # Enable debug info to diagnose NCCL communication failures + NCCL_DEBUG: "INFO" + # Use the network interface that can reach other nodes + NCCL_SOCKET_IFNAME: "ens,eth" + # Increase timeout for initialization + NCCL_TIMEOUT: "600" + # Force socket-based communication (more compatible than IB) + NCCL_NET: "Socket" + # Ray memory settings to avoid OOM RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION: "0.3" # Spill objects to disk when memory is low instead of crashing @@ -68,7 +85,6 @@ env_vars: RAY_MAX_LIMIT_FROM_API_SERVER: "100000" # The entrypoint script -# Install local Curator (uploaded via working_dir) so image uses your current code entrypoint: python examples/nemo_curator_semantic_dedup/image_dedup_example.py # Don't retry on failure - easier to debug From 2ede91cb1b2aeda6c129ad303c71eaa0cdb29e6a Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Tue, 30 Dec 2025 09:46:54 +0000 Subject: [PATCH 09/14] working w/ laion --- nemo_curator_semantic_dedup/helper.py | 174 +++++++++++++++--- .../image_dedup_example.py | 8 + nemo_curator_semantic_dedup/job.yaml | 19 +- 3 files changed, 163 insertions(+), 38 deletions(-) diff --git a/nemo_curator_semantic_dedup/helper.py b/nemo_curator_semantic_dedup/helper.py index a83a1fa..11390f6 100644 --- a/nemo_curator_semantic_dedup/helper.py +++ b/nemo_curator_semantic_dedup/helper.py @@ -26,6 +26,7 @@ import aiohttp import pandas as pd +import pyarrow.dataset as pa_ds from loguru import logger from PIL import Image from tqdm import tqdm @@ -39,25 +40,26 @@ async def fetch_image_bytes(session: aiohttp.ClientSession, url: str, retries: int = 3) -> bytes | None: + last_error = None for attempt in range(1, retries + 1): try: async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response: if response.status == HTTP_OK: return await response.read() - elif attempt > 1: - logger.debug(f"[Attempt {attempt}] Failed to download {url}: HTTP status {response.status}") + last_error = f"HTTP {response.status}" except (aiohttp.ClientError, asyncio.TimeoutError) as e: - if attempt > 1: - logger.debug(f"[Attempt {attempt}] Failed to download {url}: {e}") + last_error = str(e) if attempt < retries: await asyncio.sleep(1) - logger.debug(f"All {retries} attempts failed for {url}") + # Only log final failure (not every retry) to reduce noise + # logger.debug(f"Failed: {url} ({last_error})") return None -async def process_batch(batch: pd.DataFrame, 
output_dir: str, batch_num: int) -> None: +async def process_batch(batch: pd.DataFrame, output_dir: str, batch_num: int) -> int: + """Process a batch of URLs and return the number of successfully downloaded images.""" tar_filename = os.path.join(output_dir, f"{batch_num:05d}.tar") metadatas = [] @@ -80,16 +82,59 @@ async def process_batch(batch: pd.DataFrame, output_dir: str, batch_num: int) -> results = await asyncio.gather(*tasks, return_exceptions=True) + success_count = 0 with tarfile.open(tar_filename, "w") as tar: for i, result in enumerate(results): # Only proceed for successful downloads (bytes) if isinstance(result, bytes) and result: + # Validate and convert to JPEG (DALI doesn't support WebP/other formats) + try: + img = Image.open(io.BytesIO(result)) + img.verify() # Verify it's a valid image + # Re-open after verify (verify consumes the file) + img = Image.open(io.BytesIO(result)) + + # Robust RGB conversion for ALL image modes (L, LA, P, PA, RGBA, CMYK, etc.) + # This ensures CLIP gets 3-channel images + if img.mode != "RGB": + # For palette images, convert to RGBA first to preserve transparency info + if img.mode == "P": + img = img.convert("RGBA") + # For any mode with alpha, composite onto white background + if img.mode in ("RGBA", "LA", "PA"): + background = Image.new("RGB", img.size, (255, 255, 255)) + # Use alpha channel as mask + if img.mode == "LA": + img = img.convert("RGBA") + background.paste(img, mask=img.split()[-1]) + img = background + else: + # Simple conversion for grayscale (L), CMYK, etc. + img = img.convert("RGB") + + # Final safety check - ensure we have exactly 3 channels + if img.mode != "RGB": + continue # Skip if conversion somehow failed + + # Skip images that are too small (CLIP needs at least 3x3 to avoid channel ambiguity) + if img.size[0] < 3 or img.size[1] < 3: + continue + + # Re-encode as JPEG to ensure DALI compatibility + jpeg_buffer = io.BytesIO() + img.save(jpeg_buffer, format="JPEG", quality=95) + jpeg_bytes = jpeg_buffer.getvalue() + except Exception: + # Skip invalid/corrupted images (e.g., HTML error pages) + continue + + success_count += 1 key = f"{batch_num:05d}{i:04d}" - # Add image bytes + # Add image bytes (now guaranteed to be JPEG) jpg_info = tarfile.TarInfo(name=f"{key}.jpg") - jpg_info.size = len(result) - tar.addfile(jpg_info, fileobj=io.BytesIO(result)) + jpg_info.size = len(jpeg_bytes) + tar.addfile(jpg_info, fileobj=io.BytesIO(jpeg_bytes)) # Add caption text caption_bytes = str(metadatas[i]["caption"]).encode("utf-8") @@ -107,12 +152,14 @@ async def process_batch(batch: pd.DataFrame, output_dir: str, batch_num: int) -> meta_df = pd.DataFrame(metadatas) parquet_path = os.path.join(output_dir, f"{batch_num:05d}.parquet") meta_df.to_parquet(parquet_path) + + return success_count -def process_parquet_chunk(chunk: tuple[int, pd.DataFrame], output_dir: str) -> None: +def process_parquet_chunk(chunk: tuple[int, pd.DataFrame], output_dir: str) -> int: + """Process a chunk and return the number of successfully downloaded images.""" batch_num, batch = chunk - - asyncio.run(process_batch(batch, output_dir, batch_num)) + return asyncio.run(process_batch(batch, output_dir, batch_num)) def download_webdataset( @@ -120,30 +167,99 @@ def download_webdataset( output_dir: str, entries_per_tar: int = 10000, num_processes: int = 2, + max_entries: int | None = None, ) -> None: - os.makedirs(output_dir, exist_ok=True) + """Stream a large Parquet of URLs/TEXT into WebDataset tar shards. 
- # Read the parquet file - df = pd.read_parquet(parquet_path) - print(f"Loaded {len(df)} entries from parquet file") + Uses pyarrow dataset streaming to avoid loading the entire Parquet into memory, + so it can scale to 100M+ rows (e.g., LAION subsets). - # Split the dataframe into chunks for multiprocessing - chunks = [ - (batch_num, df[i : i + entries_per_tar]) for batch_num, i in enumerate(range(0, len(df), entries_per_tar)) - ] - print(f"Split into {len(chunks)} chunks of {entries_per_tar} entries each") + Args: + parquet_path: Path to the parquet file containing URLs and text + output_dir: Directory to save the webdataset tar files + entries_per_tar: Number of entries per tar file + num_processes: Number of parallel download processes + max_entries: Maximum number of entries to process (for testing). None = no limit. + """ + os.makedirs(output_dir, exist_ok=True) - # Use multiprocessing to process chunks in parallel with progress tracking + # Stream the Parquet in batches; resolve URL/TEXT in a case-insensitive way and map TEXT->caption if needed + dataset = pa_ds.dataset(parquet_path, format="parquet") + schema = dataset.schema + available = set(schema.names) + + def resolve_cols() -> list[str]: + resolved = [] + for col in ["URL", "TEXT"]: + if col in available: + resolved.append(col) + continue + lower = col.lower() + if lower in available: + resolved.append(lower) + continue + if col.upper() == "TEXT" and "caption" in available: + resolved.append("caption") + if not resolved: + raise ValueError(f"No URL/TEXT-like columns found in {parquet_path}; available: {sorted(available)}") + return resolved + + resolved_cols = resolve_cols() + total_rows = dataset.count_rows() + + # Apply max_entries limit for testing + if max_entries is not None and total_rows is not None: + total_rows = min(total_rows, max_entries) + print(f"Limiting to {max_entries} entries for testing") + + total_chunks = math.ceil(total_rows / entries_per_tar) if total_rows is not None else None + + def batch_iter(): + batch_num = 0 + rows_yielded = 0 + for batch in dataset.to_batches(columns=resolved_cols, batch_size=entries_per_tar): + df = batch.to_pandas() + + # Apply max_entries limit + if max_entries is not None: + remaining = max_entries - rows_yielded + if remaining <= 0: + break + if len(df) > remaining: + df = df.head(remaining) + + # normalize column names to URL/TEXT expected downstream + col_map: dict[str, str] = {} + if "url" in df.columns and "URL" not in df.columns: + col_map["url"] = "URL" + if "caption" in df.columns and "TEXT" not in df.columns: + col_map["caption"] = "TEXT" + df = df.rename(columns=col_map) + yield (batch_num, df) + rows_yielded += len(df) + batch_num += 1 + + total_success = 0 + total_attempted = 0 with Pool(processes=num_processes) as pool: func = partial(process_parquet_chunk, output_dir=output_dir) - - # Use tqdm to track progress of chunk processing - list(tqdm( - pool.imap(func, chunks), - total=len(chunks), + for success_count in tqdm( + pool.imap(func, batch_iter()), + total=total_chunks, desc="Processing chunks", - unit="chunk" - )) + unit="chunk", + ): + total_success += success_count + total_attempted += entries_per_tar # approximate + + # Report download success rate + success_rate = (total_success / total_attempted * 100) if total_attempted > 0 else 0 + print(f"\n✓ Download complete: {total_success} images saved ({success_rate:.1f}% success rate)") + print(f" Note: LAION datasets have high link rot - many URLs no longer work.") + + if total_success == 0: + print("\n⚠️ 
WARNING: No images were downloaded successfully!") + print(" This is likely due to LAION link rot. Try increasing MAX_ENTRIES.") # Best-effort cleanup of legacy tmp dir from previous versions tmp_dir = os.path.join(output_dir, "tmp") diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py index d4e47f6..d8022be 100644 --- a/nemo_curator_semantic_dedup/image_dedup_example.py +++ b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -148,6 +148,7 @@ def main(args: argparse.Namespace) -> None: output_dir=args.input_wds_dataset_dir, num_processes=args.download_processes, entries_per_tar=args.entries_per_tar, + max_entries=args.max_entries, ) download_time = time.time() - download_start @@ -300,6 +301,12 @@ def get_env_float(env_var: str, arg_value: float, default: float) -> float: default=None, help="Skip dataset download and use existing webdataset (env: SKIP_DOWNLOAD)" ) + parser.add_argument( + "--max-entries", + type=int, + default=None, + help="Maximum entries to download for testing (env: MAX_ENTRIES). None = no limit." + ) # Image reader arguments parser.add_argument( @@ -356,6 +363,7 @@ def get_env_float(env_var: str, arg_value: float, default: float) -> float: model_dir=get_env_or_arg("MODEL_DIR", cli_args.model_dir, "/home/ray/model_weights"), download_processes=get_env_int("DOWNLOAD_PROCESSES", cli_args.download_processes, 8), entries_per_tar=get_env_int("ENTRIES_PER_TAR", cli_args.entries_per_tar, 1000), + max_entries=int(get_env_or_arg("MAX_ENTRIES", cli_args.max_entries)) if get_env_or_arg("MAX_ENTRIES", cli_args.max_entries) else None, skip_download=get_env_bool("SKIP_DOWNLOAD", cli_args.skip_download, False), tar_files_per_partition=get_env_int("TAR_FILES_PER_PARTITION", cli_args.tar_files_per_partition, 1), batch_size=get_env_int("BATCH_SIZE", cli_args.batch_size, 100), diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index f6d08fe..fef0e2d 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -26,8 +26,9 @@ working_dir: /home/ray/default # Override these when submitting to use your own data paths env_vars: # Input parquet file with image URLs (TEXT and URL columns) - # This file is copied into the Docker image during build - INPUT_PARQUET: "/home/ray/data/truncated_100k_mscoco.parquet" + # LAION dataset (relative to working_dir) + INPUT_PARQUET: "examples/nemo_curator_semantic_dedup/laion_meta/laion_subset_10m.parquet" + MAX_ENTRIES: "10000" # Limit for testing # Directory for WebDataset tar files (created from parquet) # Use /mnt/cluster_storage for persistence, or /home/ray/data for ephemeral @@ -63,14 +64,14 @@ env_vars: RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0" # NCCL settings for multi-GPU K-means across nodes - # Enable debug info to diagnose NCCL communication failures - NCCL_DEBUG: "INFO" - # Use the network interface that can reach other nodes + NCCL_DEBUG: "WARN" NCCL_SOCKET_IFNAME: "ens,eth" - # Increase timeout for initialization - NCCL_TIMEOUT: "600" - # Force socket-based communication (more compatible than IB) - NCCL_NET: "Socket" + NCCL_TIMEOUT: "1800" + # Disable features that don't work well across nodes on g5 instances + NCCL_P2P_DISABLE: "1" # Disable GPU peer-to-peer (forces host memory path) + NCCL_SHM_DISABLE: "1" # Disable shared memory (forces network for all) + NCCL_IB_DISABLE: "1" # Disable InfiniBand (not available on g5) + NCCL_ASYNC_ERROR_HANDLING: "1" # Better error recovery # Ray memory settings to avoid 
OOM RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION: "0.3" From e2323f2944b2e6f96eb12266d145a6f4a83de75d Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Wed, 31 Dec 2025 00:24:25 +0000 Subject: [PATCH 10/14] working version at scale --- nemo_curator_semantic_dedup/job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index fef0e2d..c35447c 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -28,7 +28,7 @@ env_vars: # Input parquet file with image URLs (TEXT and URL columns) # LAION dataset (relative to working_dir) INPUT_PARQUET: "examples/nemo_curator_semantic_dedup/laion_meta/laion_subset_10m.parquet" - MAX_ENTRIES: "10000" # Limit for testing + MAX_ENTRIES: "100000" # Limit for testing # Directory for WebDataset tar files (created from parquet) # Use /mnt/cluster_storage for persistence, or /home/ray/data for ephemeral From 82aa265a25b7357ca59691226badc96e07a84fc0 Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Wed, 31 Dec 2025 01:29:03 +0000 Subject: [PATCH 11/14] job yaml cleanup --- nemo_curator_semantic_dedup/job.yaml | 29 +--------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index c35447c..7467363 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -28,7 +28,7 @@ env_vars: # Input parquet file with image URLs (TEXT and URL columns) # LAION dataset (relative to working_dir) INPUT_PARQUET: "examples/nemo_curator_semantic_dedup/laion_meta/laion_subset_10m.parquet" - MAX_ENTRIES: "100000" # Limit for testing + MAX_ENTRIES: "10000" # Limit for testing # Directory for WebDataset tar files (created from parquet) # Use /mnt/cluster_storage for persistence, or /home/ray/data for ephemeral @@ -52,38 +52,11 @@ env_vars: TAR_FILES_PER_PARTITION: "1" DOWNLOAD_PROCESSES: "8" ENTRIES_PER_TAR: "500" - - # GPU allocation per worker - 2.0 = one worker per 2 GPUs (very strict limit) - # This prevents OOM even with Ray Data pipelining overlap between stages - EMBEDDING_GPUS_PER_WORKER: "2.0" - SKIP_DOWNLOAD: "false" # Always keep false # Don't hide GPUs from tasks that request num_gpus=0 (needed for DALI) RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0" - - # NCCL settings for multi-GPU K-means across nodes - NCCL_DEBUG: "WARN" - NCCL_SOCKET_IFNAME: "ens,eth" - NCCL_TIMEOUT: "1800" - # Disable features that don't work well across nodes on g5 instances - NCCL_P2P_DISABLE: "1" # Disable GPU peer-to-peer (forces host memory path) - NCCL_SHM_DISABLE: "1" # Disable shared memory (forces network for all) - NCCL_IB_DISABLE: "1" # Disable InfiniBand (not available on g5) - NCCL_ASYNC_ERROR_HANDLING: "1" # Better error recovery - - # Ray memory settings to avoid OOM - RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION: "0.3" - # Spill objects to disk when memory is low instead of crashing - RAY_OBJECT_SPILLING_CONFIG: '{"type":"filesystem","params":{"directory_path":"/tmp/ray_spill"}}' - # Kill tasks that use too much memory before they OOM the node - RAY_memory_monitor_refresh_ms: "100" - # Force garbage collection more frequently - RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING: "1" - - # Increase Ray API server limit for cosmos_xenna monitoring - RAY_MAX_LIMIT_FROM_API_SERVER: "100000" # The entrypoint script entrypoint: python examples/nemo_curator_semantic_dedup/image_dedup_example.py From 06d674249ad58dc135b5ec8ec3b0d250f33ad363 Mon Sep 17 
00:00:00 2001 From: Avi Basnet Date: Fri, 2 Jan 2026 07:40:40 +0000 Subject: [PATCH 12/14] removed unnecessary functions in helper.py --- nemo_curator_semantic_dedup/helper.py | 176 +------------------------- 1 file changed, 2 insertions(+), 174 deletions(-) diff --git a/nemo_curator_semantic_dedup/helper.py b/nemo_curator_semantic_dedup/helper.py index 11390f6..c798d2c 100644 --- a/nemo_curator_semantic_dedup/helper.py +++ b/nemo_curator_semantic_dedup/helper.py @@ -244,7 +244,7 @@ def batch_iter(): with Pool(processes=num_processes) as pool: func = partial(process_parquet_chunk, output_dir=output_dir) for success_count in tqdm( - pool.imap(func, batch_iter()), + pool.imap_unordered(func, batch_iter()), total=total_chunks, desc="Processing chunks", unit="chunk", @@ -268,176 +268,4 @@ def batch_iter(): os.rmdir(tmp_dir) except OSError as e: logger.debug(f"Failed to remove tmp dir {tmp_dir}: {e}") - - -def _prepare_metadata_record( - image_obj: ImageObject, - new_id: str, - old_id_col: str | None, -) -> dict: - """Prepare metadata record for an image object.""" - metadata_record = { - "id": new_id, - "original_id": image_obj.image_id, - "original_path": image_obj.image_path, - } - - # Preserve original ID in specified column if requested - if old_id_col: - metadata_record[old_id_col] = image_obj.image_id - - # Add scores and embeddings to metadata - if image_obj.aesthetic_score is not None: - metadata_record["aesthetic_score"] = image_obj.aesthetic_score - if image_obj.nsfw_score is not None: - metadata_record["nsfw_score"] = image_obj.nsfw_score - if image_obj.embedding is not None: - # Convert embedding to list for JSON serialization - metadata_record["embedding"] = image_obj.embedding.tolist() - metadata_record["embedding_dim"] = len(image_obj.embedding) - - # Add original metadata - if image_obj.metadata: - metadata_record.update(image_obj.metadata) - - return metadata_record - - -def _add_caption_to_metadata(image_obj: ImageObject, metadata_record: dict) -> None: - """Add caption/text to metadata record.""" - if "caption" in image_obj.metadata: - metadata_record["caption"] = str(image_obj.metadata["caption"]) - elif "text" in image_obj.metadata: - metadata_record["caption"] = str(image_obj.metadata["text"]) - elif "TEXT" in image_obj.metadata: - metadata_record["caption"] = str(image_obj.metadata["TEXT"]) - - -def _add_image_to_tar(tar: tarfile.TarFile, image_obj: ImageObject, new_id: str) -> None: - """Add image data to tar file if available.""" - if image_obj.image_data is not None: - # Convert numpy array to PIL Image and save as bytes - image_pil = Image.fromarray(image_obj.image_data) - image_bytes = _image_to_bytes(image_pil) - - # Add image to tar - image_info = tarfile.TarInfo(name=f"{new_id}.jpg") - image_info.size = len(image_bytes.getvalue()) - tar.addfile(image_info, fileobj=image_bytes) - - -def _add_json_to_tar(tar: tarfile.TarFile, metadata_record: dict, new_id: str) -> None: - """Add JSON metadata to tar file.""" - json_data = json.dumps(metadata_record, indent=2) - json_bytes = json_data.encode("utf-8") - json_info = tarfile.TarInfo(name=f"{new_id}.json") - json_info.size = len(json_bytes) - tar.addfile(json_info, fileobj=io.BytesIO(json_bytes)) - - -def save_imagebatch_to_webdataset( - image_batches: list[ImageBatch], - output_path: str, - samples_per_shard: int = 10000, - max_shards: int = 5, - old_id_col: str | None = None, -) -> None: - """ - Save ImageBatch objects to WebDataset format with resharding. 
- - Args: - image_batches: List of ImageBatch objects from pipeline output - output_path: Directory path where the WebDataset should be saved - samples_per_shard: Number of samples to include in each tar file - max_shards: Order of magnitude of max shards (for zero-padding filenames) - old_id_col: If specified, will preserve the original image_id in this column - """ - os.makedirs(output_path, exist_ok=True) - - # Flatten all ImageObjects from all batches - all_image_objects = [] - for batch in image_batches: - all_image_objects.extend(batch.data) - - if not all_image_objects: - print("No images to save") - return - - print(f"Processing {len(all_image_objects)} images into {samples_per_shard} samples per shard") - - max_samples_per_shard = math.ceil(math.log10(samples_per_shard)) - - # Process images in shards - shard_id = 0 - for i in range(0, len(all_image_objects), samples_per_shard): - shard_images = all_image_objects[i:i + samples_per_shard] - - # Create output file paths - parquet_filename = _name_partition(shard_id, max_shards=max_shards) - tar_filename = _name_partition(shard_id, max_shards=max_shards, ext="tar") - parquet_path = os.path.join(output_path, parquet_filename) - tar_path = os.path.join(output_path, tar_filename) - - # Prepare metadata for parquet - metadata_records = [] - - # Create tar file with images and metadata - with tarfile.open(tar_path, "w") as tar: - for sample_idx, image_obj in enumerate(shard_images): - # Generate new ID combining shard and sample indices - new_id = _combine_id( - shard_id, - sample_idx, - max_shards=max_shards, - max_samples_per_shard=max_samples_per_shard - ) - - # Prepare metadata record for parquet - metadata_record = _prepare_metadata_record(image_obj, new_id, old_id_col) - metadata_records.append(metadata_record) - - # Save image data if available and requested - _add_image_to_tar(tar, image_obj, new_id) - - # Store caption/text in metadata (no separate .txt file) - _add_caption_to_metadata(image_obj, metadata_record) - - # Add JSON metadata to tar - _add_json_to_tar(tar, metadata_record, new_id) - - # Save metadata to parquet - metadata_df = pd.DataFrame(metadata_records) - metadata_df.to_parquet(parquet_path, index=False) - - print(f"✓ Saved shard {shard_id:0{max_shards}d} with {len(shard_images)} samples") - print(f" - Tar file: {tar_filename}") - print(f" - Parquet file: {parquet_filename}") - - shard_id += 1 - - print(f"\nSuccessfully saved {len(all_image_objects)} images to {shard_id} shards") - print(f"Output directory: {output_path}") - - -def _name_partition( - partition_index: int, - max_shards: int = 5, - ext: str = "parquet", -) -> str: - """Generate partition filename with proper zero-padding.""" - return f"{partition_index:0{max_shards}d}.{ext}" - - -def _combine_id(shard_id: int, sample_id: int, max_shards: int = 5, max_samples_per_shard: int = 4) -> str: - """Combine shard and sample IDs into a unique identifier.""" - int_id = sample_id + (10**max_samples_per_shard) * shard_id - n_digits = max_samples_per_shard + max_shards - return f"{int_id:0{n_digits}d}" - - -def _image_to_bytes(image_pil: Image.Image, image_format: str = "JPEG") -> io.BytesIO: - """Convert PIL Image to BytesIO object for tarfile.""" - buffer = io.BytesIO() - image_pil.save(buffer, format=image_format) - buffer.seek(0) - return buffer \ No newline at end of file + \ No newline at end of file From be4100e277e1d884d47aa55b84d20dfcf1787e7b Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Fri, 2 Jan 2026 08:06:31 +0000 Subject: [PATCH 13/14] changes to 
dockerfile --- nemo_curator_semantic_dedup/Dockerfile | 29 -------------------------- 1 file changed, 29 deletions(-) diff --git a/nemo_curator_semantic_dedup/Dockerfile b/nemo_curator_semantic_dedup/Dockerfile index e2b6845..997dc07 100644 --- a/nemo_curator_semantic_dedup/Dockerfile +++ b/nemo_curator_semantic_dedup/Dockerfile @@ -28,12 +28,6 @@ RUN python -m pip install --upgrade pip setuptools wheel RUN python -m pip uninstall -y cuml-cu12 cudf-cu12 cugraph-cu12 pylibraft-cu12 raft-dask-cu12 rmm-cu12 || true && \ echo "Cleaned up pre-existing RAPIDS packages" -# Upgrade scikit-learn FIRST in the system environment for cuML compatibility -# cuML 25.6.* requires sklearn with _get_default_requests (added in sklearn 1.5) -# This MUST be in the base anaconda env so workers have it -RUN uv pip install --system "scikit-learn>=1.5,<1.6" && \ - python -c "import sklearn; print(f'scikit-learn version: {sklearn.__version__}')" - # Clone NeMo-Curator from fork and install in editable mode # This ensures all Ray workers have the same code with your local edits ARG CURATOR_REPO=https://github.com/avigyabb/Curator.git @@ -61,32 +55,9 @@ RUN uv pip install --system \ huggingface_hub \ transformers -# Pre-download CLIP model weights to avoid runtime downloads -# This makes job startup faster and more reliable -RUN python -c "\ -from huggingface_hub import snapshot_download; \ -import os; \ -model_dir = '/home/ray/model_weights/openai/clip-vit-large-patch14'; \ -os.makedirs(model_dir, exist_ok=True); \ -snapshot_download('openai/clip-vit-large-patch14', local_dir=model_dir)" - # Set environment variable for model directory ENV MODEL_DIR=/home/ray/model_weights -# Download and prepare the example dataset from HuggingFace -# Downloads MS COCO parquet, deduplicates URLs, and truncates to 100k rows -RUN mkdir -p /home/ray/data && \ - curl -L https://huggingface.co/datasets/ChristophSchuhmann/MS_COCO_2017_URL_TEXT/resolve/main/mscoco.parquet \ - -o /home/ray/data/mscoco.parquet && \ - python -c "\ -import pandas as pd; \ -df = pd.read_parquet('/home/ray/data/mscoco.parquet'); \ -deduped = df[~df['URL'].duplicated()]; \ -truncated = deduped[:100000]; \ -truncated.to_parquet('/home/ray/data/truncated_100k_mscoco.parquet'); \ -print(f'Created truncated dataset with {len(truncated)} rows')" && \ - rm /home/ray/data/mscoco.parquet - # Create output directories RUN mkdir -p /home/ray/data/webdataset \ /home/ray/data/results \ From 907eb658552760b0a97179ee526fc5eca9b8b104 Mon Sep 17 00:00:00 2001 From: Avi Basnet Date: Mon, 12 Jan 2026 23:55:28 +0000 Subject: [PATCH 14/14] working with ray data --- nemo_curator_semantic_dedup/helper.py | 461 ++++++++++++++---- .../image_dedup_example.py | 63 ++- nemo_curator_semantic_dedup/job.yaml | 25 +- 3 files changed, 429 insertions(+), 120 deletions(-) diff --git a/nemo_curator_semantic_dedup/helper.py b/nemo_curator_semantic_dedup/helper.py index c798d2c..78b0cbd 100644 --- a/nemo_curator_semantic_dedup/helper.py +++ b/nemo_curator_semantic_dedup/helper.py @@ -12,34 +12,50 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Helper functions for downloading and preparing image datasets. + +This module provides two approaches for converting parquet files (with URLs) to WebDataset format: + +1. 
`parquet_to_webdataset_ray()` - Distributed approach using Ray Data (recommended) + - Scales across all nodes in the cluster + - Uses Ray Data for parallel reading and processing + - Best for large datasets (millions of images) + +2. `download_webdataset()` - Single-node multiprocessing approach (legacy) + - Runs on a single machine + - Uses Python multiprocessing for parallelism + - Simpler but doesn't scale beyond one node +""" + from __future__ import annotations import asyncio import io import json -import math import os import tarfile -from functools import partial -from multiprocessing import Pool -from typing import TYPE_CHECKING +import uuid +from typing import TYPE_CHECKING, Any import aiohttp import pandas as pd -import pyarrow.dataset as pa_ds from loguru import logger from PIL import Image -from tqdm import tqdm if TYPE_CHECKING: - from nemo_curator.tasks import ImageObject - from nemo_curator.tasks.image import ImageBatch + pass # HTTP status codes HTTP_OK = 200 +# ============================================================================= +# Image Download and Validation Utilities +# ============================================================================= + async def fetch_image_bytes(session: aiohttp.ClientSession, url: str, retries: int = 3) -> bytes | None: + """Fetch image bytes from URL with retries.""" last_error = None for attempt in range(1, retries + 1): try: @@ -53,113 +69,343 @@ async def fetch_image_bytes(session: aiohttp.ClientSession, url: str, retries: i if attempt < retries: await asyncio.sleep(1) - # Only log final failure (not every retry) to reduce noise - # logger.debug(f"Failed: {url} ({last_error})") return None -async def process_batch(batch: pd.DataFrame, output_dir: str, batch_num: int) -> int: - """Process a batch of URLs and return the number of successfully downloaded images.""" - tar_filename = os.path.join(output_dir, f"{batch_num:05d}.tar") - - metadatas = [] - # Set timeout and connection limits for the session +def validate_and_convert_to_jpeg(image_bytes: bytes) -> bytes | None: + """ + Validate image and convert to JPEG format for DALI compatibility. + + Args: + image_bytes: Raw image bytes + + Returns: + JPEG bytes if valid, None if image is invalid/corrupted + """ + try: + img = Image.open(io.BytesIO(image_bytes)) + img.verify() # Verify it's a valid image + # Re-open after verify (verify consumes the file) + img = Image.open(io.BytesIO(image_bytes)) + + # Robust RGB conversion for ALL image modes (L, LA, P, PA, RGBA, CMYK, etc.) + # This ensures CLIP gets 3-channel images + if img.mode != "RGB": + # For palette images, convert to RGBA first to preserve transparency info + if img.mode == "P": + img = img.convert("RGBA") + # For any mode with alpha, composite onto white background + if img.mode in ("RGBA", "LA", "PA"): + background = Image.new("RGB", img.size, (255, 255, 255)) + # Use alpha channel as mask + if img.mode == "LA": + img = img.convert("RGBA") + background.paste(img, mask=img.split()[-1]) + img = background + else: + # Simple conversion for grayscale (L), CMYK, etc. 
+ img = img.convert("RGB") + + # Final safety check - ensure we have exactly 3 channels + if img.mode != "RGB": + return None + + # Skip images that are too small (CLIP needs at least 3x3 to avoid channel ambiguity) + if img.size[0] < 3 or img.size[1] < 3: + return None + + # Re-encode as JPEG to ensure DALI compatibility + jpeg_buffer = io.BytesIO() + img.save(jpeg_buffer, format="JPEG", quality=95) + return jpeg_buffer.getvalue() + except Exception: + return None + + +async def download_batch_images( + batch: pd.DataFrame, + url_col: str = "URL", + text_col: str = "TEXT", +) -> list[dict[str, Any]]: + """ + Download images for a batch of URLs asynchronously. + + Args: + batch: DataFrame with URL and TEXT columns + url_col: Name of URL column + text_col: Name of text/caption column + + Returns: + List of dicts with 'url', 'caption', 'jpeg_bytes' (None if failed) + """ timeout = aiohttp.ClientTimeout(total=15) connector = aiohttp.TCPConnector(limit=256, limit_per_host=16) - + + results = [] async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session: tasks = [] - for i, (_, row) in enumerate(batch.iterrows()): - caption = row["TEXT"] - url = row["URL"] + metadata = [] + + for _, row in batch.iterrows(): + url = row[url_col] + caption = row[text_col] + metadata.append({"url": url, "caption": caption}) + tasks.append(fetch_image_bytes(session, url, retries=3)) + + raw_results = await asyncio.gather(*tasks, return_exceptions=True) + + for meta, raw_bytes in zip(metadata, raw_results): + jpeg_bytes = None + if isinstance(raw_bytes, bytes) and raw_bytes: + jpeg_bytes = validate_and_convert_to_jpeg(raw_bytes) + + results.append({ + "url": meta["url"], + "caption": meta["caption"], + "jpeg_bytes": jpeg_bytes, + }) + + return results - key = f"{batch_num:05d}{i:04d}" - meta = {"url": url, "caption": caption, "key": key} +def write_tar_shard( + images: list[dict[str, Any]], + output_path: str, + shard_id: str, +) -> dict[str, int]: + """ + Write a tar shard with downloaded images. 
+ + Args: + images: List of dicts with 'url', 'caption', 'jpeg_bytes' + output_path: Path to write tar file + shard_id: Unique identifier for this shard + + Returns: + Dict with 'success_count' and 'total_count' + """ + success_count = 0 + metadatas = [] + + with tarfile.open(output_path, "w") as tar: + for i, img_data in enumerate(images): + if img_data["jpeg_bytes"] is None: + continue + + key = f"{shard_id}_{i:06d}" + jpeg_bytes = img_data["jpeg_bytes"] + + # Add image bytes + jpg_info = tarfile.TarInfo(name=f"{key}.jpg") + jpg_info.size = len(jpeg_bytes) + tar.addfile(jpg_info, fileobj=io.BytesIO(jpeg_bytes)) + + # Add caption text + caption_bytes = str(img_data["caption"]).encode("utf-8") + txt_info = tarfile.TarInfo(name=f"{key}.txt") + txt_info.size = len(caption_bytes) + tar.addfile(txt_info, fileobj=io.BytesIO(caption_bytes)) + + # Add JSON metadata + meta = {"url": img_data["url"], "caption": img_data["caption"], "key": key} + json_bytes = json.dumps(meta).encode("utf-8") + json_info = tarfile.TarInfo(name=f"{key}.json") + json_info.size = len(json_bytes) + tar.addfile(json_info, fileobj=io.BytesIO(json_bytes)) + metadatas.append(meta) + success_count += 1 + + # Write parquet sidecar + if metadatas: + parquet_path = output_path.replace(".tar", ".parquet") + pd.DataFrame(metadatas).to_parquet(parquet_path) + + return {"success_count": success_count, "total_count": len(images)} - tasks.append(fetch_image_bytes(session, url, retries=3)) - results = await asyncio.gather(*tasks, return_exceptions=True) +# ============================================================================= +# Ray Data Approach (Distributed) +# ============================================================================= - success_count = 0 - with tarfile.open(tar_filename, "w") as tar: - for i, result in enumerate(results): - # Only proceed for successful downloads (bytes) - if isinstance(result, bytes) and result: - # Validate and convert to JPEG (DALI doesn't support WebP/other formats) - try: - img = Image.open(io.BytesIO(result)) - img.verify() # Verify it's a valid image - # Re-open after verify (verify consumes the file) - img = Image.open(io.BytesIO(result)) - - # Robust RGB conversion for ALL image modes (L, LA, P, PA, RGBA, CMYK, etc.) - # This ensures CLIP gets 3-channel images - if img.mode != "RGB": - # For palette images, convert to RGBA first to preserve transparency info - if img.mode == "P": - img = img.convert("RGBA") - # For any mode with alpha, composite onto white background - if img.mode in ("RGBA", "LA", "PA"): - background = Image.new("RGB", img.size, (255, 255, 255)) - # Use alpha channel as mask - if img.mode == "LA": - img = img.convert("RGBA") - background.paste(img, mask=img.split()[-1]) - img = background - else: - # Simple conversion for grayscale (L), CMYK, etc. 
- img = img.convert("RGB") - - # Final safety check - ensure we have exactly 3 channels - if img.mode != "RGB": - continue # Skip if conversion somehow failed - - # Skip images that are too small (CLIP needs at least 3x3 to avoid channel ambiguity) - if img.size[0] < 3 or img.size[1] < 3: - continue - - # Re-encode as JPEG to ensure DALI compatibility - jpeg_buffer = io.BytesIO() - img.save(jpeg_buffer, format="JPEG", quality=95) - jpeg_bytes = jpeg_buffer.getvalue() - except Exception: - # Skip invalid/corrupted images (e.g., HTML error pages) - continue - - success_count += 1 - key = f"{batch_num:05d}{i:04d}" - - # Add image bytes (now guaranteed to be JPEG) - jpg_info = tarfile.TarInfo(name=f"{key}.jpg") - jpg_info.size = len(jpeg_bytes) - tar.addfile(jpg_info, fileobj=io.BytesIO(jpeg_bytes)) - - # Add caption text - caption_bytes = str(metadatas[i]["caption"]).encode("utf-8") - txt_info = tarfile.TarInfo(name=f"{key}.txt") - txt_info.size = len(caption_bytes) - tar.addfile(txt_info, fileobj=io.BytesIO(caption_bytes)) - - # Add JSON metadata - json_bytes = json.dumps(metadatas[i]).encode("utf-8") - json_info = tarfile.TarInfo(name=f"{key}.json") - json_info.size = len(json_bytes) - tar.addfile(json_info, fileobj=io.BytesIO(json_bytes)) - - # Write parquet - meta_df = pd.DataFrame(metadatas) - parquet_path = os.path.join(output_dir, f"{batch_num:05d}.parquet") - meta_df.to_parquet(parquet_path) - - return success_count +def process_batch_ray(batch: dict[str, Any], output_dir: str) -> dict[str, Any]: + """ + Ray Data map function to process a batch of URLs. + + This function is called by Ray Data's map_batches() and runs distributed + across all nodes in the cluster. + + Args: + batch: Dict with 'URL' and 'TEXT' arrays (Ray Data batch format) + output_dir: Directory to write tar files + + Returns: + Dict with statistics about processing + """ + import ray + + # Convert Ray Data batch format to DataFrame + df = pd.DataFrame({ + "URL": batch["URL"], + "TEXT": batch["TEXT"], + }) + + # Generate unique shard ID using node ID + UUID to avoid collisions + node_id = ray.get_runtime_context().get_node_id()[:8] + shard_id = f"{node_id}_{uuid.uuid4().hex[:8]}" + tar_path = os.path.join(output_dir, f"{shard_id}.tar") + + # Download images asynchronously + images = asyncio.run(download_batch_images(df)) + + # Write tar shard + stats = write_tar_shard(images, tar_path, shard_id) + + # Return statistics as a single-row batch + return { + "shard_id": [shard_id], + "success_count": [stats["success_count"]], + "total_count": [stats["total_count"]], + } + + +def parquet_to_webdataset_ray( + parquet_path: str, + output_dir: str, + entries_per_tar: int = 1000, + max_entries: int | None = None, + concurrency: int | None = None, +) -> dict[str, int]: + """ + Convert parquet file with URLs to WebDataset tar files using Ray Data. + + This distributes the download work across all nodes in the Ray cluster, + providing much better scalability than single-node processing. 
+ + Args: + parquet_path: Path to parquet file with URL and TEXT columns + output_dir: Directory to save tar files + entries_per_tar: Number of entries per tar shard + max_entries: Maximum entries to process (for testing) + concurrency: Number of concurrent download tasks (defaults to num CPUs) + + Returns: + Dict with 'total_success' and 'total_attempted' counts + """ + import ray.data + + os.makedirs(output_dir, exist_ok=True) + + print(f"Reading parquet from: {parquet_path}") + + # Read parquet with Ray Data - this distributes reading across the cluster + ds = ray.data.read_parquet(parquet_path) + + # Get schema and normalize column names + schema = ds.schema() + col_names = schema.names if hasattr(schema, 'names') else [f.name for f in schema] + col_map = {} + + # Handle case-insensitive column matching + for col in col_names: + if col.lower() == "url": + col_map[col] = "URL" + elif col.lower() in ("text", "caption"): + col_map[col] = "TEXT" + + if col_map: + # Rename columns to standard names + def rename_cols(batch): + result = {} + for old_name, new_name in col_map.items(): + if old_name in batch: + result[new_name] = batch[old_name] + # Keep any columns that weren't renamed + for col in batch: + if col not in col_map and col not in result: + result[col] = batch[col] + return result + + ds = ds.map_batches(rename_cols, batch_format="pandas") + + # Select only the columns we need + ds = ds.select_columns(["URL", "TEXT"]) + + # Apply max_entries limit + if max_entries is not None: + print(f"Limiting to {max_entries} entries for testing") + ds = ds.limit(max_entries) + + # Count total rows for progress reporting + total_rows = ds.count() + print(f"Total entries to process: {total_rows}") + + # Process batches in parallel across the cluster + # Each batch becomes one tar shard + from functools import partial + + process_fn = partial(process_batch_ray, output_dir=output_dir) + + # Determine concurrency based on cluster resources + if concurrency is None: + import ray + cluster_resources = ray.cluster_resources() + concurrency = max(1, int(cluster_resources.get("CPU", 4) // 2)) + + print(f"Processing with concurrency={concurrency}, entries_per_tar={entries_per_tar}") + + # map_batches distributes work across all nodes + results_ds = ds.map_batches( + process_fn, + batch_size=entries_per_tar, + batch_format="numpy", + concurrency=concurrency, + ) + + # Materialize results and aggregate statistics + results = results_ds.take_all() + + total_success = sum(r["success_count"] for r in results) + total_attempted = sum(r["total_count"] for r in results) + num_shards = len(results) + + # Report results + success_rate = (total_success / total_attempted * 100) if total_attempted > 0 else 0 + print(f"\n✓ Download complete: {total_success} images in {num_shards} shards ({success_rate:.1f}% success rate)") + print(f" Note: LAION datasets have high link rot - many URLs no longer work.") + + if total_success == 0: + print("\n⚠️ WARNING: No images were downloaded successfully!") + print(" This is likely due to LAION link rot. 
Try increasing MAX_ENTRIES.") + + return { + "total_success": total_success, + "total_attempted": total_attempted, + "num_shards": num_shards, + } + + +# ============================================================================= +# Single-Node Multiprocessing Approach (Legacy) +# ============================================================================= + +async def process_batch_single_node(batch: pd.DataFrame, output_dir: str, batch_num: int) -> int: + """Process a batch of URLs and return the number of successfully downloaded images.""" + tar_filename = os.path.join(output_dir, f"{batch_num:05d}.tar") + shard_id = f"{batch_num:05d}" + + # Download images + images = await download_batch_images(batch) + + # Write tar shard + stats = write_tar_shard(images, tar_filename, shard_id) + return stats["success_count"] def process_parquet_chunk(chunk: tuple[int, pd.DataFrame], output_dir: str) -> int: """Process a chunk and return the number of successfully downloaded images.""" batch_num, batch = chunk - return asyncio.run(process_batch(batch, output_dir, batch_num)) + return asyncio.run(process_batch_single_node(batch, output_dir, batch_num)) def download_webdataset( @@ -169,11 +415,12 @@ def download_webdataset( num_processes: int = 2, max_entries: int | None = None, ) -> None: - """Stream a large Parquet of URLs/TEXT into WebDataset tar shards. - - Uses pyarrow dataset streaming to avoid loading the entire Parquet into memory, - so it can scale to 100M+ rows (e.g., LAION subsets). - + """ + Single-node approach: Stream parquet into WebDataset tar shards using multiprocessing. + + This is the legacy approach that runs on a single machine. For distributed + processing across a Ray cluster, use `parquet_to_webdataset_ray()` instead. + Args: parquet_path: Path to the parquet file containing URLs and text output_dir: Directory to save the webdataset tar files @@ -181,9 +428,16 @@ def download_webdataset( num_processes: Number of parallel download processes max_entries: Maximum number of entries to process (for testing). None = no limit. 
""" + import math + from functools import partial + from multiprocessing import Pool + + import pyarrow.dataset as pa_ds + from tqdm import tqdm + os.makedirs(output_dir, exist_ok=True) - # Stream the Parquet in batches; resolve URL/TEXT in a case-insensitive way and map TEXT->caption if needed + # Stream the Parquet in batches dataset = pa_ds.dataset(parquet_path, format="parquet") schema = dataset.schema available = set(schema.names) @@ -268,4 +522,3 @@ def batch_iter(): os.rmdir(tmp_dir) except OSError as e: logger.debug(f"Failed to remove tmp dir {tmp_dir}: {e}") - \ No newline at end of file diff --git a/nemo_curator_semantic_dedup/image_dedup_example.py b/nemo_curator_semantic_dedup/image_dedup_example.py index d8022be..fdcd198 100644 --- a/nemo_curator_semantic_dedup/image_dedup_example.py +++ b/nemo_curator_semantic_dedup/image_dedup_example.py @@ -17,7 +17,7 @@ import time import ray -from helper import download_webdataset +from helper import download_webdataset, parquet_to_webdataset_ray from nemo_curator.backends.experimental.ray_actor_pool import RayActorPoolExecutor from nemo_curator.backends.experimental.ray_data import RayDataExecutor @@ -132,27 +132,42 @@ def main(args: argparse.Namespace) -> None: print(f"Model directory: {args.model_dir}") print(f"Tar files per partition: {args.tar_files_per_partition}") print(f"Task batch size: {args.batch_size}") + print(f"Use Ray Data for parquet->tar: {args.use_ray_data}") + if args.max_entries: + print(f"Max entries (testing): {args.max_entries}") print("\n" + "=" * 50 + "\n") # Step 1: Download and prepare webdataset from parquet file if not args.skip_download: - print("Step 1: Downloading webdataset from parquet file...") + print("Step 1: Converting parquet to WebDataset tar files...") + print(f" Approach: {'Ray Data (distributed)' if args.use_ray_data else 'Single-node multiprocessing'}") download_start = time.time() # Create output directory if it doesn't exist os.makedirs(args.input_wds_dataset_dir, exist_ok=True) - # Download webdataset using helper function - download_webdataset( - parquet_path=args.input_parquet, - output_dir=args.input_wds_dataset_dir, - num_processes=args.download_processes, - entries_per_tar=args.entries_per_tar, - max_entries=args.max_entries, - ) + if args.use_ray_data: + # Use Ray Data for distributed processing across the cluster + stats = parquet_to_webdataset_ray( + parquet_path=args.input_parquet, + output_dir=args.input_wds_dataset_dir, + entries_per_tar=args.entries_per_tar, + max_entries=args.max_entries, + concurrency=args.download_concurrency, + ) + print(f" Created {stats['num_shards']} tar shards with {stats['total_success']} images") + else: + # Legacy single-node approach + download_webdataset( + parquet_path=args.input_parquet, + output_dir=args.input_wds_dataset_dir, + num_processes=args.download_processes, + entries_per_tar=args.entries_per_tar, + max_entries=args.max_entries, + ) download_time = time.time() - download_start - print(f"✓ Dataset download completed in {download_time:.2f} seconds") + print(f"✓ Dataset conversion completed in {download_time:.2f} seconds") print(f"✓ Webdataset saved to: {args.input_wds_dataset_dir}") print("\n" + "=" * 50 + "\n") else: @@ -307,6 +322,24 @@ def get_env_float(env_var: str, arg_value: float, default: float) -> float: default=None, help="Maximum entries to download for testing (env: MAX_ENTRIES). None = no limit." 
) + parser.add_argument( + "--use-ray-data", + action="store_true", + default=None, + help="Use Ray Data for distributed parquet->tar conversion (env: USE_RAY_DATA, default: true)" + ) + parser.add_argument( + "--no-ray-data", + action="store_true", + default=False, + help="Disable Ray Data, use single-node multiprocessing instead" + ) + parser.add_argument( + "--download-concurrency", + type=int, + default=None, + help="Number of concurrent download tasks for Ray Data (env: DOWNLOAD_CONCURRENCY)" + ) # Image reader arguments parser.add_argument( @@ -354,6 +387,12 @@ def get_env_float(env_var: str, arg_value: float, default: float) -> float: cli_args = parser.parse_args() # Resolve arguments from environment variables or command-line args + # Determine if Ray Data should be used (default: True unless --no-ray-data is set) + use_ray_data_env = os.environ.get("USE_RAY_DATA", "true").lower() in ("true", "1", "yes") + use_ray_data = use_ray_data_env if not cli_args.no_ray_data else False + if cli_args.use_ray_data: + use_ray_data = True + args = argparse.Namespace( input_parquet=get_env_or_arg("INPUT_PARQUET", cli_args.input_parquet), input_wds_dataset_dir=get_env_or_arg("INPUT_WDS_DIR", cli_args.input_wds_dataset_dir), @@ -365,6 +404,8 @@ def get_env_float(env_var: str, arg_value: float, default: float) -> float: entries_per_tar=get_env_int("ENTRIES_PER_TAR", cli_args.entries_per_tar, 1000), max_entries=int(get_env_or_arg("MAX_ENTRIES", cli_args.max_entries)) if get_env_or_arg("MAX_ENTRIES", cli_args.max_entries) else None, skip_download=get_env_bool("SKIP_DOWNLOAD", cli_args.skip_download, False), + use_ray_data=use_ray_data, + download_concurrency=get_env_int("DOWNLOAD_CONCURRENCY", cli_args.download_concurrency, None) if get_env_or_arg("DOWNLOAD_CONCURRENCY", cli_args.download_concurrency) else None, tar_files_per_partition=get_env_int("TAR_FILES_PER_PARTITION", cli_args.tar_files_per_partition, 1), batch_size=get_env_int("BATCH_SIZE", cli_args.batch_size, 100), embedding_batch_size=get_env_int("EMBEDDING_BATCH_SIZE", cli_args.embedding_batch_size, 32), diff --git a/nemo_curator_semantic_dedup/job.yaml b/nemo_curator_semantic_dedup/job.yaml index 7467363..af08da5 100644 --- a/nemo_curator_semantic_dedup/job.yaml +++ b/nemo_curator_semantic_dedup/job.yaml @@ -1,5 +1,13 @@ # NeMo Curator Image Semantic Deduplication Job # View the docs: https://docs.anyscale.com/reference/job-api#jobconfig +# +# This job runs a two-phase pipeline: +# Phase 1: Convert parquet (URLs) → WebDataset tar files (using Ray Data, distributed) +# Phase 2: Run NeMo Curator image deduplication (CLIP embeddings → semantic dedup) +# +# The parquet → tar conversion uses Ray Data to distribute image downloads +# across all nodes in the cluster, providing much better scalability than +# single-node processing. 
name: nemo-curator-image-dedup @@ -50,16 +58,23 @@ env_vars: BATCH_SIZE: "4" EMBEDDING_BATCH_SIZE: "8" TAR_FILES_PER_PARTITION: "1" - DOWNLOAD_PROCESSES: "8" ENTRIES_PER_TAR: "500" - - SKIP_DOWNLOAD: "false" # Always keep false + + # Ray Data settings for parquet -> tar conversion + # Uses distributed processing across all nodes in the cluster + USE_RAY_DATA: "true" # Set to "false" for single-node multiprocessing + # DOWNLOAD_CONCURRENCY: "" # Auto-detected from cluster resources if not set + + SKIP_DOWNLOAD: "false" # Set to "true" to skip parquet->tar and use existing tars # Don't hide GPUs from tasks that request num_gpus=0 (needed for DALI) RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO: "0" + + # Disable Python output buffering for real-time logs + PYTHONUNBUFFERED: "1" -# The entrypoint script -entrypoint: python examples/nemo_curator_semantic_dedup/image_dedup_example.py +# The entrypoint script (-u for unbuffered output) +entrypoint: python -u examples/nemo_curator_semantic_dedup/image_dedup_example.py # Don't retry on failure - easier to debug max_retries: 0
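A quick way to exercise the Ray Data conversion path on its own, before launching the full dedup job, is sketched below. This is illustrative only and not part of the patch: the parquet path, output directory, row cap, and concurrency are placeholder assumptions, and the sketch relies only on the `parquet_to_webdataset_ray()` signature and return value shown in `helper.py` above.

```python
# Minimal smoke test for the Ray Data parquet -> WebDataset conversion (illustrative only).
# All paths and limits below are placeholders, not values taken from the patch.
import ray

from helper import parquet_to_webdataset_ray

ray.init()  # or ray.init(address="auto") to attach to an existing cluster

stats = parquet_to_webdataset_ray(
    parquet_path="/path/to/urls.parquet",     # must contain URL and TEXT (or text/caption) columns
    output_dir="/tmp/webdataset_smoke_test",  # one <node>_<uuid>.tar + .parquet sidecar per shard
    entries_per_tar=500,                      # each Ray Data batch of this size becomes one tar shard
    max_entries=2_000,                        # small cap so the smoke test finishes quickly
    concurrency=4,                            # omit to auto-derive from cluster CPU count
)

print(stats)  # {"total_success": ..., "total_attempted": ..., "num_shards": ...}
```

Because each shard is written from inside `map_batches()`, tar files land on whichever node ran the task; on a multi-node cluster, `output_dir` should generally point at shared storage so the downstream dedup stage can read every shard.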