diff --git a/.prettierignore b/.prettierignore index e66246bae..92c713b9d 100644 --- a/.prettierignore +++ b/.prettierignore @@ -2,6 +2,6 @@ .github dist docs -scripts +scripts/**/*.py types *.md diff --git a/.prettierrc b/.prettierrc index a9e0acb4f..39d89ff84 100644 --- a/.prettierrc +++ b/.prettierrc @@ -14,6 +14,13 @@ "tabWidth": 2, "printWidth": 10000000 } + }, + { + "files": ["scripts/**/*.{js,mjs,cjs}"], + "options": { + "tabWidth": 2, + "printWidth": 120 + } } ] } diff --git a/package-lock.json b/package-lock.json index 81c2b6aeb..8c03c290b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,15 +18,13 @@ "@types/jest": "^30.0.0", "@types/node": "^24.0.11", "@webgpu/types": "^0.1.64", + "esbuild": "^0.27.2", "jest": "^30.0.4", "jest-environment-node": "^30.0.4", "jsdoc-to-markdown": "^9.1.1", "prettier": "3.4.2", "typescript": "^5.8.3", - "wavefile": "11.0.0", - "webpack": "^5.99.9", - "webpack-cli": "^6.0.1", - "webpack-dev-server": "^5.2.2" + "wavefile": "11.0.0" } }, "node_modules/@babel/code-frame": { @@ -595,16 +593,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@discoveryjs/json-ext": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.6.3.tgz", - "integrity": "sha512-4B4OijXeVNOPZlYA2oEwWOTkzyltLao+xbotHQeqN++Rv27Y6s818+n2Qkp8q+Fxhn0t/5lA5X1Mxktud8eayQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.17.0" - } - }, "node_modules/@emnapi/core": { "version": "1.7.1", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.7.1.tgz", @@ -639,6 +627,448 @@ "tslib": "^2.4.0" } }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.2.tgz", + "integrity": "sha512-GZMB+a0mOMZs4MpDbj8RJp4cw+w1WV5NYD6xzgvzUJ5Ek2jerwfO2eADyI6ExDSUED+1X8aMbegahsJi+8mgpw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.2.tgz", + "integrity": "sha512-DVNI8jlPa7Ujbr1yjU2PfUSRtAUZPG9I1RwW4F4xFB1Imiu2on0ADiI/c3td+KmDtVKNbi+nffGDQMfcIMkwIA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.2.tgz", + "integrity": "sha512-pvz8ZZ7ot/RBphf8fv60ljmaoydPU12VuXHImtAs0XhLLw+EXBi2BLe3OYSBslR4rryHvweW5gmkKFwTiFy6KA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.2.tgz", + "integrity": "sha512-z8Ank4Byh4TJJOh4wpz8g2vDy75zFL0TlZlkUkEwYXuPSgX8yzep596n6mT7905kA9uHZsf/o2OJZubl2l3M7A==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.2.tgz", + "integrity": "sha512-davCD2Zc80nzDVRwXTcQP/28fiJbcOwvdolL0sOiOsbwBa72kegmVU0Wrh1MYrbuCL98Omp5dVhQFWRKR2ZAlg==", + "cpu": [ + 
"arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.2.tgz", + "integrity": "sha512-ZxtijOmlQCBWGwbVmwOF/UCzuGIbUkqB1faQRf5akQmxRJ1ujusWsb3CVfk/9iZKr2L5SMU5wPBi1UWbvL+VQA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.2.tgz", + "integrity": "sha512-lS/9CN+rgqQ9czogxlMcBMGd+l8Q3Nj1MFQwBZJyoEKI50XGxwuzznYdwcav6lpOGv5BqaZXqvBSiB/kJ5op+g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.2.tgz", + "integrity": "sha512-tAfqtNYb4YgPnJlEFu4c212HYjQWSO/w/h/lQaBK7RbwGIkBOuNKQI9tqWzx7Wtp7bTPaGC6MJvWI608P3wXYA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.2.tgz", + "integrity": "sha512-vWfq4GaIMP9AIe4yj1ZUW18RDhx6EPQKjwe7n8BbIecFtCQG4CfHGaHuh7fdfq+y3LIA2vGS/o9ZBGVxIDi9hw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.2.tgz", + "integrity": "sha512-hYxN8pr66NsCCiRFkHUAsxylNOcAQaxSSkHMMjcpx0si13t1LHFphxJZUiGwojB1a/Hd5OiPIqDdXONia6bhTw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.2.tgz", + "integrity": "sha512-MJt5BRRSScPDwG2hLelYhAAKh9imjHK5+NE/tvnRLbIqUWa+0E9N4WNMjmp/kXXPHZGqPLxggwVhz7QP8CTR8w==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.2.tgz", + "integrity": "sha512-lugyF1atnAT463aO6KPshVCJK5NgRnU4yb3FUumyVz+cGvZbontBgzeGFO1nF+dPueHD367a2ZXe1NtUkAjOtg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.2.tgz", + "integrity": "sha512-nlP2I6ArEBewvJ2gjrrkESEZkB5mIoaTswuqNFRv/WYd+ATtUpe9Y09RnJvgvdag7he0OWgEZWhviS1OTOKixw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.27.2", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.2.tgz", + "integrity": "sha512-C92gnpey7tUQONqg1n6dKVbx3vphKtTHJaNG2Ok9lGwbZil6DrfyecMsp9CrmXGQJmZ7iiVXvvZH6Ml5hL6XdQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.2.tgz", + "integrity": "sha512-B5BOmojNtUyN8AXlK0QJyvjEZkWwy/FKvakkTDCziX95AowLZKR6aCDhG7LeF7uMCXEJqwa8Bejz5LTPYm8AvA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.2.tgz", + "integrity": "sha512-p4bm9+wsPwup5Z8f4EpfN63qNagQ47Ua2znaqGH6bqLlmJ4bx97Y9JdqxgGZ6Y8xVTixUnEkoKSHcpRlDnNr5w==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.2.tgz", + "integrity": "sha512-uwp2Tip5aPmH+NRUwTcfLb+W32WXjpFejTIOWZFw/v7/KnpCDKG66u4DLcurQpiYTiYwQ9B7KOeMJvLCu/OvbA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.2.tgz", + "integrity": "sha512-Kj6DiBlwXrPsCRDeRvGAUb/LNrBASrfqAIok+xB0LxK8CHqxZ037viF13ugfsIpePH93mX7xfJp97cyDuTZ3cw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.2.tgz", + "integrity": "sha512-HwGDZ0VLVBY3Y+Nw0JexZy9o/nUAWq9MlV7cahpaXKW6TOzfVno3y3/M8Ga8u8Yr7GldLOov27xiCnqRZf0tCA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.2.tgz", + "integrity": "sha512-DNIHH2BPQ5551A7oSHD0CKbwIA/Ox7+78/AWkbS5QoRzaqlev2uFayfSxq68EkonB+IKjiuxBFoV8ESJy8bOHA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.2.tgz", + "integrity": "sha512-/it7w9Nb7+0KFIzjalNJVR5bOzA9Vay+yIPLVHfIQYG/j+j9VTH84aNB8ExGKPU4AzfaEvN9/V4HV+F+vo8OEg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.2.tgz", + "integrity": "sha512-LRBbCmiU51IXfeXk59csuX/aSaToeG7w48nMwA6049Y4J4+VbWALAuXcs+qcD04rHDuSCSRKdmY63sruDS5qag==", + "cpu": [ + 
"arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.2.tgz", + "integrity": "sha512-kMtx1yqJHTmqaqHPAzKCAkDaKsffmXkPHThSfRwZGyuqyIeBvf08KSsYXl+abf5HDAPMJIPnbBfXvP2ZC2TfHg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.2.tgz", + "integrity": "sha512-Yaf78O/B3Kkh+nKABUF++bvJv5Ijoy9AN1ww904rOXZFLWVc5OLOfL56W+C8F9xn5JQZa3UX6m+IktJnIb1Jjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.2.tgz", + "integrity": "sha512-Iuws0kxo4yusk7sw70Xa2E2imZU5HoixzxfGCdxwBdhiDgt9vX9VUCBhqcwY7/uh//78A1hMkkROMJq9l27oLQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.2.tgz", + "integrity": "sha512-sRdU18mcKf7F+YgheI/zGf5alZatMUTKj/jNS6l744f9u3WFu4v7twcUI9vu4mknF4Y9aDlblIie0IM+5xxaqQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@huggingface/jinja": { "version": "0.5.1", "license": "MIT", @@ -1103,17 +1533,6 @@ "node": ">=6.0.0" } }, - "node_modules/@jridgewell/source-map": { - "version": "0.3.10", - "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.10.tgz", - "integrity": "sha512-0pPkgz9dY+bijgistcTTJ5mR+ocqRXLuhXHYdzoMmmoJ2C9S46RCm2GMUbatPEUK9Yjy26IrAy8D/M00lLkv+Q==", - "dev": true, - "license": "MIT", - "dependencies": { - "@jridgewell/gen-mapping": "^0.3.5", - "@jridgewell/trace-mapping": "^0.3.25" - } - }, "node_modules/@jridgewell/sourcemap-codec": { "version": "1.5.4", "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.4.tgz", @@ -1143,62 +1562,6 @@ "node": ">=v12.0.0" } }, - "node_modules/@jsonjoy.com/base64": { - "version": "1.1.2", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, - "node_modules/@jsonjoy.com/json-pack": { - "version": "1.1.0", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@jsonjoy.com/base64": "^1.1.1", - "@jsonjoy.com/util": "^1.1.2", - "hyperdyperid": "^1.2.0", - "thingies": "^1.20.0" - }, - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, - "node_modules/@jsonjoy.com/util": { - "version": "1.5.0", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, - 
"node_modules/@leichtgewicht/ip-codec": { - "version": "2.0.5", - "dev": true, - "license": "MIT" - }, "node_modules/@napi-rs/wasm-runtime": { "version": "0.2.12", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", @@ -1353,139 +1716,46 @@ "node_modules/@types/babel__core": { "version": "7.20.5", "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", - "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.20.7", - "@babel/types": "^7.20.7", - "@types/babel__generator": "*", - "@types/babel__template": "*", - "@types/babel__traverse": "*" - } - }, - "node_modules/@types/babel__generator": { - "version": "7.27.0", - "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", - "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.0.0" - } - }, - "node_modules/@types/babel__template": { - "version": "7.4.4", - "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", - "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.1.0", - "@babel/types": "^7.0.0" - } - }, - "node_modules/@types/babel__traverse": { - "version": "7.28.0", - "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", - "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.28.2" - } - }, - "node_modules/@types/body-parser": { - "version": "1.19.2", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/connect": "*", - "@types/node": "*" - } - }, - "node_modules/@types/bonjour": { - "version": "3.5.13", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, - "node_modules/@types/connect": { - "version": "3.4.35", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, - "node_modules/@types/connect-history-api-fallback": { - "version": "1.5.4", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/express-serve-static-core": "*", - "@types/node": "*" - } - }, - "node_modules/@types/eslint": { - "version": "9.6.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/estree": "*", - "@types/json-schema": "*" - } - }, - "node_modules/@types/eslint-scope": { - "version": "3.7.7", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/eslint": "*", - "@types/estree": "*" - } - }, - "node_modules/@types/estree": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", - "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/express": { - "version": "4.17.21", + "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", "dev": true, "license": "MIT", "dependencies": { - "@types/body-parser": "*", - "@types/express-serve-static-core": "^4.17.33", - "@types/qs": "*", - "@types/serve-static": "*" + "@babel/parser": 
"^7.20.7", + "@babel/types": "^7.20.7", + "@types/babel__generator": "*", + "@types/babel__template": "*", + "@types/babel__traverse": "*" } }, - "node_modules/@types/express-serve-static-core": { - "version": "4.17.33", + "node_modules/@types/babel__generator": { + "version": "7.27.0", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", + "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", "dev": true, "license": "MIT", "dependencies": { - "@types/node": "*", - "@types/qs": "*", - "@types/range-parser": "*" + "@babel/types": "^7.0.0" } }, - "node_modules/@types/http-errors": { - "version": "2.0.4", + "node_modules/@types/babel__template": { + "version": "7.4.4", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", + "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } }, - "node_modules/@types/http-proxy": { - "version": "1.17.10", + "node_modules/@types/babel__traverse": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", + "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", "dev": true, "license": "MIT", "dependencies": { - "@types/node": "*" + "@babel/types": "^7.28.2" } }, "node_modules/@types/istanbul-lib-coverage": { @@ -1526,13 +1796,6 @@ "pretty-format": "^30.0.0" } }, - "node_modules/@types/json-schema": { - "version": "7.0.15", - "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", - "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/linkify-it": { "version": "5.0.0", "dev": true, @@ -1552,11 +1815,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@types/mime": { - "version": "1.3.5", - "dev": true, - "license": "MIT" - }, "node_modules/@types/node": { "version": "24.0.14", "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.14.tgz", @@ -1566,64 +1824,6 @@ "undici-types": "~7.8.0" } }, - "node_modules/@types/node-forge": { - "version": "1.3.11", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, - "node_modules/@types/qs": { - "version": "6.9.7", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/range-parser": { - "version": "1.2.4", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/retry": { - "version": "0.12.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/send": { - "version": "0.17.4", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/mime": "^1", - "@types/node": "*" - } - }, - "node_modules/@types/serve-index": { - "version": "1.9.4", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/express": "*" - } - }, - "node_modules/@types/serve-static": { - "version": "1.15.7", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/http-errors": "*", - "@types/node": "*", - "@types/send": "*" - } - }, - "node_modules/@types/sockjs": { - "version": "0.3.36", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, "node_modules/@types/stack-utils": { "version": "2.0.3", "resolved": 
"https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz", @@ -1631,14 +1831,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@types/ws": { - "version": "8.5.13", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, "node_modules/@types/yargs": { "version": "17.0.33", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.33.tgz", @@ -1854,316 +2046,90 @@ ], "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-x64-musl": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-musl/-/resolver-binding-linux-x64-musl-1.11.1.tgz", - "integrity": "sha512-rV0YSoyhK2nZ4vEswT/QwqzqQXw5I6CjoaYMOX0TqBlWhojUf8P94mvI7nuJTeaCkkds3QE4+zS8Ko+GdXuZtA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-wasm32-wasi": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-wasm32-wasi/-/resolver-binding-wasm32-wasi-1.11.1.tgz", - "integrity": "sha512-5u4RkfxJm+Ng7IWgkzi3qrFOvLvQYnPBmjmZQ8+szTK/b31fQCnleNl1GgEt7nIsZRIf5PLhPwT0WM+q45x/UQ==", - "cpu": [ - "wasm32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "@napi-rs/wasm-runtime": "^0.2.11" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@unrs/resolver-binding-win32-arm64-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-arm64-msvc/-/resolver-binding-win32-arm64-msvc-1.11.1.tgz", - "integrity": "sha512-nRcz5Il4ln0kMhfL8S3hLkxI85BXs3o8EYoattsJNdsX4YUU89iOkVn7g0VHSRxFuVMdM4Q1jEpIId1Ihim/Uw==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@unrs/resolver-binding-win32-ia32-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-ia32-msvc/-/resolver-binding-win32-ia32-msvc-1.11.1.tgz", - "integrity": "sha512-DCEI6t5i1NmAZp6pFonpD5m7i6aFrpofcp4LA2i8IIq60Jyo28hamKBxNrZcyOwVOZkgsRp9O2sXWBWP8MnvIQ==", - "cpu": [ - "ia32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@unrs/resolver-binding-win32-x64-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-x64-msvc/-/resolver-binding-win32-x64-msvc-1.11.1.tgz", - "integrity": "sha512-lrW200hZdbfRtztbygyaq/6jP6AKE8qQN2KvPcJ+x7wiD038YtnYtZ82IMNJ69GJibV7bwL3y9FgK+5w/pYt6g==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@webassemblyjs/ast": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/helper-numbers": "1.13.2", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2" - } - }, - "node_modules/@webassemblyjs/floating-point-hex-parser": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-api-error": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-buffer": { - "version": "1.14.1", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-numbers": { - "version": "1.13.2", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/floating-point-hex-parser": "1.13.2", - "@webassemblyjs/helper-api-error": "1.13.2", - "@xtuc/long": 
"4.2.2" - } - }, - "node_modules/@webassemblyjs/helper-wasm-bytecode": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-wasm-section": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-buffer": "1.14.1", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/wasm-gen": "1.14.1" - } - }, - "node_modules/@webassemblyjs/ieee754": { - "version": "1.13.2", - "dev": true, - "license": "MIT", - "dependencies": { - "@xtuc/ieee754": "^1.2.0" - } - }, - "node_modules/@webassemblyjs/leb128": { - "version": "1.13.2", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@xtuc/long": "4.2.2" - } - }, - "node_modules/@webassemblyjs/utf8": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/wasm-edit": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-buffer": "1.14.1", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/helper-wasm-section": "1.14.1", - "@webassemblyjs/wasm-gen": "1.14.1", - "@webassemblyjs/wasm-opt": "1.14.1", - "@webassemblyjs/wasm-parser": "1.14.1", - "@webassemblyjs/wast-printer": "1.14.1" - } - }, - "node_modules/@webassemblyjs/wasm-gen": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/ieee754": "1.13.2", - "@webassemblyjs/leb128": "1.13.2", - "@webassemblyjs/utf8": "1.13.2" - } - }, - "node_modules/@webassemblyjs/wasm-opt": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-buffer": "1.14.1", - "@webassemblyjs/wasm-gen": "1.14.1", - "@webassemblyjs/wasm-parser": "1.14.1" - } - }, - "node_modules/@webassemblyjs/wasm-parser": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-api-error": "1.13.2", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/ieee754": "1.13.2", - "@webassemblyjs/leb128": "1.13.2", - "@webassemblyjs/utf8": "1.13.2" - } - }, - "node_modules/@webassemblyjs/wast-printer": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@xtuc/long": "4.2.2" - } - }, - "node_modules/@webgpu/types": { - "version": "0.1.64", - "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.64.tgz", - "integrity": "sha512-84kRIAGV46LJTlJZWxShiOrNL30A+9KokD7RB3dRCIqODFjodS5tCD5yyiZ8kIReGVZSDfA3XkkwyyOIF6K62A==", - "dev": true, - "license": "BSD-3-Clause" - }, - "node_modules/@webpack-cli/configtest": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@webpack-cli/configtest/-/configtest-3.0.1.tgz", - "integrity": "sha512-u8d0pJ5YFgneF/GuvEiDA61Tf1VDomHHYMjv/wc9XzYj7nopltpG96nXN5dJRstxZhcNpV1g+nT6CydO7pHbjA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18.12.0" - }, - "peerDependencies": { - "webpack": "^5.82.0", - "webpack-cli": "6.x.x" - } - }, - "node_modules/@webpack-cli/info": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@webpack-cli/info/-/info-3.0.1.tgz", - "integrity": "sha512-coEmDzc2u/ffMvuW9aCjoRzNSPDl/XLuhPdlFRpT9tZHmJ/039az33CE7uH+8s0uL1j5ZNtfdv0HkfaKRBGJsQ==", - "dev": true, - "license": "MIT", - "engines": { - 
"node": ">=18.12.0" - }, - "peerDependencies": { - "webpack": "^5.82.0", - "webpack-cli": "6.x.x" - } - }, - "node_modules/@webpack-cli/serve": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@webpack-cli/serve/-/serve-3.0.1.tgz", - "integrity": "sha512-sbgw03xQaCLiT6gcY/6u3qBDn01CWw/nbaXl3gTdTFuJJ75Gffv3E3DBpgvY2fkkrdS1fpjaXNOmJlnbtKauKg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18.12.0" - }, - "peerDependencies": { - "webpack": "^5.82.0", - "webpack-cli": "6.x.x" - }, - "peerDependenciesMeta": { - "webpack-dev-server": { - "optional": true - } - } - }, - "node_modules/@xtuc/ieee754": { - "version": "1.2.0", - "dev": true, - "license": "BSD-3-Clause" + "optional": true, + "os": [ + "linux" + ] }, - "node_modules/@xtuc/long": { - "version": "4.2.2", + "node_modules/@unrs/resolver-binding-linux-x64-musl": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-musl/-/resolver-binding-linux-x64-musl-1.11.1.tgz", + "integrity": "sha512-rV0YSoyhK2nZ4vEswT/QwqzqQXw5I6CjoaYMOX0TqBlWhojUf8P94mvI7nuJTeaCkkds3QE4+zS8Ko+GdXuZtA==", + "cpu": [ + "x64" + ], "dev": true, - "license": "Apache-2.0" + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] }, - "node_modules/accepts": { - "version": "1.3.8", + "node_modules/@unrs/resolver-binding-wasm32-wasi": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-wasm32-wasi/-/resolver-binding-wasm32-wasi-1.11.1.tgz", + "integrity": "sha512-5u4RkfxJm+Ng7IWgkzi3qrFOvLvQYnPBmjmZQ8+szTK/b31fQCnleNl1GgEt7nIsZRIf5PLhPwT0WM+q45x/UQ==", + "cpu": [ + "wasm32" + ], "dev": true, "license": "MIT", + "optional": true, "dependencies": { - "mime-types": "~2.1.34", - "negotiator": "0.6.3" + "@napi-rs/wasm-runtime": "^0.2.11" }, "engines": { - "node": ">= 0.6" + "node": ">=14.0.0" } }, - "node_modules/acorn": { - "version": "8.15.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", - "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", + "node_modules/@unrs/resolver-binding-win32-arm64-msvc": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-arm64-msvc/-/resolver-binding-win32-arm64-msvc-1.11.1.tgz", + "integrity": "sha512-nRcz5Il4ln0kMhfL8S3hLkxI85BXs3o8EYoattsJNdsX4YUU89iOkVn7g0VHSRxFuVMdM4Q1jEpIId1Ihim/Uw==", + "cpu": [ + "arm64" + ], "dev": true, "license": "MIT", - "bin": { - "acorn": "bin/acorn" - }, - "engines": { - "node": ">=0.4.0" - } + "optional": true, + "os": [ + "win32" + ] }, - "node_modules/acorn-import-phases": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/acorn-import-phases/-/acorn-import-phases-1.0.4.tgz", - "integrity": "sha512-wKmbr/DDiIXzEOiWrTTUcDm24kQ2vGfZQvM2fwg2vXqR5uW6aapr7ObPtj1th32b9u90/Pf4AItvdTh42fBmVQ==", + "node_modules/@unrs/resolver-binding-win32-ia32-msvc": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-ia32-msvc/-/resolver-binding-win32-ia32-msvc-1.11.1.tgz", + "integrity": "sha512-DCEI6t5i1NmAZp6pFonpD5m7i6aFrpofcp4LA2i8IIq60Jyo28hamKBxNrZcyOwVOZkgsRp9O2sXWBWP8MnvIQ==", + "cpu": [ + "ia32" + ], "dev": true, "license": "MIT", - "engines": { - "node": ">=10.13.0" - }, - "peerDependencies": { - "acorn": "^8.14.0" - } + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@unrs/resolver-binding-win32-x64-msvc": { + "version": "1.11.1", + "resolved": 
"https://registry.npmjs.org/@unrs/resolver-binding-win32-x64-msvc/-/resolver-binding-win32-x64-msvc-1.11.1.tgz", + "integrity": "sha512-lrW200hZdbfRtztbygyaq/6jP6AKE8qQN2KvPcJ+x7wiD038YtnYtZ82IMNJ69GJibV7bwL3y9FgK+5w/pYt6g==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@webgpu/types": { + "version": "0.1.64", + "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.64.tgz", + "integrity": "sha512-84kRIAGV46LJTlJZWxShiOrNL30A+9KokD7RB3dRCIqODFjodS5tCD5yyiZ8kIReGVZSDfA3XkkwyyOIF6K62A==", + "dev": true, + "license": "BSD-3-Clause" }, "node_modules/adm-zip": { "version": "0.5.16", @@ -2174,52 +2140,6 @@ "node": ">=12.0" } }, - "node_modules/ajv": { - "version": "8.17.1", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz", - "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3", - "fast-uri": "^3.0.1", - "json-schema-traverse": "^1.0.0", - "require-from-string": "^2.0.2" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" - } - }, - "node_modules/ajv-formats": { - "version": "2.1.1", - "dev": true, - "license": "MIT", - "dependencies": { - "ajv": "^8.0.0" - }, - "peerDependencies": { - "ajv": "^8.0.0" - }, - "peerDependenciesMeta": { - "ajv": { - "optional": true - } - } - }, - "node_modules/ajv-keywords": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz", - "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3" - }, - "peerDependencies": { - "ajv": "^8.8.2" - } - }, "node_modules/ansi-escapes": { "version": "4.3.2", "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.2.tgz", @@ -2236,17 +2156,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/ansi-html-community": { - "version": "0.0.8", - "dev": true, - "engines": [ - "node >= 0.8.0" - ], - "license": "Apache-2.0", - "bin": { - "ansi-html": "bin/ansi-html" - } - }, "node_modules/ansi-regex": { "version": "6.2.2", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.2.2.tgz", @@ -2405,59 +2314,11 @@ "dev": true, "license": "MIT" }, - "node_modules/batch": { - "version": "0.6.1", - "dev": true, - "license": "MIT" - }, - "node_modules/binary-extensions": { - "version": "2.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/bluebird": { "version": "3.7.2", "dev": true, "license": "MIT" }, - "node_modules/body-parser": { - "version": "1.20.3", - "dev": true, - "license": "MIT", - "dependencies": { - "bytes": "3.1.2", - "content-type": "~1.0.5", - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "on-finished": "2.4.1", - "qs": "6.13.0", - "raw-body": "2.5.2", - "type-is": "~1.6.18", - "unpipe": "1.0.0" - }, - "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" - } - }, - "node_modules/bonjour-service": { - "version": "1.3.0", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3", - "multicast-dns": "^7.2.5" - } - }, "node_modules/boolean": { "version": "3.2.0", "license": "MIT" @@ -2531,30 +2392,6 
@@ "dev": true, "license": "MIT" }, - "node_modules/bundle-name": { - "version": "4.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "run-applescript": "^7.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/bytes": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", - "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/cache-point": { "version": "3.0.1", "dev": true, @@ -2574,24 +2411,6 @@ } } }, - "node_modules/call-bind": { - "version": "1.0.7", - "dev": true, - "license": "MIT", - "dependencies": { - "es-define-property": "^1.0.0", - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.4", - "set-function-length": "^1.2.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -2681,37 +2500,6 @@ "node": ">=10" } }, - "node_modules/chokidar": { - "version": "3.6.0", - "dev": true, - "license": "MIT", - "dependencies": { - "anymatch": "~3.1.2", - "braces": "~3.0.2", - "glob-parent": "~5.1.2", - "is-binary-path": "~2.1.0", - "is-glob": "~4.0.1", - "normalize-path": "~3.0.0", - "readdirp": "~3.6.0" - }, - "engines": { - "node": ">= 8.10.0" - }, - "funding": { - "url": "https://paulmillr.com/funding/" - }, - "optionalDependencies": { - "fsevents": "~2.3.2" - } - }, - "node_modules/chrome-trace-event": { - "version": "1.0.3", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.0" - } - }, "node_modules/ci-info": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-4.3.0.tgz", @@ -2813,21 +2601,6 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, - "node_modules/clone-deep": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-4.0.1.tgz", - "integrity": "sha512-neHB9xuzh/wk0dIHweyAXv2aPGZIVk3pLMe+/RNzINf17fe0OG96QroktYAUm7SM1PBnzTabaLboqqxDyMU+SQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "is-plain-object": "^2.0.4", - "kind-of": "^6.0.2", - "shallow-clone": "^3.0.0" - }, - "engines": { - "node": ">=6" - } - }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -2879,11 +2652,6 @@ "simple-swizzle": "^0.2.2" } }, - "node_modules/colorette": { - "version": "2.0.20", - "dev": true, - "license": "MIT" - }, "node_modules/command-line-args": { "version": "6.0.1", "dev": true, @@ -2920,109 +2688,35 @@ "node": ">=12.20.0" } }, - "node_modules/commander": { - "version": "2.20.3", - "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", - "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", - "dev": true, - "license": "MIT" - }, "node_modules/common-sequence": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12.17" - } - }, - "node_modules/compressible": { - "version": "2.0.18", - "dev": true, - "license": "MIT", - "dependencies": { - "mime-db": ">= 1.43.0 < 2" - }, - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/compression": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/compression/-/compression-1.8.1.tgz", 
- "integrity": "sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==", - "dev": true, - "license": "MIT", - "dependencies": { - "bytes": "3.1.2", - "compressible": "~2.0.18", - "debug": "2.6.9", - "negotiator": "~0.6.4", - "on-headers": "~1.1.0", - "safe-buffer": "5.2.1", - "vary": "~1.1.2" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/compression/node_modules/negotiator": { - "version": "0.6.4", - "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.4.tgz", - "integrity": "sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true, - "license": "MIT" - }, - "node_modules/config-master": { - "version": "3.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "walk-back": "^2.0.1" - } - }, - "node_modules/config-master/node_modules/walk-back": { - "version": "2.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/connect-history-api-fallback": { - "version": "2.0.0", + "version": "3.0.0", "dev": true, "license": "MIT", "engines": { - "node": ">=0.8" + "node": ">=12.17" } }, - "node_modules/content-disposition": { - "version": "0.5.4", + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true, + "license": "MIT" + }, + "node_modules/config-master": { + "version": "3.1.0", "dev": true, "license": "MIT", "dependencies": { - "safe-buffer": "5.2.1" - }, - "engines": { - "node": ">= 0.6" + "walk-back": "^2.0.1" } }, - "node_modules/content-type": { - "version": "1.0.5", + "node_modules/config-master/node_modules/walk-back": { + "version": "2.0.1", "dev": true, "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">=0.10.0" } }, "node_modules/convert-source-map": { @@ -3032,24 +2726,6 @@ "dev": true, "license": "MIT" }, - "node_modules/cookie": { - "version": "0.7.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/cookie-signature": { - "version": "1.0.6", - "dev": true, - "license": "MIT" - }, - "node_modules/core-util-is": { - "version": "1.0.3", - "dev": true, - "license": "MIT" - }, "node_modules/cross-spawn": { "version": "7.0.6", "dev": true, @@ -3071,14 +2747,6 @@ "node": ">=12.17" } }, - "node_modules/debug": { - "version": "2.6.9", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "2.0.0" - } - }, "node_modules/dedent": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.7.0.tgz", @@ -3104,32 +2772,6 @@ "node": ">=0.10.0" } }, - "node_modules/default-browser": { - "version": "5.2.1", - "dev": true, - "license": "MIT", - "dependencies": { - "bundle-name": "^4.1.0", - "default-browser-id": "^5.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/default-browser-id": { - "version": "5.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": 
"https://github.com/sponsors/sindresorhus" - } - }, "node_modules/define-data-property": { "version": "1.1.4", "license": "MIT", @@ -3145,17 +2787,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/define-lazy-prop": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/define-properties": { "version": "1.2.1", "license": "MIT", @@ -3171,23 +2802,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/depd": { - "version": "2.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/destroy": { - "version": "1.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" - } - }, "node_modules/detect-libc": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.4.tgz", @@ -3236,17 +2850,6 @@ } } }, - "node_modules/dns-packet": { - "version": "5.6.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@leichtgewicht/ip-codec": "^2.0.1" - }, - "engines": { - "node": ">=6" - } - }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -3254,11 +2857,6 @@ "dev": true, "license": "MIT" }, - "node_modules/ee-first": { - "version": "1.1.1", - "dev": true, - "license": "MIT" - }, "node_modules/electron-to-chromium": { "version": "1.5.50", "dev": true, @@ -3284,28 +2882,6 @@ "dev": true, "license": "MIT" }, - "node_modules/encodeurl": { - "version": "2.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/enhanced-resolve": { - "version": "5.18.2", - "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.18.2.tgz", - "integrity": "sha512-6Jw4sE1maoRJo3q8MsSIn2onJFbLTOjY9hlx4DZXmOKvLRd1Ok2kXmAGXaafL2+ijsJZ1ClYbl/pmqr9+k4iUQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "graceful-fs": "^4.2.4", - "tapable": "^2.2.0" - }, - "engines": { - "node": ">=10.13.0" - } - }, "node_modules/entities": { "version": "4.5.0", "dev": true, @@ -3317,19 +2893,6 @@ "url": "https://github.com/fb55/entities?sponsor=1" } }, - "node_modules/envinfo": { - "version": "7.14.0", - "resolved": "https://registry.npmjs.org/envinfo/-/envinfo-7.14.0.tgz", - "integrity": "sha512-CO40UI41xDQzhLB1hWyqUKgFhs250pNcGbyGKe1l/e4FSaI/+YE4IMG76GDt0In67WLPACIITC+sOi08x4wIvg==", - "dev": true, - "license": "MIT", - "bin": { - "envinfo": "dist/cli.js" - }, - "engines": { - "node": ">=4" - } - }, "node_modules/error-ex": { "version": "1.3.4", "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.4.tgz", @@ -3364,15 +2927,52 @@ "node": ">= 0.4" } }, - "node_modules/es-module-lexer": { - "version": "1.2.1", - "dev": true, - "license": "MIT" - }, "node_modules/es6-error": { "version": "4.1.1", "license": "MIT" }, + "node_modules/esbuild": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.2.tgz", + "integrity": "sha512-HyNQImnsOC7X9PMNaCIeAm4ISCQXs5a5YasTXVliKv4uuBo1dKrG0A+uQS8M5eXjVMnLg3WgXaKvprHlFJQffw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.27.2", + "@esbuild/android-arm": "0.27.2", + "@esbuild/android-arm64": "0.27.2", + "@esbuild/android-x64": "0.27.2", + 
"@esbuild/darwin-arm64": "0.27.2", + "@esbuild/darwin-x64": "0.27.2", + "@esbuild/freebsd-arm64": "0.27.2", + "@esbuild/freebsd-x64": "0.27.2", + "@esbuild/linux-arm": "0.27.2", + "@esbuild/linux-arm64": "0.27.2", + "@esbuild/linux-ia32": "0.27.2", + "@esbuild/linux-loong64": "0.27.2", + "@esbuild/linux-mips64el": "0.27.2", + "@esbuild/linux-ppc64": "0.27.2", + "@esbuild/linux-riscv64": "0.27.2", + "@esbuild/linux-s390x": "0.27.2", + "@esbuild/linux-x64": "0.27.2", + "@esbuild/netbsd-arm64": "0.27.2", + "@esbuild/netbsd-x64": "0.27.2", + "@esbuild/openbsd-arm64": "0.27.2", + "@esbuild/openbsd-x64": "0.27.2", + "@esbuild/openharmony-arm64": "0.27.2", + "@esbuild/sunos-x64": "0.27.2", + "@esbuild/win32-arm64": "0.27.2", + "@esbuild/win32-ia32": "0.27.2", + "@esbuild/win32-x64": "0.27.2" + } + }, "node_modules/escalade": { "version": "3.2.0", "dev": true, @@ -3381,11 +2981,6 @@ "node": ">=6" } }, - "node_modules/escape-html": { - "version": "1.0.3", - "dev": true, - "license": "MIT" - }, "node_modules/escape-string-regexp": { "version": "2.0.0", "dev": true, @@ -3394,18 +2989,6 @@ "node": ">=8" } }, - "node_modules/eslint-scope": { - "version": "5.1.1", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "esrecurse": "^4.3.0", - "estraverse": "^4.1.1" - }, - "engines": { - "node": ">=8.0.0" - } - }, "node_modules/esprima": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", @@ -3420,54 +3003,6 @@ "node": ">=4" } }, - "node_modules/esrecurse": { - "version": "4.3.0", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "estraverse": "^5.2.0" - }, - "engines": { - "node": ">=4.0" - } - }, - "node_modules/esrecurse/node_modules/estraverse": { - "version": "5.3.0", - "dev": true, - "license": "BSD-2-Clause", - "engines": { - "node": ">=4.0" - } - }, - "node_modules/estraverse": { - "version": "4.3.0", - "dev": true, - "license": "BSD-2-Clause", - "engines": { - "node": ">=4.0" - } - }, - "node_modules/etag": { - "version": "1.8.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/eventemitter3": { - "version": "4.0.7", - "dev": true, - "license": "MIT" - }, - "node_modules/events": { - "version": "3.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.8.x" - } - }, "node_modules/execa": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/execa/-/execa-5.1.1.tgz", @@ -3527,61 +3062,6 @@ "node": "^18.14.0 || ^20.0.0 || ^22.0.0 || >=24.0.0" } }, - "node_modules/express": { - "version": "4.21.2", - "dev": true, - "license": "MIT", - "dependencies": { - "accepts": "~1.3.8", - "array-flatten": "1.1.1", - "body-parser": "1.20.3", - "content-disposition": "0.5.4", - "content-type": "~1.0.4", - "cookie": "0.7.1", - "cookie-signature": "1.0.6", - "debug": "2.6.9", - "depd": "2.0.0", - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "finalhandler": "1.3.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", - "merge-descriptors": "1.0.3", - "methods": "~1.1.2", - "on-finished": "2.4.1", - "parseurl": "~1.3.3", - "path-to-regexp": "0.1.12", - "proxy-addr": "~2.0.7", - "qs": "6.13.0", - "range-parser": "~1.2.1", - "safe-buffer": "5.2.1", - "send": "0.19.0", - "serve-static": "1.16.2", - "setprototypeof": "1.2.0", - "statuses": "2.0.1", - "type-is": "~1.6.18", - "utils-merge": "1.0.1", - "vary": "~1.1.2" - }, - "engines": { - "node": ">= 0.10.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" - } - }, - 
"node_modules/express/node_modules/array-flatten": { - "version": "1.1.1", - "dev": true, - "license": "MIT" - }, - "node_modules/fast-deep-equal": { - "version": "3.1.3", - "dev": true, - "license": "MIT" - }, "node_modules/fast-glob": { "version": "3.3.2", "dev": true, @@ -3604,19 +3084,6 @@ "dev": true, "license": "MIT" }, - "node_modules/fast-uri": { - "version": "3.0.3", - "dev": true, - "license": "BSD-3-Clause" - }, - "node_modules/fastest-levenshtein": { - "version": "1.0.16", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4.9.1" - } - }, "node_modules/fastq": { "version": "1.17.1", "dev": true, @@ -3625,17 +3092,6 @@ "reusify": "^1.0.4" } }, - "node_modules/faye-websocket": { - "version": "0.11.4", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "websocket-driver": ">=0.5.1" - }, - "engines": { - "node": ">=0.8.0" - } - }, "node_modules/fb-watchman": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/fb-watchman/-/fb-watchman-2.0.2.tgz", @@ -3677,23 +3133,6 @@ "node": ">=8" } }, - "node_modules/finalhandler": { - "version": "1.3.1", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "2.6.9", - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "on-finished": "2.4.1", - "parseurl": "~1.3.3", - "statuses": "2.0.1", - "unpipe": "~1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/find-replace": { "version": "5.0.2", "dev": true, @@ -3722,39 +3161,10 @@ "node": ">=8" } }, - "node_modules/flat": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/flat/-/flat-5.0.2.tgz", - "integrity": "sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==", - "dev": true, - "license": "BSD-3-Clause", - "bin": { - "flat": "cli.js" - } - }, "node_modules/flatbuffers": { "version": "25.1.24", "license": "Apache-2.0" }, - "node_modules/follow-redirects": { - "version": "1.15.6", - "dev": true, - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], - "license": "MIT", - "engines": { - "node": ">=4.0" - }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } - } - }, "node_modules/foreground-child": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", @@ -3772,22 +3182,6 @@ "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/forwarded": { - "version": "0.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/fresh": { - "version": "0.5.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, "node_modules/fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -3909,11 +3303,6 @@ "node": ">= 6" } }, - "node_modules/glob-to-regexp": { - "version": "0.4.1", - "dev": true, - "license": "BSD-2-Clause" - }, "node_modules/global-agent": { "version": "3.0.0", "license": "BSD-3-Clause", @@ -3962,11 +3351,6 @@ "version": "1.0.9", "license": "ISC" }, - "node_modules/handle-thing": { - "version": "2.0.1", - "dev": true, - "license": "MIT" - }, "node_modules/handlebars": { "version": "4.7.8", "dev": true, @@ -3983,19 +3367,8 @@ "engines": { "node": ">=0.4.7" }, - "optionalDependencies": { - "uglify-js": "^3.1.4" - } - }, - "node_modules/has": { - "version": "1.0.3", - "dev": true, - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.1" - }, - "engines": { - "node": ">= 0.4.0" + "optionalDependencies": { + "uglify-js": 
"^3.1.4" } }, "node_modules/has-flag": { @@ -4046,44 +3419,6 @@ "node": ">= 0.4" } }, - "node_modules/hpack.js": { - "version": "2.1.6", - "dev": true, - "license": "MIT", - "dependencies": { - "inherits": "^2.0.1", - "obuf": "^1.0.0", - "readable-stream": "^2.0.1", - "wbuf": "^1.1.0" - } - }, - "node_modules/hpack.js/node_modules/readable-stream": { - "version": "2.3.8", - "dev": true, - "license": "MIT", - "dependencies": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "node_modules/hpack.js/node_modules/safe-buffer": { - "version": "5.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/hpack.js/node_modules/string_decoder": { - "version": "1.1.1", - "dev": true, - "license": "MIT", - "dependencies": { - "safe-buffer": "~5.1.0" - } - }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -4091,67 +3426,6 @@ "dev": true, "license": "MIT" }, - "node_modules/http-deceiver": { - "version": "1.2.7", - "dev": true, - "license": "MIT" - }, - "node_modules/http-errors": { - "version": "2.0.0", - "dev": true, - "license": "MIT", - "dependencies": { - "depd": "2.0.0", - "inherits": "2.0.4", - "setprototypeof": "1.2.0", - "statuses": "2.0.1", - "toidentifier": "1.0.1" - }, - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/http-parser-js": { - "version": "0.5.8", - "dev": true, - "license": "MIT" - }, - "node_modules/http-proxy": { - "version": "1.18.1", - "dev": true, - "license": "MIT", - "dependencies": { - "eventemitter3": "^4.0.0", - "follow-redirects": "^1.0.0", - "requires-port": "^1.0.0" - }, - "engines": { - "node": ">=8.0.0" - } - }, - "node_modules/http-proxy-middleware": { - "version": "2.0.9", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/http-proxy": "^1.17.8", - "http-proxy": "^1.18.1", - "is-glob": "^4.0.1", - "is-plain-obj": "^3.0.0", - "micromatch": "^4.0.2" - }, - "engines": { - "node": ">=12.0.0" - }, - "peerDependencies": { - "@types/express": "^4.17.13" - }, - "peerDependenciesMeta": { - "@types/express": { - "optional": true - } - } - }, "node_modules/human-signals": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz", @@ -4162,25 +3436,6 @@ "node": ">=10.17.0" } }, - "node_modules/hyperdyperid": { - "version": "1.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.18" - } - }, - "node_modules/iconv-lite": { - "version": "0.4.24", - "dev": true, - "license": "MIT", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/import-local": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/import-local/-/import-local-3.2.0.tgz", @@ -4228,62 +3483,10 @@ "dev": true, "license": "ISC" }, - "node_modules/interpret": { - "version": "3.1.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.13.0" - } - }, - "node_modules/ipaddr.js": { - "version": "2.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 10" - } - }, "node_modules/is-arrayish": { "version": "0.3.2", "license": "MIT" }, - "node_modules/is-binary-path": { - "version": "2.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "binary-extensions": "^2.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/is-core-module": { - "version": "2.12.0", - "dev": 
true, - "license": "MIT", - "dependencies": { - "has": "^1.0.3" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-docker": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "bin": { - "is-docker": "cli.js" - }, - "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/is-extglob": { "version": "2.1.1", "dev": true, @@ -4323,34 +3526,6 @@ "node": ">=0.10.0" } }, - "node_modules/is-inside-container": { - "version": "1.0.0", - "dev": true, - "license": "MIT", - "dependencies": { - "is-docker": "^3.0.0" - }, - "bin": { - "is-inside-container": "cli.js" - }, - "engines": { - "node": ">=14.16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/is-network-error": { - "version": "1.1.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/is-number": { "version": "7.0.0", "dev": true, @@ -4359,30 +3534,6 @@ "node": ">=0.12.0" } }, - "node_modules/is-plain-obj": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/is-plain-object": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", - "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", - "dev": true, - "license": "MIT", - "dependencies": { - "isobject": "^3.0.1" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-stream": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", @@ -4396,40 +3547,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/is-wsl": { - "version": "3.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "is-inside-container": "^1.0.0" - }, - "engines": { - "node": ">=16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/isarray": { - "version": "1.0.0", - "dev": true, - "license": "MIT" - }, "node_modules/isexe": { "version": "2.0.0", "dev": true, "license": "ISC" }, - "node_modules/isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/istanbul-lib-coverage": { "version": "3.2.2", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", @@ -5336,13 +4458,6 @@ "dev": true, "license": "MIT" }, - "node_modules/json-schema-traverse": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", - "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", - "dev": true, - "license": "MIT" - }, "node_modules/json-stringify-safe": { "version": "5.0.1", "license": "ISC" @@ -5360,16 +4475,6 @@ "node": ">=6" } }, - "node_modules/kind-of": { - "version": "6.0.3", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz", - "integrity": 
"sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/klaw": { "version": "3.0.0", "dev": true, @@ -5378,15 +4483,6 @@ "graceful-fs": "^4.1.9" } }, - "node_modules/launch-editor": { - "version": "2.9.1", - "dev": true, - "license": "MIT", - "dependencies": { - "picocolors": "^1.0.0", - "shell-quote": "^1.8.1" - } - }, "node_modules/leven": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", @@ -5412,14 +4508,6 @@ "uc.micro": "^2.0.0" } }, - "node_modules/loader-runner": { - "version": "4.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.11.5" - } - }, "node_modules/locate-path": { "version": "5.0.0", "dev": true, @@ -5547,103 +4635,31 @@ "dev": true, "license": "MIT" }, - "node_modules/media-typer": { - "version": "0.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/memfs": { - "version": "4.14.1", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@jsonjoy.com/json-pack": "^1.0.3", - "@jsonjoy.com/util": "^1.3.0", - "tree-dump": "^1.0.1", - "tslib": "^2.0.0" - }, - "engines": { - "node": ">= 4.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - } - }, - "node_modules/merge-descriptors": { - "version": "1.0.3", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/merge-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", - "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", - "dev": true, - "license": "MIT" - }, - "node_modules/merge2": { - "version": "1.4.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 8" - } - }, - "node_modules/methods": { - "version": "1.1.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/micromatch": { - "version": "4.0.8", - "dev": true, - "license": "MIT", - "dependencies": { - "braces": "^3.0.3", - "picomatch": "^2.3.1" - }, - "engines": { - "node": ">=8.6" - } - }, - "node_modules/mime": { - "version": "1.6.0", - "dev": true, - "license": "MIT", - "bin": { - "mime": "cli.js" - }, - "engines": { - "node": ">=4" - } + "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", + "dev": true, + "license": "MIT" }, - "node_modules/mime-db": { - "version": "1.52.0", + "node_modules/merge2": { + "version": "1.4.1", "dev": true, "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">= 8" } }, - "node_modules/mime-types": { - "version": "2.1.35", + "node_modules/micromatch": { + "version": "4.0.8", "dev": true, "license": "MIT", "dependencies": { - "mime-db": "1.52.0" + "braces": "^3.0.3", + "picomatch": "^2.3.1" }, "engines": { - "node": ">= 0.6" + "node": ">=8.6" } }, "node_modules/mimic-fn": { @@ -5656,11 +4672,6 @@ "node": ">=6" } }, - "node_modules/minimalistic-assert": { - "version": "1.0.1", - "dev": true, - "license": "ISC" - }, "node_modules/minimatch": { "version": "9.0.5", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", @@ -5706,23 +4717,6 @@ "node": ">=10" } }, - "node_modules/ms": { - "version": "2.0.0", - "dev": true, - "license": "MIT" - }, - "node_modules/multicast-dns": { - "version": "7.2.5", - "dev": true, - "license": 
"MIT", - "dependencies": { - "dns-packet": "^5.2.2", - "thunky": "^1.0.2" - }, - "bin": { - "multicast-dns": "cli.js" - } - }, "node_modules/napi-postinstall": { "version": "0.3.4", "resolved": "https://registry.npmjs.org/napi-postinstall/-/napi-postinstall-0.3.4.tgz", @@ -5746,27 +4740,11 @@ "dev": true, "license": "MIT" }, - "node_modules/negotiator": { - "version": "0.6.3", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, "node_modules/neo-async": { "version": "2.6.2", "dev": true, "license": "MIT" }, - "node_modules/node-forge": { - "version": "1.3.1", - "dev": true, - "license": "(BSD-3-Clause OR GPL-2.0)", - "engines": { - "node": ">= 6.13.0" - } - }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -5800,17 +4778,6 @@ "node": ">=8" } }, - "node_modules/object-inspect": { - "version": "1.13.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/object-keys": { "version": "1.1.1", "license": "MIT", @@ -5826,32 +4793,6 @@ "node": ">=8.0.0" } }, - "node_modules/obuf": { - "version": "1.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/on-finished": { - "version": "2.4.1", - "dev": true, - "license": "MIT", - "dependencies": { - "ee-first": "1.1.1" - }, - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/on-headers": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.1.0.tgz", - "integrity": "sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -5915,23 +4856,6 @@ "protobufjs": "^7.2.4" } }, - "node_modules/open": { - "version": "10.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "default-browser": "^5.2.1", - "define-lazy-prop": "^3.0.0", - "is-inside-container": "^1.0.0", - "is-wsl": "^3.1.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/p-limit": { "version": "2.3.0", "dev": true, @@ -5957,22 +4881,6 @@ "node": ">=8" } }, - "node_modules/p-retry": { - "version": "6.2.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/retry": "0.12.2", - "is-network-error": "^1.0.0", - "retry": "^0.13.1" - }, - "engines": { - "node": ">=16.17" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/p-try": { "version": "2.2.0", "dev": true, @@ -6007,14 +4915,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/parseurl": { - "version": "1.3.3", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/path-exists": { "version": "4.0.0", "dev": true, @@ -6041,11 +4941,6 @@ "node": ">=8" } }, - "node_modules/path-parse": { - "version": "1.0.7", - "dev": true, - "license": "MIT" - }, "node_modules/path-scurry": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", @@ -6070,11 +4965,6 @@ "dev": true, "license": "ISC" }, - "node_modules/path-to-regexp": { - "version": "0.1.12", - "dev": true, - "license": "MIT" - }, "node_modules/picocolors": { "version": "1.1.1", "dev": true, @@ -6158,11 +5048,6 @@ "url": 
"https://github.com/chalk/ansi-styles?sponsor=1" } }, - "node_modules/process-nextick-args": { - "version": "2.0.1", - "dev": true, - "license": "MIT" - }, "node_modules/protobufjs": { "version": "7.2.6", "hasInstallScript": true, @@ -6185,26 +5070,6 @@ "node": ">=12.0.0" } }, - "node_modules/proxy-addr": { - "version": "2.0.7", - "dev": true, - "license": "MIT", - "dependencies": { - "forwarded": "0.2.0", - "ipaddr.js": "1.9.1" - }, - "engines": { - "node": ">= 0.10" - } - }, - "node_modules/proxy-addr/node_modules/ipaddr.js": { - "version": "1.9.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.10" - } - }, "node_modules/punycode.js": { "version": "2.3.1", "dev": true, @@ -6230,20 +5095,6 @@ ], "license": "MIT" }, - "node_modules/qs": { - "version": "6.13.0", - "dev": true, - "license": "BSD-3-Clause", - "dependencies": { - "side-channel": "^1.0.6" - }, - "engines": { - "node": ">=0.6" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/queue-microtask": { "version": "1.2.3", "dev": true, @@ -6263,38 +5114,6 @@ ], "license": "MIT" }, - "node_modules/randombytes": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", - "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "safe-buffer": "^5.1.0" - } - }, - "node_modules/range-parser": { - "version": "1.2.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/raw-body": { - "version": "2.5.2", - "dev": true, - "license": "MIT", - "dependencies": { - "bytes": "3.1.2", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "unpipe": "1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/react-is": { "version": "18.3.1", "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.3.1.tgz", @@ -6302,41 +5121,6 @@ "dev": true, "license": "MIT" }, - "node_modules/readable-stream": { - "version": "3.6.1", - "dev": true, - "license": "MIT", - "dependencies": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/readdirp": { - "version": "3.6.0", - "dev": true, - "license": "MIT", - "dependencies": { - "picomatch": "^2.2.1" - }, - "engines": { - "node": ">=8.10.0" - } - }, - "node_modules/rechoir": { - "version": "0.8.0", - "dev": true, - "license": "MIT", - "dependencies": { - "resolve": "^1.20.0" - }, - "engines": { - "node": ">= 10.13.0" - } - }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -6347,19 +5131,6 @@ "node": ">=0.10.0" } }, - "node_modules/require-from-string": { - "version": "2.0.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/requires-port": { - "version": "1.0.0", - "dev": true, - "license": "MIT" - }, "node_modules/requizzle": { "version": "0.2.4", "dev": true, @@ -6368,22 +5139,6 @@ "lodash": "^4.17.21" } }, - "node_modules/resolve": { - "version": "1.22.2", - "dev": true, - "license": "MIT", - "dependencies": { - "is-core-module": "^2.11.0", - "path-parse": "^1.0.7", - "supports-preserve-symlinks-flag": "^1.0.0" - }, - "bin": { - "resolve": "bin/resolve" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/resolve-cwd": { "version": "3.0.0", "dev": true, @@ -6403,14 +5158,6 
@@ "node": ">=8" } }, - "node_modules/retry": { - "version": "0.13.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, "node_modules/reusify": { "version": "1.0.4", "dev": true, @@ -6439,17 +5186,6 @@ "version": "1.1.3", "license": "BSD-3-Clause" }, - "node_modules/run-applescript": { - "version": "7.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/run-parallel": { "version": "1.2.0", "dev": true, @@ -6472,117 +5208,20 @@ "queue-microtask": "^1.2.2" } }, - "node_modules/safe-buffer": { - "version": "5.2.1", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" - }, - "node_modules/safer-buffer": { - "version": "2.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/schema-utils": { - "version": "4.3.2", - "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.2.tgz", - "integrity": "sha512-Gn/JaSk/Mt9gYubxTtSn/QCV4em9mpAPiR1rqy/Ocu19u/G9J5WWdNoUT4SiV6mFC3y6cxyFcFwdzPM3FgxGAQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/json-schema": "^7.0.9", - "ajv": "^8.9.0", - "ajv-formats": "^2.1.1", - "ajv-keywords": "^5.1.0" - }, - "engines": { - "node": ">= 10.13.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - } - }, - "node_modules/select-hose": { - "version": "2.0.0", - "dev": true, - "license": "MIT" - }, - "node_modules/selfsigned": { - "version": "2.4.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node-forge": "^1.3.0", - "node-forge": "^1" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/semver": { "version": "7.7.2", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", - "license": "ISC", - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/semver-compare": { - "version": "1.0.0", - "license": "MIT" - }, - "node_modules/send": { - "version": "0.19.0", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", - "mime": "1.6.0", - "ms": "2.1.3", - "on-finished": "2.4.1", - "range-parser": "~1.2.1", - "statuses": "2.0.1" + "license": "ISC", + "bin": { + "semver": "bin/semver.js" }, "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/send/node_modules/encodeurl": { - "version": "1.0.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" + "node": ">=10" } }, - "node_modules/send/node_modules/ms": { - "version": "2.1.3", - "dev": true, + "node_modules/semver-compare": { + "version": "1.0.0", "license": "MIT" }, "node_modules/serialize-error": { @@ -6608,121 +5247,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/serialize-javascript": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-6.0.2.tgz", - "integrity": "sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==", - "dev": true, - "license": "BSD-3-Clause", - 
"dependencies": { - "randombytes": "^2.1.0" - } - }, - "node_modules/serve-index": { - "version": "1.9.1", - "dev": true, - "license": "MIT", - "dependencies": { - "accepts": "~1.3.4", - "batch": "0.6.1", - "debug": "2.6.9", - "escape-html": "~1.0.3", - "http-errors": "~1.6.2", - "mime-types": "~2.1.17", - "parseurl": "~1.3.2" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/serve-index/node_modules/depd": { - "version": "1.1.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/serve-index/node_modules/http-errors": { - "version": "1.6.3", - "dev": true, - "license": "MIT", - "dependencies": { - "depd": "~1.1.2", - "inherits": "2.0.3", - "setprototypeof": "1.1.0", - "statuses": ">= 1.4.0 < 2" - }, - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/serve-index/node_modules/inherits": { - "version": "2.0.3", - "dev": true, - "license": "ISC" - }, - "node_modules/serve-index/node_modules/setprototypeof": { - "version": "1.1.0", - "dev": true, - "license": "ISC" - }, - "node_modules/serve-index/node_modules/statuses": { - "version": "1.5.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/serve-static": { - "version": "1.16.2", - "dev": true, - "license": "MIT", - "dependencies": { - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "parseurl": "~1.3.3", - "send": "0.19.0" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/set-function-length": { - "version": "1.2.2", - "dev": true, - "license": "MIT", - "dependencies": { - "define-data-property": "^1.1.4", - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.4", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/setprototypeof": { - "version": "1.2.0", - "dev": true, - "license": "ISC" - }, - "node_modules/shallow-clone": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-3.0.1.tgz", - "integrity": "sha512-/6KqX+GVUdqPuPPd2LxDDxzX6CAbjJehAAOKlNpqqUpAqPM6HeL8f+o3a+JsyGjn2lv0WY8UsTgUJjU9Ok55NA==", - "dev": true, - "license": "MIT", - "dependencies": { - "kind-of": "^6.0.2" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/sharp": { "version": "0.34.3", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.3.tgz", @@ -6784,34 +5308,6 @@ "node": ">=8" } }, - "node_modules/shell-quote": { - "version": "1.8.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/side-channel": { - "version": "1.0.6", - "dev": true, - "license": "MIT", - "dependencies": { - "call-bind": "^1.0.7", - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.4", - "object-inspect": "^1.13.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/signal-exit": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", @@ -6842,16 +5338,6 @@ "node": ">=8" } }, - "node_modules/sockjs": { - "version": "0.3.24", - "dev": true, - "license": "MIT", - "dependencies": { - "faye-websocket": "^0.11.3", - "uuid": "^8.3.2", - "websocket-driver": "^0.7.4" - } - }, "node_modules/sort-array": { "version": "5.0.0", "dev": true, @@ -6891,76 +5377,6 @@ "source-map": "^0.6.0" } }, - "node_modules/spdy": { - "version": "4.0.2", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": 
"^4.1.0", - "handle-thing": "^2.0.0", - "http-deceiver": "^1.2.7", - "select-hose": "^2.0.0", - "spdy-transport": "^3.0.0" - }, - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/spdy-transport": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "^4.1.0", - "detect-node": "^2.0.4", - "hpack.js": "^2.1.6", - "obuf": "^1.1.2", - "readable-stream": "^3.0.6", - "wbuf": "^1.7.3" - } - }, - "node_modules/spdy-transport/node_modules/debug": { - "version": "4.3.4", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "2.1.2" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, - "node_modules/spdy-transport/node_modules/ms": { - "version": "2.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/spdy/node_modules/debug": { - "version": "4.3.4", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "2.1.2" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, - "node_modules/spdy/node_modules/ms": { - "version": "2.1.2", - "dev": true, - "license": "MIT" - }, "node_modules/sprintf-js": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", @@ -6981,22 +5397,6 @@ "node": ">=10" } }, - "node_modules/statuses": { - "version": "2.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/string_decoder": { - "version": "1.3.0", - "dev": true, - "license": "MIT", - "dependencies": { - "safe-buffer": "~5.2.0" - } - }, "node_modules/string-length": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", @@ -7151,180 +5551,63 @@ "node_modules/strip-final-newline": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-2.0.0.tgz", - "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/strip-json-comments": { - "version": "3.1.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/supports-color": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", - "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", - "dev": true, - "license": "MIT", - "dependencies": { - "has-flag": "^4.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/supports-preserve-symlinks-flag": { - "version": "1.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/synckit": { - "version": "0.11.11", - "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.11.tgz", - "integrity": "sha512-MeQTA1r0litLUf0Rp/iisCaL8761lKAZHaimlbGK4j0HysC4PLfqygQj9srcs0m2RdtDYnF8UuYyKpbjHYp7Jw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@pkgr/core": "^0.2.9" - }, - "engines": { - "node": "^14.18.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/synckit" - } - }, - "node_modules/table-layout": { - "version": "4.1.1", - "dev": true, - "license": "MIT", - "dependencies": { - "array-back": "^6.2.2", - "wordwrapjs": "^5.1.0" - }, - 
"engines": { - "node": ">=12.17" - } - }, - "node_modules/tapable": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.2.tgz", - "integrity": "sha512-Re10+NauLTMCudc7T5WLFLAwDhQ0JWdrMK+9B2M8zR5hRExKmsRDCBA7/aV/pNJFltmBFO5BAMlQFi/vq3nKOg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/terser": { - "version": "5.43.1", - "resolved": "https://registry.npmjs.org/terser/-/terser-5.43.1.tgz", - "integrity": "sha512-+6erLbBm0+LROX2sPXlUYx/ux5PyE9K/a92Wrt6oA+WDAoFTdpHE5tCYCI5PNzq2y8df4rA+QgHLJuR4jNymsg==", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "@jridgewell/source-map": "^0.3.3", - "acorn": "^8.14.0", - "commander": "^2.20.0", - "source-map-support": "~0.5.20" - }, - "bin": { - "terser": "bin/terser" - }, + "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==", + "dev": true, + "license": "MIT", "engines": { - "node": ">=10" + "node": ">=6" } }, - "node_modules/terser-webpack-plugin": { - "version": "5.3.14", - "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-5.3.14.tgz", - "integrity": "sha512-vkZjpUjb6OMS7dhV+tILUW6BhpDR7P2L/aQSAv+Uwk+m8KATX9EccViHTJR2qDtACKPIYndLGCyl3FMo+r2LMw==", + "node_modules/strip-json-comments": { + "version": "3.1.1", "dev": true, "license": "MIT", - "dependencies": { - "@jridgewell/trace-mapping": "^0.3.25", - "jest-worker": "^27.4.5", - "schema-utils": "^4.3.0", - "serialize-javascript": "^6.0.2", - "terser": "^5.31.1" - }, "engines": { - "node": ">= 10.13.0" + "node": ">=8" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.1.0" - }, - "peerDependenciesMeta": { - "@swc/core": { - "optional": true - }, - "esbuild": { - "optional": true - }, - "uglify-js": { - "optional": true - } + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/terser-webpack-plugin/node_modules/jest-worker": { - "version": "27.5.1", - "resolved": "https://registry.npmjs.org/jest-worker/-/jest-worker-27.5.1.tgz", - "integrity": "sha512-7vuh85V5cdDofPyxn58nrPjBktZo0u9x1g8WtjQol+jZDaE+fhN+cIvTj11GndBnMnyfrUOG1sZQxCdjKh+DKg==", + "node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", "dev": true, "license": "MIT", "dependencies": { - "@types/node": "*", - "merge-stream": "^2.0.0", - "supports-color": "^8.0.0" + "has-flag": "^4.0.0" }, "engines": { - "node": ">= 10.13.0" + "node": ">=8" } }, - "node_modules/terser-webpack-plugin/node_modules/supports-color": { - "version": "8.1.1", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", - "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", + "node_modules/synckit": { + "version": "0.11.11", + "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.11.tgz", + "integrity": "sha512-MeQTA1r0litLUf0Rp/iisCaL8761lKAZHaimlbGK4j0HysC4PLfqygQj9srcs0m2RdtDYnF8UuYyKpbjHYp7Jw==", "dev": true, "license": "MIT", "dependencies": { - "has-flag": "^4.0.0" + "@pkgr/core": "^0.2.9" }, "engines": { - "node": ">=10" + "node": "^14.18.0 || >=16.0.0" }, "funding": { - "url": "https://github.com/chalk/supports-color?sponsor=1" + "url": "https://opencollective.com/synckit" } }, - 
"node_modules/terser/node_modules/source-map-support": { - "version": "0.5.21", - "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", - "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "node_modules/table-layout": { + "version": "4.1.1", "dev": true, "license": "MIT", "dependencies": { - "buffer-from": "^1.0.0", - "source-map": "^0.6.0" + "array-back": "^6.2.2", + "wordwrapjs": "^5.1.0" + }, + "engines": { + "node": ">=12.17" } }, "node_modules/test-exclude": { @@ -7388,22 +5671,6 @@ "node": "*" } }, - "node_modules/thingies": { - "version": "1.21.0", - "dev": true, - "license": "Unlicense", - "engines": { - "node": ">=10.18" - }, - "peerDependencies": { - "tslib": "^2" - } - }, - "node_modules/thunky": { - "version": "1.1.0", - "dev": true, - "license": "MIT" - }, "node_modules/tmpl": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz", @@ -7422,33 +5689,11 @@ "node": ">=8.0" } }, - "node_modules/toidentifier": { - "version": "1.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.6" - } - }, - "node_modules/tree-dump": { - "version": "1.0.2", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, "node_modules/tslib": { "version": "2.6.3", "dev": true, - "license": "0BSD" + "license": "0BSD", + "optional": true }, "node_modules/type-detect": { "version": "4.0.8", @@ -7473,18 +5718,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/type-is": { - "version": "1.6.18", - "dev": true, - "license": "MIT", - "dependencies": { - "media-typer": "0.3.0", - "mime-types": "~2.1.24" - }, - "engines": { - "node": ">= 0.6" - } - }, "node_modules/typescript": { "version": "5.8.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", @@ -7535,14 +5768,6 @@ "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==", "license": "MIT" }, - "node_modules/unpipe": { - "version": "1.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/unrs-resolver": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/unrs-resolver/-/unrs-resolver-1.11.1.tgz", @@ -7607,27 +5832,6 @@ "browserslist": ">= 4.21.0" } }, - "node_modules/util-deprecate": { - "version": "1.0.2", - "dev": true, - "license": "MIT" - }, - "node_modules/utils-merge": { - "version": "1.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4.0" - } - }, - "node_modules/uuid": { - "version": "8.3.2", - "dev": true, - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, "node_modules/v8-to-istanbul": { "version": "9.3.0", "resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.3.0.tgz", @@ -7643,14 +5847,6 @@ "node": ">=10.12.0" } }, - "node_modules/vary": { - "version": "1.1.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/walk-back": { "version": "5.1.1", "dev": true, @@ -7669,18 +5865,6 @@ "makeerror": "1.0.12" } }, - "node_modules/watchpack": { - "version": "2.4.2", - "dev": true, - "license": "MIT", - "dependencies": { - "glob-to-regexp": "^0.4.1", - "graceful-fs": "^4.1.2" - }, - "engines": { - "node": ">=10.13.0" - } - }, "node_modules/wavefile": { "version": "11.0.0", "dev": true, @@ 
-7692,246 +5876,6 @@ "node": ">=8" } }, - "node_modules/wbuf": { - "version": "1.7.3", - "dev": true, - "license": "MIT", - "dependencies": { - "minimalistic-assert": "^1.0.0" - } - }, - "node_modules/webpack": { - "version": "5.100.2", - "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.100.2.tgz", - "integrity": "sha512-QaNKAvGCDRh3wW1dsDjeMdDXwZm2vqq3zn6Pvq4rHOEOGSaUMgOOjG2Y9ZbIGzpfkJk9ZYTHpDqgDfeBDcnLaw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/eslint-scope": "^3.7.7", - "@types/estree": "^1.0.8", - "@types/json-schema": "^7.0.15", - "@webassemblyjs/ast": "^1.14.1", - "@webassemblyjs/wasm-edit": "^1.14.1", - "@webassemblyjs/wasm-parser": "^1.14.1", - "acorn": "^8.15.0", - "acorn-import-phases": "^1.0.3", - "browserslist": "^4.24.0", - "chrome-trace-event": "^1.0.2", - "enhanced-resolve": "^5.17.2", - "es-module-lexer": "^1.2.1", - "eslint-scope": "5.1.1", - "events": "^3.2.0", - "glob-to-regexp": "^0.4.1", - "graceful-fs": "^4.2.11", - "json-parse-even-better-errors": "^2.3.1", - "loader-runner": "^4.2.0", - "mime-types": "^2.1.27", - "neo-async": "^2.6.2", - "schema-utils": "^4.3.2", - "tapable": "^2.1.1", - "terser-webpack-plugin": "^5.3.11", - "watchpack": "^2.4.1", - "webpack-sources": "^3.3.3" - }, - "bin": { - "webpack": "bin/webpack.js" - }, - "engines": { - "node": ">=10.13.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependenciesMeta": { - "webpack-cli": { - "optional": true - } - } - }, - "node_modules/webpack-cli": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/webpack-cli/-/webpack-cli-6.0.1.tgz", - "integrity": "sha512-MfwFQ6SfwinsUVi0rNJm7rHZ31GyTcpVE5pgVA3hwFRb7COD4TzjUUwhGWKfO50+xdc2MQPuEBBJoqIMGt3JDw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@discoveryjs/json-ext": "^0.6.1", - "@webpack-cli/configtest": "^3.0.1", - "@webpack-cli/info": "^3.0.1", - "@webpack-cli/serve": "^3.0.1", - "colorette": "^2.0.14", - "commander": "^12.1.0", - "cross-spawn": "^7.0.3", - "envinfo": "^7.14.0", - "fastest-levenshtein": "^1.0.12", - "import-local": "^3.0.2", - "interpret": "^3.1.1", - "rechoir": "^0.8.0", - "webpack-merge": "^6.0.1" - }, - "bin": { - "webpack-cli": "bin/cli.js" - }, - "engines": { - "node": ">=18.12.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.82.0" - }, - "peerDependenciesMeta": { - "webpack-bundle-analyzer": { - "optional": true - }, - "webpack-dev-server": { - "optional": true - } - } - }, - "node_modules/webpack-cli/node_modules/commander": { - "version": "12.1.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz", - "integrity": "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - } - }, - "node_modules/webpack-dev-middleware": { - "version": "7.4.2", - "dev": true, - "license": "MIT", - "dependencies": { - "colorette": "^2.0.10", - "memfs": "^4.6.0", - "mime-types": "^2.1.31", - "on-finished": "^2.4.1", - "range-parser": "^1.2.1", - "schema-utils": "^4.0.0" - }, - "engines": { - "node": ">= 18.12.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.0.0" - }, - "peerDependenciesMeta": { - "webpack": { - "optional": true - } - } - }, - "node_modules/webpack-dev-server": { - "version": "5.2.2", - "dev": 
true, - "license": "MIT", - "dependencies": { - "@types/bonjour": "^3.5.13", - "@types/connect-history-api-fallback": "^1.5.4", - "@types/express": "^4.17.21", - "@types/express-serve-static-core": "^4.17.21", - "@types/serve-index": "^1.9.4", - "@types/serve-static": "^1.15.5", - "@types/sockjs": "^0.3.36", - "@types/ws": "^8.5.10", - "ansi-html-community": "^0.0.8", - "bonjour-service": "^1.2.1", - "chokidar": "^3.6.0", - "colorette": "^2.0.10", - "compression": "^1.7.4", - "connect-history-api-fallback": "^2.0.0", - "express": "^4.21.2", - "graceful-fs": "^4.2.6", - "http-proxy-middleware": "^2.0.9", - "ipaddr.js": "^2.1.0", - "launch-editor": "^2.6.1", - "open": "^10.0.3", - "p-retry": "^6.2.0", - "schema-utils": "^4.2.0", - "selfsigned": "^2.4.1", - "serve-index": "^1.9.1", - "sockjs": "^0.3.24", - "spdy": "^4.0.2", - "webpack-dev-middleware": "^7.4.2", - "ws": "^8.18.0" - }, - "bin": { - "webpack-dev-server": "bin/webpack-dev-server.js" - }, - "engines": { - "node": ">= 18.12.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.0.0" - }, - "peerDependenciesMeta": { - "webpack": { - "optional": true - }, - "webpack-cli": { - "optional": true - } - } - }, - "node_modules/webpack-merge": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/webpack-merge/-/webpack-merge-6.0.1.tgz", - "integrity": "sha512-hXXvrjtx2PLYx4qruKl+kyRSLc52V+cCvMxRjmKwoA+CBbbF5GfIBtR6kCvl0fYGqTUPKB+1ktVmTHqMOzgCBg==", - "dev": true, - "license": "MIT", - "dependencies": { - "clone-deep": "^4.0.1", - "flat": "^5.0.2", - "wildcard": "^2.0.1" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/webpack-sources": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.3.3.tgz", - "integrity": "sha512-yd1RBzSGanHkitROoPFd6qsrxt+oFhg/129YzheDGqeustzX0vTZJZsSsQjVQC4yzBQ56K55XU8gaNCtIzOnTg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.13.0" - } - }, - "node_modules/websocket-driver": { - "version": "0.7.4", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "http-parser-js": ">=0.5.1", - "safe-buffer": ">=5.1.0", - "websocket-extensions": ">=0.1.1" - }, - "engines": { - "node": ">=0.8.0" - } - }, - "node_modules/websocket-extensions": { - "version": "0.1.4", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=0.8.0" - } - }, "node_modules/which": { "version": "2.0.2", "dev": true, @@ -7946,13 +5890,6 @@ "node": ">= 8" } }, - "node_modules/wildcard": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/wildcard/-/wildcard-2.0.1.tgz", - "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==", - "dev": true, - "license": "MIT" - }, "node_modules/wordwrap": { "version": "1.0.0", "dev": true, @@ -8082,26 +6019,6 @@ "node": "^14.17.0 || ^16.13.0 || >=18.0.0" } }, - "node_modules/ws": { - "version": "8.18.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.0.0" - }, - "peerDependencies": { - "bufferutil": "^4.0.1", - "utf-8-validate": ">=5.0.2" - }, - "peerDependenciesMeta": { - "bufferutil": { - "optional": true - }, - "utf-8-validate": { - "optional": true - } - } - }, "node_modules/xmlcreate": { "version": "2.0.4", "dev": true, diff --git a/package.json b/package.json index ed843857d..2f856d89b 100644 --- a/package.json +++ b/package.json @@ -25,8 +25,8 @@ "format": "prettier --write .", "format:check": "prettier --check .", 
"typegen": "tsc --build", - "dev": "webpack serve --no-client-overlay", - "build": "webpack && npm run typegen", + "dev": "node scripts/esbuild/dev.mjs", + "build": "node scripts/esbuild/build.mjs && npm run typegen", "test": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage", "readme": "python ./docs/scripts/build_readme.py", "docs-api": "node ./docs/scripts/generate.js", @@ -64,15 +64,13 @@ "@types/jest": "^30.0.0", "@types/node": "^24.0.11", "@webgpu/types": "^0.1.64", + "esbuild": "^0.27.2", "jest": "^30.0.4", "jest-environment-node": "^30.0.4", "jsdoc-to-markdown": "^9.1.1", "prettier": "3.4.2", "typescript": "^5.8.3", - "wavefile": "11.0.0", - "webpack": "^5.99.9", - "webpack-cli": "^6.0.1", - "webpack-dev-server": "^5.2.2" + "wavefile": "11.0.0" }, "files": [ "src", diff --git a/scripts/esbuild/build.mjs b/scripts/esbuild/build.mjs new file mode 100644 index 000000000..8fa39c0c8 --- /dev/null +++ b/scripts/esbuild/build.mjs @@ -0,0 +1,130 @@ +import { build as esbuild } from "esbuild"; +import path from "node:path"; +import { stripNodePrefixPlugin } from "./build/plugins/stripNodePrefixPlugin.mjs"; +import { ignoreModulesPlugin } from "./build/plugins/ignoreModulesPlugin.mjs"; +import { postBuildPlugin } from "./build/plugins/postBuildPlugin.mjs"; +import { externalNodeBuiltinsPlugin } from "./build/plugins/externalNodeBuiltinsPlugin.mjs"; +import { + NODE_IGNORE_MODULES, + NODE_EXTERNAL_MODULES, + WEB_IGNORE_MODULES, + WEB_EXTERNAL_MODULES, + OUT_DIR, + ROOT_DIR, + getEsbuildProdConfig, +} from "./build/constants.mjs"; +import { reportSize } from "./build/reportSize.mjs"; +import prepareOutDir from "./build/prepareOutDir.mjs"; + +/** + * + * Helper function to create build configurations. + * Equivalent to webpack's buildConfig function. + */ +async function buildTarget({ + name = "", + suffix = ".js", + format = "esm", // 'esm' | 'cjs' + ignoreModules = [], + externalModules = [], + usePostBuild = false, +}) { + const platform = format === "cjs" ? 
"node" : "neutral"; + + const regularFile = `transformers${name}${suffix}`; + const minFile = `transformers${name}.min${suffix}`; + + const plugins = []; + // Add ignoreModulesPlugin FIRST so it can catch modules before stripNodePrefixPlugin marks them as external + if (ignoreModules.length > 0) { + plugins.push(ignoreModulesPlugin(ignoreModules)); + } + plugins.push(stripNodePrefixPlugin()); + plugins.push(externalNodeBuiltinsPlugin()); + if (usePostBuild) { + plugins.push(postBuildPlugin(OUT_DIR, ROOT_DIR)); + } + + console.log(`\nBuilding ${regularFile}...`); + await esbuild({ + ...getEsbuildProdConfig(ROOT_DIR), + platform, + format, + outfile: path.join(OUT_DIR, regularFile), + external: externalModules, + plugins, + }); + reportSize(path.join(OUT_DIR, regularFile)); + + console.log(`\nBuilding ${minFile}...`); + await esbuild({ + ...getEsbuildProdConfig(ROOT_DIR), + platform, + format, + outfile: path.join(OUT_DIR, minFile), + minify: true, + external: externalModules, + plugins, + legalComments: "none", + }); + reportSize(path.join(OUT_DIR, minFile)); +} + +console.log("\nBuilding transformers.js with esbuild...\n"); + +const startTime = performance.now(); + +try { + prepareOutDir(OUT_DIR); + + // Bundle build - bundles everything except ignored modules + console.log("\n=== Bundle Build (ESM) ==="); + await buildTarget({ + name: "", + suffix: ".js", + format: "esm", + ignoreModules: WEB_IGNORE_MODULES, + externalModules: [], + usePostBuild: true, + }); + + // Web build - external onnxruntime libs + console.log("\n=== Web Build (ESM) ==="); + await buildTarget({ + name: ".web", + suffix: ".js", + format: "esm", + ignoreModules: WEB_IGNORE_MODULES, + externalModules: WEB_EXTERNAL_MODULES, + usePostBuild: false, + }); + + // Node ESM build + console.log("\n=== Node Build (ESM) ==="); + await buildTarget({ + name: ".node", + suffix: ".mjs", + format: "esm", + ignoreModules: NODE_IGNORE_MODULES, + externalModules: NODE_EXTERNAL_MODULES, + usePostBuild: false, + }); + + // Node CJS build + console.log("\n=== Node Build (CJS) ==="); + await buildTarget({ + name: ".node", + suffix: ".cjs", + format: "cjs", + ignoreModules: NODE_IGNORE_MODULES, + externalModules: NODE_EXTERNAL_MODULES, + usePostBuild: false, + }); + + const endTime = performance.now(); + const duration = (endTime - startTime).toFixed(2); + console.log(`\nAll builds completed successfully in ${duration}ms!\n`); +} catch (error) { + console.error("\nBuild failed:", error); + process.exit(1); +} diff --git a/scripts/esbuild/build/constants.mjs b/scripts/esbuild/build/constants.mjs new file mode 100644 index 000000000..14c80ae2f --- /dev/null +++ b/scripts/esbuild/build/constants.mjs @@ -0,0 +1,38 @@ +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +export const DIST_FOLDER = "dist"; +export const NODE_IGNORE_MODULES = ["onnxruntime-web"]; +export const NODE_EXTERNAL_MODULES = [ + "onnxruntime-common", + "onnxruntime-node", + "sharp", + // node:* modules are handled by externalNodeBuiltinsPlugin +]; + +export const WEB_IGNORE_MODULES = ["onnxruntime-node", "sharp", "fs", "path", "url", "stream", "stream/promises"]; +export const WEB_EXTERNAL_MODULES = ["onnxruntime-common", "onnxruntime-web"]; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +export const ROOT_DIR = path.join(__dirname, "../../.."); +export const OUT_DIR = path.join(ROOT_DIR, DIST_FOLDER); + +export const getEsbuildDevConfig = (rootDir) => ({ + bundle: true, + treeShaking: true, + logLevel: "info", + entryPoints: 
[path.join(rootDir, "src/transformers.js")], + platform: "neutral", + format: "esm", + sourcemap: true, + logOverride: { + // Suppress import.meta warning for CJS builds - it's handled gracefully in the code + "empty-import-meta": "silent", + }, +}); + +export const getEsbuildProdConfig = (rootDir) => ({ + ...getEsbuildDevConfig(rootDir), + logLevel: "warning", + sourcemap: false, +}); diff --git a/scripts/esbuild/build/httpServer.mjs b/scripts/esbuild/build/httpServer.mjs new file mode 100644 index 000000000..8c86514de --- /dev/null +++ b/scripts/esbuild/build/httpServer.mjs @@ -0,0 +1,74 @@ +import { createServer } from "node:http"; +import { existsSync, readFileSync, statSync } from "node:fs"; +import path from "node:path"; + +const MIME_TYPES = { + ".html": "text/html", + ".js": "text/javascript", + ".mjs": "text/javascript", + ".css": "text/css", + ".json": "application/json", + ".wasm": "application/wasm", + ".png": "image/png", + ".jpg": "image/jpeg", + ".gif": "image/gif", + ".svg": "image/svg+xml", + ".ico": "image/x-icon", +}; + +export const startServer = (dir, PORT = 8080) => + new Promise((resolve) => { + const server = createServer((req, res) => { + // Enable CORS + res.setHeader("Access-Control-Allow-Origin", "*"); + res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); + res.setHeader("Access-Control-Allow-Headers", "Content-Type"); + + if (req.method === "OPTIONS") { + res.writeHead(204); + res.end(); + return; + } + + let filePath = req.url === "/" ? "/index.html" : req.url; + filePath = filePath.split("?")[0]; // Remove query params + + // Try to serve from outdir first, then fall back to rootDir + let fullPath = path.join(dir, filePath); + + // Check if file exists + if (!existsSync(fullPath)) { + res.writeHead(404, { "Content-Type": "text/plain" }); + res.end("404 Not Found"); + return; + } + + // Check if it's a directory + const stat = statSync(fullPath); + if (stat.isDirectory()) { + fullPath = path.join(fullPath, "index.html"); + if (!existsSync(fullPath)) { + res.writeHead(404, { "Content-Type": "text/plain" }); + res.end("404 Not Found"); + return; + } + } + + // Get MIME type + const ext = path.extname(fullPath); + const mimeType = MIME_TYPES[ext] || "application/octet-stream"; + + try { + const content = readFileSync(fullPath); + res.writeHead(200, { "Content-Type": mimeType }); + res.end(content); + } catch (error) { + res.writeHead(500, { "Content-Type": "text/plain" }); + res.end("500 Internal Server Error"); + } + }); + + server.listen(PORT, () => { + resolve(server); + }); + }); diff --git a/scripts/esbuild/build/plugins/externalNodeBuiltinsPlugin.mjs b/scripts/esbuild/build/plugins/externalNodeBuiltinsPlugin.mjs new file mode 100644 index 000000000..ce0b28cee --- /dev/null +++ b/scripts/esbuild/build/plugins/externalNodeBuiltinsPlugin.mjs @@ -0,0 +1,14 @@ +/** + * Plugin to automatically mark all node:* imports as external. + * This prevents having to manually list all Node.js built-in modules. 
+ */ +export const externalNodeBuiltinsPlugin = () => ({ + name: "external-node-builtins", + setup(build) { + // Mark all node:* imports as external + build.onResolve({ filter: /^node:/ }, (args) => ({ + path: args.path, + external: true, + })); + }, +}); diff --git a/scripts/esbuild/build/plugins/ignoreModulesPlugin.mjs b/scripts/esbuild/build/plugins/ignoreModulesPlugin.mjs new file mode 100644 index 000000000..c9d57e6c6 --- /dev/null +++ b/scripts/esbuild/build/plugins/ignoreModulesPlugin.mjs @@ -0,0 +1,33 @@ +/** + * Plugin to ignore/exclude certain modules by returning an empty module. + * Equivalent to webpack's resolve.alias with false value. + */ +export const ignoreModulesPlugin = (modules = []) => ({ + name: "ignore-modules", + setup(build) { + // Escape special regex characters in module names + const escapeRegex = (str) => str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + const escapedModules = modules.map(escapeRegex); + + // Match both "module" and "node:module" patterns + const patterns = escapedModules.flatMap((mod) => [mod, `node:${mod}`]); + const filter = new RegExp(`^(${patterns.join("|")})$`); + + build.onResolve({ filter }, (args) => { + return { path: args.path, namespace: "ignore-modules" }; + }); + build.onLoad({ filter: /.*/, namespace: "ignore-modules" }, () => { + return { + contents: ` + const noop = () => {}; + const emptyObj = {}; + export default emptyObj; + export const Readable = { fromWeb: noop }; + export const pipeline = noop; + export const createWriteStream = noop; + export const createReadStream = noop; + `, + }; + }); + }, +}); diff --git a/scripts/esbuild/build/plugins/postBuildPlugin.mjs b/scripts/esbuild/build/plugins/postBuildPlugin.mjs new file mode 100644 index 000000000..8effd6970 --- /dev/null +++ b/scripts/esbuild/build/plugins/postBuildPlugin.mjs @@ -0,0 +1,38 @@ +import path from "node:path"; +import { copyFileSync, unlinkSync, existsSync } from "node:fs"; + +/** + * Plugin to post-process build files. + * Equivalent to webpack's PostBuildPlugin. + */ +export const postBuildPlugin = (distDir, rootDir) => { + // it should copy the files only once. In watch mode for example it should not rerun every time + let completed = false; + return { + name: "post-build", + setup(build) { + build.onEnd(() => { + if (completed) return; + completed = true; + + const ORT_JSEP_FILE = "ort-wasm-simd-threaded.jsep.mjs"; + const ORT_BUNDLE_FILE = "ort.bundle.min.mjs"; + + // 1. Remove unnecessary files + const file = path.join(distDir, ORT_BUNDLE_FILE); + if (existsSync(file)) unlinkSync(file); + + // 2. Copy unbundled JSEP file + try { + const ORT_SOURCE_DIR = path.join(rootDir, "node_modules/onnxruntime-web/dist"); + const src = path.join(ORT_SOURCE_DIR, ORT_JSEP_FILE); + const dest = path.join(distDir, ORT_JSEP_FILE); + copyFileSync(src, dest); + console.log(`Copied ${ORT_JSEP_FILE}`); + } catch (error) { + console.warn(`!!! 
Warning: Could not copy ${ORT_JSEP_FILE}:`, error.message); + } + }); + }, + }; +}; diff --git a/scripts/esbuild/build/plugins/rebuildPlugin.mjs b/scripts/esbuild/build/plugins/rebuildPlugin.mjs new file mode 100644 index 000000000..c385292d4 --- /dev/null +++ b/scripts/esbuild/build/plugins/rebuildPlugin.mjs @@ -0,0 +1,26 @@ +/** + * Plugin to log rebuild events with timing + */ +export const rebuildPlugin = (name) => { + let startTime = 0; + + return { + name: "rebuild-logger", + setup(build) { + build.onStart(() => { + startTime = performance.now(); + }); + + build.onEnd((result) => { + const endTime = performance.now(); + const duration = (endTime - startTime).toFixed(2); + + if (result.errors.length > 0) { + console.log(`\n${name} - Build failed with ${result.errors.length} error(s) in ${duration}ms`); + } else { + console.log(`\n${name} - Rebuilt in ${duration}ms`); + } + }); + }, + }; +}; diff --git a/scripts/esbuild/build/plugins/stripNodePrefixPlugin.mjs b/scripts/esbuild/build/plugins/stripNodePrefixPlugin.mjs new file mode 100644 index 000000000..1a4cab983 --- /dev/null +++ b/scripts/esbuild/build/plugins/stripNodePrefixPlugin.mjs @@ -0,0 +1,15 @@ +/** + * Plugin to strip the "node:" prefix from module requests. + * Equivalent to webpack's StripNodePrefixPlugin. + */ +export const stripNodePrefixPlugin = () => ({ + name: "strip-node-prefix", + setup(build) { + build.onResolve({ filter: /^node:/ }, (args) => { + return { + path: args.path.replace(/^node:/, ""), + external: true, + }; + }); + }, +}); diff --git a/scripts/esbuild/build/prepareOutDir.mjs b/scripts/esbuild/build/prepareOutDir.mjs new file mode 100644 index 000000000..5f9302d31 --- /dev/null +++ b/scripts/esbuild/build/prepareOutDir.mjs @@ -0,0 +1,9 @@ +import { existsSync, mkdirSync, rmSync } from "node:fs"; + +export default function prepareOutDir(dir) { + if (existsSync(dir)) { + rmSync(dir, { recursive: true, force: true }); + } + + mkdirSync(dir, { recursive: true }); +} diff --git a/scripts/esbuild/build/reportSize.mjs b/scripts/esbuild/build/reportSize.mjs new file mode 100644 index 000000000..b806f6b3f --- /dev/null +++ b/scripts/esbuild/build/reportSize.mjs @@ -0,0 +1,16 @@ +import { readFileSync } from "node:fs"; +import { gzipSync } from "node:zlib"; + +export const formatSize = (bytes) => { + if (bytes < 1024) return `${bytes}b`; + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}kb`; + return `${(bytes / (1024 * 1024)).toFixed(2)}mb`; +}; + +export const reportSize = (outfile) => { + const content = readFileSync(outfile); + const size = content.length; + const gzipSize = gzipSync(content).length; + + console.log(`\n${outfile}\n${formatSize(size)} (gzip: ${formatSize(gzipSize)})`); +}; diff --git a/scripts/esbuild/dev.mjs b/scripts/esbuild/dev.mjs new file mode 100644 index 000000000..c879185cf --- /dev/null +++ b/scripts/esbuild/dev.mjs @@ -0,0 +1,69 @@ +import { context } from "esbuild"; +import path from "node:path"; +import { postBuildPlugin } from "./build/plugins/postBuildPlugin.mjs"; +import { stripNodePrefixPlugin } from "./build/plugins/stripNodePrefixPlugin.mjs"; +import { ignoreModulesPlugin } from "./build/plugins/ignoreModulesPlugin.mjs"; +import { rebuildPlugin } from "./build/plugins/rebuildPlugin.mjs"; +import { externalNodeBuiltinsPlugin } from "./build/plugins/externalNodeBuiltinsPlugin.mjs"; +import { getEsbuildDevConfig, OUT_DIR, ROOT_DIR, WEB_IGNORE_MODULES } from "./build/constants.mjs"; +import { startServer } from "./build/httpServer.mjs"; +import prepareOutDir from 
"./build/prepareOutDir.mjs"; + +const startTime = performance.now(); + +prepareOutDir(OUT_DIR); + +console.log("\n=== BUILD ==="); +console.log("Building transformers.js with esbuild in watch mode..."); + +// Create build contexts for watch mode +const bundleContext = await context({ + ...getEsbuildDevConfig(ROOT_DIR), + outfile: path.join(OUT_DIR, "transformers.js"), + plugins: [ + ignoreModulesPlugin(WEB_IGNORE_MODULES), + stripNodePrefixPlugin(), + externalNodeBuiltinsPlugin(), + postBuildPlugin(OUT_DIR, ROOT_DIR), + rebuildPlugin("Bundle"), + ], +}); + +const webContext = await context({ + ...getEsbuildDevConfig(ROOT_DIR), + outfile: path.join(OUT_DIR, "transformers.web.js"), + external: ["onnxruntime-common", "onnxruntime-web"], + plugins: [ + ignoreModulesPlugin(WEB_IGNORE_MODULES), + stripNodePrefixPlugin(), + externalNodeBuiltinsPlugin(), + rebuildPlugin("Web"), + ], +}); + +console.log("\nInitial build starting..."); + +await Promise.all([bundleContext.watch(), webContext.watch()]); + +const endTime = performance.now(); +const duration = (endTime - startTime).toFixed(2); +console.log(`\nAll builds completed successfully in ${duration}ms!`); + +const PORT = 8080; + +console.log("\n=== SERVE ==="); +const server = await startServer(OUT_DIR, PORT); + +console.log(`\nServer running at http://localhost:${PORT}/`); +console.log(`Serving files from: ${OUT_DIR}`); + +console.log(`\nWatching for changes...\n`); + +// Keep process alive and cleanup +process.on("SIGINT", async () => { + console.log("\n\nStopping watch mode and server..."); + server.close(); + await bundleContext.dispose(); + await webContext.dispose(); + process.exit(0); +}); diff --git a/src/generation/logits_process.js b/src/generation/logits_process.js index 3bdff2a2f..ecd99b5e8 100644 --- a/src/generation/logits_process.js +++ b/src/generation/logits_process.js @@ -239,7 +239,7 @@ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor { export class WhisperTimeStampLogitsProcessor extends LogitsProcessor { /** * Constructs a new WhisperTimeStampLogitsProcessor. - * @param {import('../models/whisper/generation_whisper.js').WhisperGenerationConfig} generate_config The config object passed to the `generate()` method of a transformer model. + * @param {import('../models/model-processors/whisper/generation_whisper.js').WhisperGenerationConfig} generate_config The config object passed to the `generate()` method of a transformer model. * @param {number[]} init_tokens The initial tokens of the input sequence. 
*/ constructor(generate_config, init_tokens) { diff --git a/src/models.js b/src/models.js index 377bd7296..806f1f29a 100644 --- a/src/models.js +++ b/src/models.js @@ -37,8063 +37,46 @@ * @module models */ -import { AutoConfig, getCacheShapes } from './configs.js'; +import { AutoConfig } from './configs.js'; +import { PreTrainedModel } from './models/pre-trained-model.js'; -import { - deviceToExecutionProviders, - createInferenceSession, - isONNXTensor, - isONNXProxy, - runInferenceSession, -} from './backends/onnx.js'; -import { - DATA_TYPES, - DEFAULT_DEVICE_DTYPE_MAPPING, - DEFAULT_DTYPE_SUFFIX_MAPPING, - isWebGpuFp16Supported, -} from './utils/dtypes.js'; - -import { Callable } from './utils/generic.js'; - -import { mergeArrays, pick } from './utils/core.js'; - -import { getModelFile, getModelJSON, MAX_EXTERNAL_DATA_CHUNKS } from './utils/hub.js'; - -import { GITHUB_ISSUE_URL } from './utils/constants.js'; - -import { - LogitsProcessorList, - ForcedBOSTokenLogitsProcessor, - ForcedEOSTokenLogitsProcessor, - SuppressTokensAtBeginLogitsProcessor, - WhisperTimeStampLogitsProcessor, - NoRepeatNGramLogitsProcessor, - RepetitionPenaltyLogitsProcessor, - NoBadWordsLogitsProcessor, - MinLengthLogitsProcessor, - MinNewTokensLengthLogitsProcessor, - TemperatureLogitsWarper, - ClassifierFreeGuidanceLogitsProcessor, -} from './generation/logits_process.js'; - -import { GenerationConfig } from './generation/configuration_utils.js'; - -import { - cat, - mean, - zeros, - zeros_like, - ones, - ones_like, - full, - full_like, - stack, - std_mean, - Tensor, - DataTypeMap, - randn, -} from './utils/tensor.js'; -import { RawImage } from './utils/image.js'; - -import { dynamic_time_warping, max, medianFilter } from './utils/maths.js'; -import { EosTokenCriteria, MaxLengthCriteria, StoppingCriteriaList } from './generation/stopping_criteria.js'; -import { LogitsSampler } from './generation/logits_sampler.js'; -import { apis, env } from './env.js'; - -import { WhisperGenerationConfig } from './models/whisper/generation_whisper.js'; -import { whisper_language_to_code } from './models/whisper/common_whisper.js'; - -////////////////////////////////////////////////// -// Model types: used internally -const MODEL_TYPES = { - EncoderOnly: 0, - EncoderDecoder: 1, - Seq2Seq: 2, - Vision2Seq: 3, - DecoderOnly: 4, - MaskGeneration: 5, - ImageTextToText: 6, - Musicgen: 7, - MultiModality: 8, - Phi3V: 9, - AudioTextToText: 10, - AutoEncoder: 11, - ImageAudioTextToText: 12, - Supertonic: 13, - Chatterbox: 14, -}; -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Helper functions - -// NOTE: These will be populated fully later -const MODEL_TYPE_MAPPING = new Map(); -const MODEL_NAME_TO_CLASS_MAPPING = new Map(); -const MODEL_CLASS_TO_NAME_MAPPING = new Map(); - -/** - * Constructs an InferenceSession using a model file located at the specified path. - * @param {string} pretrained_model_name_or_path The path to the directory containing the model file. - * @param {string} fileName The name of the model file. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. - * @param {boolean} [is_decoder=false] Whether the model is a decoder model. - * @returns {Promise<{buffer_or_path: Uint8Array|string, session_options: Object, session_config: Object}>} A Promise that resolves to the data needed to create an InferenceSession object. 
- * @private - */ -async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) { - let custom_config = options.config?.['transformers.js_config'] ?? {}; - - let device = options.device ?? custom_config.device; - if (device && typeof device !== 'string') { - if (device.hasOwnProperty(fileName)) { - device = device[fileName]; - } else { - console.warn(`device not specified for "${fileName}". Using the default device.`); - device = null; - } - } - - // If the device is not specified, we use the default (supported) execution providers. - const selectedDevice = /** @type {import("./utils/devices.js").DeviceType} */ ( - device ?? (apis.IS_NODE_ENV ? 'cpu' : 'wasm') - ); - - const executionProviders = deviceToExecutionProviders(selectedDevice); - - // Update custom config with the selected device's config, if it exists - const device_config = custom_config.device_config ?? {}; - if (device_config.hasOwnProperty(selectedDevice)) { - custom_config = { - ...custom_config, - ...device_config[selectedDevice], - }; - } - - // If options.dtype is specified, we use it to choose the suffix for the model file. - // Otherwise, we use the default dtype for the device. - let dtype = options.dtype ?? custom_config.dtype; - if (typeof dtype !== 'string') { - if (dtype && dtype.hasOwnProperty(fileName)) { - dtype = dtype[fileName]; - } else { - dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32; - console.warn( - `dtype not specified for "${fileName}". Using the default dtype (${dtype}) for this device (${selectedDevice}).`, - ); - } - } - - if (dtype === DATA_TYPES.auto) { - // Try to choose the auto dtype based on the custom config - let config_dtype = custom_config.dtype; - if (typeof config_dtype !== 'string') { - config_dtype = config_dtype?.[fileName]; - } - - if (config_dtype && config_dtype !== DATA_TYPES.auto && DATA_TYPES.hasOwnProperty(config_dtype)) { - // Defined by the config, and is not "auto" - dtype = config_dtype; - } else { - // Choose default dtype based on device, falling back to fp32 - dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32; - } - } - - const selectedDtype = /** @type {import("./utils/dtypes.js").DataType} */ (dtype); - - if (!DEFAULT_DTYPE_SUFFIX_MAPPING.hasOwnProperty(selectedDtype)) { - throw new Error(`Invalid dtype: ${selectedDtype}. Should be one of: ${Object.keys(DATA_TYPES).join(', ')}`); - } else if ( - selectedDevice === 'webgpu' && - // NOTE: Currently, we assume that the Native WebGPU EP always supports fp16. In future, we will add a check for this. - !apis.IS_NODE_ENV && - selectedDtype === DATA_TYPES.fp16 && - !(await isWebGpuFp16Supported()) - ) { - throw new Error(`The device (${selectedDevice}) does not support fp16.`); - } - - // Only valid for models with a decoder - const kv_cache_dtype_config = custom_config.kv_cache_dtype; - const kv_cache_dtype = kv_cache_dtype_config - ? typeof kv_cache_dtype_config === 'string' - ? kv_cache_dtype_config - : (kv_cache_dtype_config[selectedDtype] ?? 'float32') - : undefined; - - if (kv_cache_dtype && !['float32', 'float16'].includes(kv_cache_dtype)) { - throw new Error(`Invalid kv_cache_dtype: ${kv_cache_dtype}. Should be one of: float32, float16`); - } - - const session_config = { - dtype: selectedDtype, - kv_cache_dtype, - device: selectedDevice, - }; - - // Construct the model file name - const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[selectedDtype]; - const baseName = `${fileName}${suffix}.onnx`; - const modelFileName = `${options.subfolder ?? 
''}/${baseName}`; - - const session_options = { ...options.session_options }; - - // Overwrite `executionProviders` if not specified - session_options.executionProviders ??= executionProviders; - - // Overwrite `freeDimensionOverrides` if specified in config and not set in session options - const free_dimension_overrides = custom_config.free_dimension_overrides; - if (free_dimension_overrides) { - session_options.freeDimensionOverrides ??= free_dimension_overrides; - } else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) { - console.warn( - `WebNN does not currently support dynamic shapes and requires 'free_dimension_overrides' to be set in config.json, preferably as a field within config["transformers.js_config"]["device_config"]["${selectedDevice}"]. ` + - `When 'free_dimension_overrides' is not set, you may experience significant performance degradation.`, - ); - } - - const return_path = apis.IS_NODE_ENV && env.useFSCache; - const bufferOrPathPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options, return_path); - - // Handle onnx external data files - const use_external_data_format = options.use_external_data_format ?? custom_config.use_external_data_format; - /** @type {Promise[]} */ - let externalDataPromises = []; - if (use_external_data_format) { - let external_data_format; - if (typeof use_external_data_format === 'object') { - if (use_external_data_format.hasOwnProperty(baseName)) { - external_data_format = use_external_data_format[baseName]; - } else if (use_external_data_format.hasOwnProperty(fileName)) { - external_data_format = use_external_data_format[fileName]; - } else { - external_data_format = false; - } - } else { - external_data_format = use_external_data_format; - } - - const num_chunks = +external_data_format; // (false=0, true=1, number remains the same) - if (num_chunks > MAX_EXTERNAL_DATA_CHUNKS) { - throw new Error( - `The number of external data chunks (${num_chunks}) exceeds the maximum allowed value (${MAX_EXTERNAL_DATA_CHUNKS}).`, - ); - } - for (let i = 0; i < num_chunks; ++i) { - const path = `${baseName}_data${i === 0 ? '' : '_' + i}`; - const fullPath = `${options.subfolder ?? ''}/${path}`; - externalDataPromises.push( - new Promise(async (resolve, reject) => { - const data = await getModelFile( - pretrained_model_name_or_path, - fullPath, - true, - options, - return_path, - ); - resolve(data instanceof Uint8Array ? 
{ path, data } : path); - }), - ); - } - } else if (session_options.externalData !== undefined) { - externalDataPromises = session_options.externalData.map(async (ext) => { - // if the external data is a string, fetch the file and replace the string with its content - // @ts-expect-error TS2339 - if (typeof ext.data === 'string') { - // @ts-expect-error TS2339 - const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options); - // @ts-expect-error TS2698 - return { ...ext, data: ext_buffer }; - } - return ext; - }); - } - - if (externalDataPromises.length > 0) { - const externalData = await Promise.all(externalDataPromises); - if (!apis.IS_NODE_ENV) { - session_options.externalData = externalData; - } - } - - if (is_decoder && selectedDevice === 'webgpu' && kv_cache_dtype_config !== false) { - const shapes = getCacheShapes(options.config, { - prefix: 'present', - }); - if (Object.keys(shapes).length > 0 && !isONNXProxy()) { - // Only set preferredOutputLocation if shapes are present and we aren't proxying ONNX - /** @type {Record} */ - const preferredOutputLocation = {}; - for (const key in shapes) { - preferredOutputLocation[key] = 'gpu-buffer'; - } - session_options.preferredOutputLocation = preferredOutputLocation; - } - } - - const buffer_or_path = await bufferOrPathPromise; - - return { buffer_or_path, session_options, session_config }; -} - -/** - * Helper function to create multiple InferenceSession objects. - * - * @param {string} pretrained_model_name_or_path The path to the directory containing the model file. - * @param {Record} names The names of the model files to load. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. - * @param {string} [decoder_name] The name of the decoder model, if any. - * @returns {Promise>} A Promise that resolves to a dictionary of InferenceSession objects. - * @private - */ -async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = undefined) { - return Object.fromEntries( - await Promise.all( - Object.keys(names).map(async (name) => { - const { buffer_or_path, session_options, session_config } = await getSession( - pretrained_model_name_or_path, - names[name], - options, - name === decoder_name, - ); - const session = await createInferenceSession(buffer_or_path, session_options, session_config); - return [name, session]; - }), - ), - ); -} - -/** - * Helper function to load multiple optional configuration files - * @param {string} pretrained_model_name_or_path The path to the directory containing the config file. - * @param {Record} names The names of the config files to load. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the configs. - * @returns {Promise>} A Promise that resolves to a dictionary of configuration objects. - * @private - */ -async function getOptionalConfigs(pretrained_model_name_or_path, names, options) { - return Object.fromEntries( - await Promise.all( - Object.keys(names).map(async (name) => { - const config = await getModelJSON(pretrained_model_name_or_path, names[name], false, options); - return [name, config]; - }), - ), - ); -} - -/** - * Validate model inputs - * @param {Object} session The InferenceSession object that will be run. - * @param {Object} inputs The inputs to check. - * @returns {Record} The checked inputs. - * @throws {Error} If any inputs are missing. 
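// A minimal standalone sketch of the file-name and external-data naming logic above.
// The '_q4' suffix below is a hypothetical example; the real value comes from
// DEFAULT_DTYPE_SUFFIX_MAPPING, which is not shown in this hunk.
function resolveModelFiles(fileName, suffix, use_external_data_format) {
  const baseName = `${fileName}${suffix}.onnx`;
  // Same `+x` coercion as above: false -> 0 chunks, true -> 1 chunk, a number stays as-is.
  const num_chunks = +use_external_data_format;
  const chunks = [];
  for (let i = 0; i < num_chunks; ++i) {
    chunks.push(`${baseName}_data${i === 0 ? '' : '_' + i}`);
  }
  return { baseName, chunks };
}
// Example: a merged decoder exported with a hypothetical 4-bit suffix and 2 external data chunks.
console.log(resolveModelFiles('decoder_model_merged', '_q4', 2));
// -> { baseName: 'decoder_model_merged_q4.onnx',
//      chunks: ['decoder_model_merged_q4.onnx_data', 'decoder_model_merged_q4.onnx_data_1'] }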
- * @private - */ -function validateInputs(session, inputs) { - /** - * NOTE: Create either a shallow or deep copy based on `onnx.wasm.proxy` - * @type {Record} - */ - const checkedInputs = Object.create(null); - const missingInputs = []; - for (const inputName of session.inputNames) { - const tensor = inputs[inputName]; - // Rare case where one of the model's input names corresponds to a built-in - // object name (e.g., toString), which would cause a simple (!tensor) check to fail, - // because it's not undefined but a function. - if (!(tensor instanceof Tensor)) { - missingInputs.push(inputName); - continue; - } - // NOTE: When `env.wasm.proxy is true` the tensor is moved across the Worker - // boundary, transferring ownership to the worker and invalidating the tensor. - // So, in this case, we simply sacrifice a clone for it. - checkedInputs[inputName] = isONNXProxy() ? tensor.clone() : tensor; - } - if (missingInputs.length > 0) { - throw new Error( - `An error occurred during model execution: "Missing the following inputs: ${missingInputs.join(', ')}.`, - ); - } - - const numInputsProvided = Object.keys(inputs).length; - const numInputsNeeded = session.inputNames.length; - if (numInputsProvided > numInputsNeeded) { - // No missing inputs, but too many inputs were provided. - // Warn the user and ignore the extra inputs. - let ignored = Object.keys(inputs).filter((inputName) => !session.inputNames.includes(inputName)); - console.warn( - `WARNING: Too many inputs were provided (${numInputsProvided} > ${numInputsNeeded}). The following inputs will be ignored: "${ignored.join(', ')}".`, - ); - } - - return checkedInputs; -} - -/** - * Executes an InferenceSession using the specified inputs. - * NOTE: `inputs` must contain at least the input names of the model. - * - If additional inputs are passed, they will be ignored. - * - If inputs are missing, an error will be thrown. - * - * @param {Object} session The InferenceSession object to run. - * @param {Object} inputs An object that maps input names to input tensors. - * @returns {Promise} A Promise that resolves to an object that maps output names to output tensors. - * @private - */ -async function sessionRun(session, inputs) { - const checkedInputs = validateInputs(session, inputs); - try { - // pass the original ort tensor - const ortFeed = Object.fromEntries(Object.entries(checkedInputs).map(([k, v]) => [k, v.ort_tensor])); - const output = await runInferenceSession(session, ortFeed); - return replaceTensors(output); - } catch (e) { - // Error messages can be long (nested) and uninformative. For this reason, - // we apply minor formatting to show the most important information - const formatted = Object.fromEntries( - Object.entries(checkedInputs).map(([k, tensor]) => { - // Extract these properties from the underlying ORT tensor - const unpacked = { - type: tensor.type, - dims: tensor.dims, - location: tensor.location, - }; - if (unpacked.location !== 'gpu-buffer') { - // Only return the data if it's not a GPU buffer - unpacked.data = tensor.data; - } - return [k, unpacked]; - }), - ); - - // This usually occurs when the inputs are of the wrong type. - console.error(`An error occurred during model execution: "${e}".`); - console.error('Inputs given to model:', formatted); - throw e; - } -} - -/** - * Replaces ONNX Tensor objects with custom Tensor objects to support additional functions. - * @param {Object} obj The object to replace tensor objects in. 
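// A minimal sketch of the pitfall noted in validateInputs above: when a model input is named
// after a built-in object property (e.g. 'toString'), a plain `!inputs[name]` check misses it,
// because the lookup finds the inherited function instead of `undefined`.
const exampleInputs = {}; // caller provided no tensors at all
console.log(!exampleInputs['input_ids']); // true  -> correctly detected as missing
console.log(!exampleInputs['toString']);  // false -> looks "present", but it is Object.prototype.toString
// Hence the `instanceof Tensor` check above, which rejects anything that is not a real tensor.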
- * @returns {Object} The object with tensor objects replaced by custom Tensor objects. - * @private - */ -function replaceTensors(obj) { - for (let prop in obj) { - if (isONNXTensor(obj[prop])) { - obj[prop] = new Tensor(obj[prop]); - } else if (typeof obj[prop] === 'object') { - replaceTensors(obj[prop]); - } - } - return obj; -} - -/** - * Converts an array or Tensor of integers to an int64 Tensor. - * @param {any[]|Tensor} items The input integers to be converted. - * @returns {Tensor} The int64 Tensor with the converted values. - * @throws {Error} If the input array is empty or the input is a batched Tensor and not all sequences have the same length. - * @private - */ -function toI64Tensor(items) { - if (items instanceof Tensor) { - return items; - } - // items is an array - if (items.length === 0) { - throw Error('items must be non-empty'); - } - - if (Array.isArray(items[0])) { - // batched - if (items.some((x) => x.length !== items[0].length)) { - throw Error( - "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.", - ); - } - - return new Tensor('int64', BigInt64Array.from(items.flat().map((x) => BigInt(x))), [ - items.length, - items[0].length, - ]); - } else { - //flat - return new Tensor('int64', BigInt64Array.from(items.map((x) => BigInt(x))), [1, items.length]); - } -} - -/** - * Creates a boolean tensor with a single value. - * @param {boolean} value The value of the tensor. - * @returns {Tensor} The boolean tensor. - * @private - */ -function boolTensor(value) { - return new Tensor('bool', [value], [1]); -} - -// JS doesn't support mixins, so we define some reused functions here, and allow "this" to be passed in -/** - * Perform forward pass on the seq2seq model (both encoder and decoder). - * @param {Object} self The seq2seq model object. - * @param {Object} model_inputs The input object for the model containing encoder and decoder inputs. - * @returns {Promise} Promise that resolves with the output of the seq2seq model. - * @private - */ -async function seq2seqForward(self, model_inputs) { - let { encoder_outputs, input_ids, decoder_input_ids, ...other_decoder_inputs } = model_inputs; - // Encode if needed - if (!encoder_outputs) { - const encoder_inputs = pick(model_inputs, self.sessions['model'].inputNames); - // Encoder outputs are not given, so we must compute them. - encoder_outputs = (await encoderForward(self, encoder_inputs)).last_hidden_state; - } - - other_decoder_inputs.input_ids = decoder_input_ids; - other_decoder_inputs.encoder_hidden_states = encoder_outputs; - - if (self.sessions['decoder_model_merged'].inputNames.includes('encoder_attention_mask')) { - other_decoder_inputs.encoder_attention_mask = model_inputs.attention_mask; - } - - const decoderResults = await decoderForward(self, other_decoder_inputs, true); - - return decoderResults; -} - -/** - * Forward pass of an encoder model. - * @param {Object} self The encoder model. - * @param {Object} model_inputs The input data to be used for the forward pass. - * @returns {Promise} The model's outputs. 
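// A minimal standalone sketch of the shape and typing rules toI64Tensor enforces above,
// using a plain BigInt64Array instead of the library's Tensor wrapper:
function toInt64Data(items) {
  if (items.length === 0) throw new Error('items must be non-empty');
  if (Array.isArray(items[0])) {
    // Batched input: every row must have the same length, otherwise padding/truncation is needed.
    if (items.some((x) => x.length !== items[0].length)) {
      throw new Error('All sequences must have the same length');
    }
    return { data: BigInt64Array.from(items.flat().map(BigInt)), dims: [items.length, items[0].length] };
  }
  // Flat input gets an implicit batch dimension of 1.
  return { data: BigInt64Array.from(items.map(BigInt)), dims: [1, items.length] };
}
console.log(toInt64Data([1, 2, 3]).dims);        // -> [1, 3]
console.log(toInt64Data([[1, 2], [3, 4]]).dims); // -> [2, 2]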
- * @private - */ -async function encoderForward(self, model_inputs) { - const session = self.sessions['model']; - const encoderFeeds = pick(model_inputs, session.inputNames); - - if (session.inputNames.includes('inputs_embeds') && !encoderFeeds.inputs_embeds) { - if (!model_inputs.input_ids) { - throw new Error('Both `input_ids` and `inputs_embeds` are missing in the model inputs.'); - } - encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids }); - } - if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) { - if (!encoderFeeds.input_ids) { - throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.'); - } - // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it, - // but they weren't created by the tokenizer. - encoderFeeds.token_type_ids = zeros_like(encoderFeeds.input_ids); - } - if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) { - if (!encoderFeeds.pixel_values) { - throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.'); - } - // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it, - // but they weren't created by the processor. - const dims = encoderFeeds.pixel_values.dims; - encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]); - } - - return await sessionRun(session, encoderFeeds); -} - -async function autoEncoderForward(self, model_inputs) { - const encoded = await self.encode(model_inputs); - const decoded = await self.decode(encoded); - return decoded; -} - -/** - * Forward pass of a decoder model. - * @param {Object} self The decoder model. - * @param {Object} model_inputs The input data to be used for the forward pass. - * @returns {Promise} The logits and past key values. - * @private - */ -async function decoderForward(self, model_inputs, is_encoder_decoder = false) { - const session = self.sessions[is_encoder_decoder ? 'decoder_model_merged' : 'model']; - - const { past_key_values, ...new_model_inputs } = model_inputs; - - if (session.inputNames.includes('use_cache_branch')) { - new_model_inputs.use_cache_branch = boolTensor(!!past_key_values); - } - if ( - session.inputNames.includes('position_ids') && - new_model_inputs.attention_mask && - !new_model_inputs.position_ids - ) { - // NOTE: Handle a special case for paligemma/gemma3 models, where positions are 1-indexed - const start_index = ['paligemma', 'gemma3_text', 'gemma3'].includes(self.config.model_type) ? 
1 : 0; - new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index); - } - - // Unpack the `past_key_values` object into model inputs - self.addPastKeyValues(new_model_inputs, past_key_values); - - // Select only the inputs that are needed for the current session - const fixed = pick(new_model_inputs, session.inputNames); - return await sessionRun(session, fixed); -} - -function default_merge_input_ids_with_features({ - modality_token_id, - inputs_embeds, - modality_features, - input_ids, - attention_mask, -}) { - const token_positions = input_ids.tolist().map((ids) => - ids.reduce((acc, x, idx) => { - if (x == modality_token_id) acc.push(idx); - return acc; - }, []), - ); - const n_tokens = token_positions.reduce((acc, x) => acc + x.length, 0); - const n_features = modality_features.dims[0]; - if (n_tokens !== n_features) { - throw new Error(`Number of tokens and features do not match: tokens: ${n_tokens}, features ${n_features}`); - } - - // Equivalent to performing a masked_scatter - let img = 0; - for (let i = 0; i < token_positions.length; ++i) { - const tokens = token_positions[i]; - const embeds = inputs_embeds[i]; - for (let j = 0; j < tokens.length; ++j) { - embeds[tokens[j]].data.set(modality_features[img++].data); - } - } - return { inputs_embeds, attention_mask }; -} - -function default_merge_input_ids_with_image_features({ - image_token_id, - inputs_embeds, - image_features, - input_ids, - attention_mask, -}) { - return default_merge_input_ids_with_features({ - modality_token_id: image_token_id, - inputs_embeds, - modality_features: image_features, - input_ids, - attention_mask, - }); -} - -function default_merge_input_ids_with_audio_features({ - audio_token_id, - inputs_embeds, - audio_features, - input_ids, - attention_mask, -}) { - return default_merge_input_ids_with_features({ - modality_token_id: audio_token_id, - inputs_embeds, - modality_features: audio_features, - input_ids, - attention_mask, - }); -} - -/** - * Abstract forward pass function for image-text-to-text or audio-text-to-text models. - * @param {Object} self The model object. - * @param {Object} params Additional parameters. - * @param {Function} [params.encode_function] The function to encode the modality values. - * @param {Function} [params.merge_function] The function to merge the modality features with the input embeddings. - * @param {string} [params.modality_input_name] The modality input name. - * @param {string} [params.modality_output_name] The modality output name. - * @param {Tensor} [params.input_ids=null] - * @param {Tensor} [params.attention_mask=null] - * @param {Tensor} [params.position_ids=null] - * @param {Tensor} [params.inputs_embeds=null] - * @param {Tensor} [params.past_key_values=null] - * @param {Object} [params.generation_config=null] - * @param {Object} [params.logits_processor=null] - * @returns {Promise} The model's output tensor - * @private - */ -async function genericTextToTextForward( - self, - { - // Generic parameters: - encode_function, - merge_function, - modality_input_name, - modality_output_name, - - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // Additional parameters - ...kwargs - }, -) { - const modality_values = kwargs[modality_input_name]; - if (!inputs_embeds) { - // 1. 
Extract the text embeddings. - inputs_embeds = await self.encode_text({ input_ids, ...kwargs }); - - // 2. Possibly, merge text and modality values - if (modality_values && input_ids.dims[1] !== 1) { - const modality_features = await encode_function({ - // Pass the modality values under its expected key. - // The caller knows whether this is audio or image. - [modality_input_name]: modality_values, - ...kwargs, - }); - ({ inputs_embeds, attention_mask } = merge_function({ - [modality_output_name]: modality_features, - inputs_embeds, - input_ids, - attention_mask, - })); - } else if (past_key_values && modality_values && input_ids.dims[1] === 1) { - // This branch handles the cache case. - const target_length = input_ids.dims[1]; // always 1 - const past_length = Object.values(past_key_values)[0].dims.at(-2); - - attention_mask = cat( - [ - ones([input_ids.dims[0], past_length]), - attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]]), - ], - 1, - ); - } - } - - if (!position_ids) { - if (self.config.model_type === 'qwen2_vl') { - // Special case for qwen2_vl models - // @ts-ignore - const { image_grid_thw, video_grid_thw } = kwargs; - [position_ids] = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask); - } - } - - // 3. Call the decoder forward using the updated inputs. - const outputs = await decoderForward( - self, - { - inputs_embeds, - past_key_values, - attention_mask, - position_ids, - generation_config, - logits_processor, - }, - true, - ); - return outputs; -} - -/** - * Forward pass of an audio-text-to-text model. - * @param {Object} self The audio-text-to-text model. - * @param {Object} params The inputs for the audio-text-to-text forward pass. - * @returns {Promise} The model's output tensor. - * @private - */ -async function audioTextToTextForward(self, params) { - return await genericTextToTextForward(self, { - ...params, - modality_input_name: 'audio_values', - modality_output_name: 'audio_features', - encode_function: self.encode_audio.bind(self), - merge_function: self._merge_input_ids_with_audio_features.bind(self), - }); -} - -/** - * Forward pass of an image-text-to-text model. - * @param {Object} self The image-text-to-text model. - * @param {Object} params The inputs for the image-text-to-text forward pass. - * @returns {Promise} The model's output tensor. 
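// A minimal standalone sketch of the feature merge used by the helpers above: embeddings at
// the positions of the modality placeholder token are overwritten, in order, by the encoded
// modality features. Plain arrays stand in for the library's Tensor objects, and the token id
// 32000 is just an example value.
function mergeFeatures(inputIds, inputsEmbeds, features, modalityTokenId) {
  let f = 0;
  for (let i = 0; i < inputIds.length; ++i) {
    for (let j = 0; j < inputIds[i].length; ++j) {
      if (inputIds[i][j] === modalityTokenId) {
        inputsEmbeds[i][j] = features[f++]; // masked_scatter equivalent
      }
    }
  }
  if (f !== features.length) throw new Error(`tokens (${f}) and features (${features.length}) do not match`);
  return inputsEmbeds;
}
// One image placeholder followed by two text tokens:
console.log(mergeFeatures([[32000, 5, 7]], [[[0, 0], [1, 1], [2, 2]]], [[9, 9]], 32000));
// -> [[[9, 9], [1, 1], [2, 2]]]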
- * @private - */ -async function imageTextToTextForward(self, params) { - return await genericTextToTextForward(self, { - ...params, - modality_input_name: 'pixel_values', - modality_output_name: 'image_features', - encode_function: self.encode_image.bind(self), - merge_function: self._merge_input_ids_with_image_features.bind(self), - }); -} - -/** - * Helper function to perform the following: - * ```python - * x = attention_mask.long().cumsum(-1) - 1 - * x.masked_fill_(attention_mask == 0, 1) - * ``` - * @param {Tensor} attention_mask - * @returns {{data: BigInt64Array, dims: number[]}} - */ -function cumsum_masked_fill(attention_mask, start_index = 0) { - const [bz, seq_len] = attention_mask.dims; - const attn_mask_data = attention_mask.data; - - const data = new BigInt64Array(attn_mask_data.length); - for (let i = 0; i < bz; ++i) { - const start = i * seq_len; - let sum = BigInt(start_index); - for (let j = 0; j < seq_len; ++j) { - const index = start + j; - if (attn_mask_data[index] === 0n) { - data[index] = BigInt(1); - } else { - // === 1n - data[index] = sum; - sum += attn_mask_data[index]; - } - } - } - return { data, dims: attention_mask.dims }; -} - -/** - * If the model supports providing position_ids, we create position_ids on the fly for batch generation, - * by computing the cumulative sum of the attention mask along the sequence length dimension. - * - * Equivalent to: - * ```python - * position_ids = attention_mask.long().cumsum(-1) - 1 - * position_ids.masked_fill_(attention_mask == 0, 1) - * if past_key_values: - * position_ids = position_ids[:, -input_ids.shape[1] :] - * ``` - */ -function createPositionIds(model_inputs, past_key_values = null, start_index = 0) { - const { input_ids, inputs_embeds, attention_mask } = model_inputs; - - const { data, dims } = cumsum_masked_fill(attention_mask, start_index); - let position_ids = new Tensor('int64', data, dims); - if (past_key_values) { - const offset = -(input_ids ?? inputs_embeds).dims.at(1); - position_ids = position_ids.slice(null, [offset, null]); - } - return position_ids; -} - -function decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - const past_length = model_inputs.past_key_values ? Object.values(model_inputs.past_key_values)[0].dims.at(-2) : 0; - - if (!model_inputs.attention_mask) { - // If the attention mask is not provided, we attempt to infer based on provided inputs - let dims; - for (const key of ['input_ids', 'inputs_embeds', 'position_ids']) { - if (model_inputs[key]) { - dims = model_inputs[key].dims; - break; - } - } - if (!dims) { - throw new Error('attention_mask is not provided, and unable to infer its shape from model inputs.'); - } - model_inputs.attention_mask = ones([dims[0], past_length + dims[1]]); - } - - if (model_inputs.past_key_values) { - const { input_ids, attention_mask } = model_inputs; - - // Keep only the unprocessed tokens: - // 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - // some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - // input) - if (attention_mask && attention_mask.dims[1] > input_ids.dims[1]) { - // NOTE: not needed since we only pass the generated tokens to the next forward pass - // const offset = -(attention_mask.dims[1] - past_length); - // model_inputs.input_ids = input_ids.slice(null, [offset, null]); - } - // 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
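// A minimal standalone sketch of the cumsum-with-masked-fill above, applied to a left-padded
// batch (plain numbers instead of BigInt64Array for readability):
function positionIds(attentionMask, startIndex = 0) {
  return attentionMask.map((row) => {
    let sum = startIndex;
    return row.map((m) => (m === 0 ? 1 : sum++));
  });
}
// Two sequences, the second left-padded by two tokens:
console.log(positionIds([
  [1, 1, 1, 1],
  [0, 0, 1, 1],
]));
// -> [[0, 1, 2, 3],
//     [1, 1, 0, 1]]  (padded positions are filled with 1, real tokens count from start_index)
// For the 1-indexed models handled above (paligemma/gemma3), startIndex = 1 would yield [1, 2, 3, 4].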
- // We can discard input_ids based on the past_length. - else if (past_length < input_ids.dims[1]) { - // NOTE: Required for phi models. - // See https://github.com/huggingface/transformers/issues/30809#issuecomment-2111918479 for more information. - model_inputs.input_ids = input_ids.slice(null, [past_length, null]); - } - // 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - else { - } - } - - return model_inputs; -} - -function encoder_decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - if (model_inputs.past_key_values) { - input_ids = input_ids.map((x) => [x.at(-1)]); - } - - return { - ...model_inputs, - decoder_input_ids: toI64Tensor(input_ids), - }; -} - -function multimodal_text_to_text_prepare_inputs_for_generation(self, ...args) { - if (self.config.is_encoder_decoder) { - return encoder_decoder_prepare_inputs_for_generation(self, ...args); - } else { - return decoder_prepare_inputs_for_generation(self, ...args); - } -} - -function multimodality_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - const has_past_key_values = !!model_inputs.past_key_values; - - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - if (has_past_key_values) { - model_inputs.input_ids = cat([model_inputs.input_ids, model_inputs.input_ids], 0); - // NOTE: attention_mask handled in generation - } else { - model_inputs.input_ids = cat( - [model_inputs.input_ids, full_like(model_inputs.input_ids, BigInt(generation_config.pad_token_id))], - 0, - ); - model_inputs.attention_mask = cat( - [model_inputs.attention_mask, full_like(model_inputs.attention_mask, 0n)], - 0, - ); - } - } - - if (has_past_key_values || !model_inputs.pixel_values) { - model_inputs.pixel_values = full([0, 0, 3, 384, 384], 1.0); - } - - if (has_past_key_values) { - const num_img_tokens = 0; - const num_text_tokens = 1; - const has_image = num_img_tokens > 0 ? 1 : 0; - - const batch_size = 1; - model_inputs.images_seq_mask = new Tensor( - 'bool', - new Array(num_img_tokens + num_text_tokens).fill(true).fill(false, 0, num_text_tokens), - [batch_size, num_img_tokens + num_text_tokens], - ); - model_inputs.images_emb_mask = new Tensor('bool', new Array(num_img_tokens).fill(!!has_image), [ - batch_size, - 1, - num_img_tokens, - ]); - } - return model_inputs; -} - -function chatterbox_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - if (!model_inputs.position_ids && self.sessions['embed_tokens'].inputNames.includes('position_ids')) { - // If position_ids are not provided, we create them on the fly using the position of the START_SPEECH_TOKEN - const START_SPEECH_TOKEN = 6561; - if (model_inputs.input_ids.dims[1] === 1) { - const position_ids = Array.from( - { - length: input_ids.length, - }, - (_, i) => input_ids[i].length - input_ids[i].findLastIndex((x) => x == START_SPEECH_TOKEN) - 1, - ); - model_inputs.position_ids = new Tensor('int64', position_ids, [input_ids.length, 1]); - } else { - const batched_input_ids = model_inputs.input_ids.tolist(); - const position_ids_list = batched_input_ids.map((ids) => { - let position = 0; - return ids.map((id) => (id >= START_SPEECH_TOKEN ? 
0 : position++)); - }); - model_inputs.position_ids = new Tensor('int64', position_ids_list.flat(), model_inputs.input_ids.dims); - } - } - if (model_inputs.input_ids.dims[1] === 1) { - // We are in generation mode and no longer need the audio inputs - delete model_inputs.audio_values; - delete model_inputs.audio_features; - delete model_inputs.audio_tokens; - delete model_inputs.speaker_embeddings; - delete model_inputs.speaker_features; - } - return decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config); -} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -/** - * A base class for pre-trained models that provides the model configuration and an ONNX session. - */ -export class PreTrainedModel extends Callable { - main_input_name = 'input_ids'; - forward_params = ['input_ids', 'attention_mask']; - - _return_dict_in_generate_keys = null; - /** - * Creates a new instance of the `PreTrainedModel` class. - * @param {import('./configs.js').PretrainedConfig} config The model configuration. - * @param {Record} sessions The inference sessions for the model. - * @param {Record} configs Additional configuration files (e.g., generation_config.json). - */ - constructor(config, sessions, configs) { - super(); - - this.config = config; - this.sessions = sessions; - this.configs = configs; - - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); - const modelType = MODEL_TYPE_MAPPING.get(modelName); - - this.can_generate = false; - this._forward = null; - - this._prepare_inputs_for_generation = null; - switch (modelType) { - case MODEL_TYPES.DecoderOnly: - this.can_generate = true; - this._forward = decoderForward; - this._prepare_inputs_for_generation = decoder_prepare_inputs_for_generation; - break; - case MODEL_TYPES.Seq2Seq: - case MODEL_TYPES.Vision2Seq: - case MODEL_TYPES.Musicgen: - this.can_generate = true; - - this._forward = seq2seqForward; - this._prepare_inputs_for_generation = encoder_decoder_prepare_inputs_for_generation; - break; - - case MODEL_TYPES.EncoderDecoder: - this._forward = seq2seqForward; - break; - case MODEL_TYPES.ImageTextToText: - this.can_generate = true; - this._forward = imageTextToTextForward; - this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation; - break; - case MODEL_TYPES.AudioTextToText: - this.can_generate = true; - this._forward = audioTextToTextForward; - this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation; - break; - case MODEL_TYPES.Phi3V: - case MODEL_TYPES.ImageAudioTextToText: - this.can_generate = true; - this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation; - break; - case MODEL_TYPES.MultiModality: - this.can_generate = true; - this._prepare_inputs_for_generation = multimodality_prepare_inputs_for_generation; - break; - case MODEL_TYPES.AutoEncoder: - this._forward = autoEncoderForward; - break; - case MODEL_TYPES.Chatterbox: - this.can_generate = true; - this._prepare_inputs_for_generation = chatterbox_prepare_inputs_for_generation; - default: - // should be MODEL_TYPES.EncoderOnly - this._forward = encoderForward; - break; - } - - if (this.can_generate) { - this.forward_params.push('past_key_values'); - } - - /** @type {import('./configs.js').TransformersJSConfig} */ - this.custom_config = this.config['transformers.js_config'] ?? {}; - } - - /** - * Disposes of all the ONNX sessions that were created during inference. 
- * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. - * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry - */ - async dispose() { - const promises = []; - for (const session of Object.values(this.sessions)) { - promises.push(session.release?.()); - } - return await Promise.all(promises); - } - - /** - * Instantiate one of the model classes of the library from a pretrained model. - * - * The model class to instantiate is selected based on the `model_type` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. - * - * @returns {Promise} A new instance of the `PreTrainedModel` class. - */ - static async from_pretrained( - pretrained_model_name_or_path, - { - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - model_file_name = null, - subfolder = 'onnx', - device = null, - dtype = null, - use_external_data_format = null, - session_options = {}, - } = {}, - ) { - let options = { - progress_callback, - config, - cache_dir, - local_files_only, - revision, - model_file_name, - subfolder, - device, - dtype, - use_external_data_format, - session_options, - }; - - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this); - const modelType = MODEL_TYPE_MAPPING.get(modelName); - - config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options); - - let info; - if (modelType === MODEL_TYPES.DecoderOnly) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: options.model_file_name ?? 
'model', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'encoder_model', - decoder_model_merged: 'decoder_model_merged', - }, - options, - 'decoder_model_merged', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.MaskGeneration) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'vision_encoder', - prompt_encoder_mask_decoder: 'prompt_encoder_mask_decoder', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.EncoderDecoder) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'encoder_model', - decoder_model_merged: 'decoder_model_merged', - }, - options, - 'decoder_model_merged', - ), - ]); - } else if (modelType === MODEL_TYPES.ImageTextToText) { - const sessions = { - embed_tokens: 'embed_tokens', - vision_encoder: 'vision_encoder', - decoder_model_merged: 'decoder_model_merged', - }; - if (config.is_encoder_decoder) { - sessions['model'] = 'encoder_model'; - } - info = await Promise.all([ - constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.AudioTextToText) { - const sessions = { - embed_tokens: 'embed_tokens', - audio_encoder: 'audio_encoder', - decoder_model_merged: 'decoder_model_merged', - }; - info = await Promise.all([ - constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.ImageAudioTextToText) { - const sessions = { - embed_tokens: 'embed_tokens', - audio_encoder: 'audio_encoder', - vision_encoder: 'vision_encoder', - decoder_model_merged: 'decoder_model_merged', - }; - info = await Promise.all([ - constructSessions(pretrained_model_name_or_path, sessions, options), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Musicgen) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'text_encoder', - decoder_model_merged: 'decoder_model_merged', - encodec_decode: 'encodec_decode', - }, - options, - 'decoder_model_merged', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.MultiModality) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - prepare_inputs_embeds: 'prepare_inputs_embeds', - model: 'language_model', - lm_head: 'lm_head', - gen_head: 'gen_head', - gen_img_embeds: 'gen_img_embeds', - image_decode: 'image_decode', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === 
MODEL_TYPES.Phi3V) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - prepare_inputs_embeds: 'prepare_inputs_embeds', - model: 'model', - vision_encoder: 'vision_encoder', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Chatterbox) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - embed_tokens: 'embed_tokens', - speech_encoder: 'speech_encoder', - model: 'language_model', - conditional_decoder: 'conditional_decoder', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.AutoEncoder) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - encoder_model: 'encoder_model', - decoder_model: 'decoder_model', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Supertonic) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - text_encoder: 'text_encoder', - latent_denoiser: 'latent_denoiser', - voice_decoder: 'voice_decoder', - }, - options, - ), - ]); - } else { - // should be MODEL_TYPES.EncoderOnly - if (modelType !== MODEL_TYPES.EncoderOnly) { - const type = modelName ?? config?.model_type; - if (type !== 'custom') { - console.warn( - `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`, - ); - } - } - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: options.model_file_name ?? 'model', - }, - options, - ), - ]); - } - - // @ts-ignore - return new this(config, ...info); - } - - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Object containing input tensors - * @returns {Promise} Object containing output tensors - */ - async _call(model_inputs) { - return await this.forward(model_inputs); - } - - /** - * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method - * will be chosen based on the model type. - * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. - * @returns {Promise} The output data from the model in the format specified in the ONNX model. - * @throws {Error} This method must be implemented in subclasses. - */ - async forward(model_inputs) { - return await this._forward(this, model_inputs); - } - - /** - * Get the model's generation config, if it exists. - * @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`. - */ - get generation_config() { - return this.configs?.generation_config ?? null; - } - - /** - * @param {GenerationConfig} generation_config - * @param {number} input_ids_seq_length The starting sequence length for the input ids. 
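// A minimal usage sketch of the loading options parsed above. The model id is a placeholder,
// and the package import and AutoModel entry point are assumptions (they live outside this hunk);
// AutoModel routes to the appropriate PreTrainedModel subclass.
import { AutoModel } from '@huggingface/transformers';

const model = await AutoModel.from_pretrained('your-org/your-onnx-model', {
  device: 'webgpu', // or a per-file map, e.g. { embed_tokens: 'wasm', decoder_model_merged: 'webgpu' }
  dtype: 'fp16',    // or 'auto' to defer to the repo's transformers.js_config
  subfolder: 'onnx',
  session_options: {}, // forwarded to ONNX Runtime session creation
});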
- * @returns {LogitsProcessorList} - * @private - */ - _get_logits_processor( - generation_config, - input_ids_seq_length, - // encoder_input_ids, TODO - // prefix_allowed_tokens_fn, TODO - logits_processor = null, - ) { - const processors = new LogitsProcessorList(); - - // if (generation_config.diversity_penalty !== null && generation_config.diversity_penalty > 0.0) { - // processors.push(new HammingDiversityLogitsProcessor( - // generation_config.diversity_penalty, - // generation_config.num_beams, - // generation_config.num_beam_groups - // )); - // } - - // if (generation_config.encoder_repetition_penalty !== null && generation_config.encoder_repetition_penalty !== 1.0) { - // processors.push(new EncoderRepetitionPenaltyLogitsProcessor( - // generation_config.encoder_repetition_penalty, - // encoder_input_ids - // )); - // } - - if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1.0) { - processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty)); - } - - if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) { - processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)); - } - - // if (generation_config.encoder_no_repeat_ngram_size !== null && generation_config.encoder_no_repeat_ngram_size > 0) { - // if (this.config.is_encoder_decoder) { - // processors.push(new EncoderNoRepeatNGramLogitsProcessor( - // generation_config.encoder_no_repeat_ngram_size, - // encoder_input_ids - // )); - // } else { - // throw new Error("It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture"); - // } - // } - - if (generation_config.bad_words_ids !== null) { - processors.push( - new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id), - ); - } - - if ( - generation_config.min_length !== null && - generation_config.eos_token_id !== null && - generation_config.min_length > 0 - ) { - processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)); - } - - if ( - generation_config.min_new_tokens !== null && - generation_config.eos_token_id !== null && - generation_config.min_new_tokens > 0 - ) { - processors.push( - new MinNewTokensLengthLogitsProcessor( - input_ids_seq_length, - generation_config.min_new_tokens, - generation_config.eos_token_id, - ), - ); - } - - // if (prefix_allowed_tokens_fn !== null) { - // processors.push(new PrefixConstrainedLogitsProcessor( - // prefix_allowed_tokens_fn, - // generation_config.num_beams / generation_config.num_beam_groups - // )); - // } - - if (generation_config.forced_bos_token_id !== null) { - processors.push(new ForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id)); - } - - if (generation_config.forced_eos_token_id !== null) { - processors.push( - new ForcedEOSTokenLogitsProcessor(generation_config.max_length, generation_config.forced_eos_token_id), - ); - } - - // if (generation_config.remove_invalid_values === true) { - // processors.push(new InfNanRemoveLogitsProcessor()); - // } - - // if (generation_config.exponential_decay_length_penalty !== null) { - // processors.push(new ExponentialDecayLengthPenalty( - // generation_config.exponential_decay_length_penalty, - // generation_config.eos_token_id, - // input_ids_seq_length - // )); - // } - - // if (generation_config.suppress_tokens !== null) { - // processors.push(new 
SuppressTokensLogitsProcessor(generation_config.suppress_tokens)); - // } - - if (generation_config.begin_suppress_tokens !== null) { - const begin_index = - input_ids_seq_length > 1 || generation_config.forced_bos_token_id === null - ? input_ids_seq_length - : input_ids_seq_length + 1; - - processors.push( - new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index), - ); - } - - // DEPRECATED: https://github.com/huggingface/transformers/pull/29485 - // if (generation_config.forced_decoder_ids !== null) { - // processors.push(new ForceTokensLogitsProcessor(generation_config.forced_decoder_ids)); - // } - - // 8. prepare batched CFG externally - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - processors.push(new ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)); - } - - if (generation_config.temperature === 0 && generation_config.do_sample) { - console.warn( - '`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`.', - ); - generation_config.do_sample = false; - } - - if (generation_config.do_sample) { - if (generation_config.temperature !== null && generation_config.temperature !== 1.0) { - processors.push(new TemperatureLogitsWarper(generation_config.temperature)); - } - // TODO: Add TopPLogitsWarper and TopKLogitsWarper - // if (generation_config.top_k !== null && generation_config.top_k !== 0) { - // processors.push(new TopKLogitsWarper(generation_config.top_k)); - // } - // if (generation_config.top_p !== null && generation_config.top_p < 1.0) { - // processors.push(new TopPLogitsWarper(generation_config.top_p)); - // } - } - - if (logits_processor !== null) { - processors.extend(logits_processor); - } - - // `LogitNormalization` should always be the last logit processor, when present - // if (generation_config.renormalize_logits === true) { - // processors.push(new LogitNormalization()); - // } - - return processors; - } - - /** - * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. - * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. - * @param {GenerationConfig|null} generation_config A `GenerationConfig` object containing generation parameters. - * @param {Object} kwargs Additional generation parameters to be used in place of those in the `generation_config` object. - * @returns {GenerationConfig} The final generation config object to be used by the model for text generation. - */ - _prepare_generation_config(generation_config, kwargs, cls = GenerationConfig) { - // Create empty generation config (contains defaults) - // We pass `this.config` so that if `eos_token_id` or `bos_token_id` exist in the model's config, we will use them - const config = { ...this.config }; - for (const key of ['decoder', 'generator', 'text_config']) { - // Special case: some models have generation attributes set in the decoder. - // Use them if still unset in the generation config. 
- if (key in config) { - Object.assign(config, config[key]); - } - } - - const gen_config = new cls(config); - - // Apply model's generation config, if it exists - Object.assign(gen_config, this.generation_config ?? {}); - - // Next, use any generation config specified by the user - // when calling `generate` - if (generation_config) { - Object.assign(gen_config, generation_config); - } - - // Finally, if any kwargs were passed, use them to overwrite - if (kwargs) { - Object.assign(gen_config, pick(kwargs, Object.getOwnPropertyNames(gen_config))); - } - - return gen_config; - } - - /** - * - * @param {GenerationConfig} generation_config - * @param {StoppingCriteriaList} [stopping_criteria=null] - */ - _get_stopping_criteria(generation_config, stopping_criteria = null) { - const criteria = new StoppingCriteriaList(); - - if (generation_config.max_length !== null) { - criteria.push( - new MaxLengthCriteria(generation_config.max_length, this.config.max_position_embeddings ?? null), - ); - } - // if (generation_config.max_time !== null) { - // criteria.push(new MaxTimeCriteria(generation_config.max_time)); - // } - if (generation_config.eos_token_id !== null) { - criteria.push(new EosTokenCriteria(generation_config.eos_token_id)); - } - - if (stopping_criteria) { - criteria.extend(stopping_criteria); - } - return criteria; - } - - /** - * Confirms that the model class is compatible with generation. - * If not, raises an exception that points to the right class to use. - */ - _validate_model_class() { - if (!this.can_generate) { - const generate_compatible_mappings = [ - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - // MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, // TODO - MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, - ]; - - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); - - const generate_compatible_classes = new Set(); - const modelType = this.config.model_type; - for (const model_mapping of generate_compatible_mappings) { - const supported_models = model_mapping.get(modelType); - if (supported_models) { - generate_compatible_classes.add(supported_models[0]); - } - } - - let errorMessage = `The current model class (${modelName}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`; - if (generate_compatible_classes.size > 0) { - errorMessage += ` Please use the following class instead: ${[...generate_compatible_classes].join(', ')}`; - } - throw Error(errorMessage); - } - } - - prepare_inputs_for_generation(...args) { - return this._prepare_inputs_for_generation(this, ...args); - } - - /** - * - * @param {Object} inputs - * @param {bigint[][]} inputs.generated_input_ids - * @param {Object} inputs.outputs - * @param {Object} inputs.model_inputs - * @param {boolean} inputs.is_encoder_decoder - * @returns {Object} The updated model inputs for the next generation iteration. 
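// A minimal sketch of the precedence implemented above, using plain objects:
// model config < generation_config.json < user-provided generation_config < call kwargs
// (kwargs are additionally filtered to fields the generation config already defines).
const fromModelConfig = { eos_token_id: 2, max_length: 20 };
const fromGenerationConfigJson = { max_length: 128, do_sample: false };
const fromUserConfig = { do_sample: true };
const fromKwargs = { max_new_tokens: 64 };

const merged = Object.assign({}, fromModelConfig, fromGenerationConfigJson, fromUserConfig, fromKwargs);
console.log(merged);
// -> { eos_token_id: 2, max_length: 128, do_sample: true, max_new_tokens: 64 }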
- */ - _update_model_kwargs_for_generation({ generated_input_ids, outputs, model_inputs, is_encoder_decoder }) { - // update past_key_values - model_inputs['past_key_values'] = this.getPastKeyValues(outputs, model_inputs.past_key_values); - - // update inputs for next run - model_inputs['input_ids'] = new Tensor('int64', generated_input_ids.flat(), [generated_input_ids.length, 1]); - - if (!is_encoder_decoder) { - // update attention mask - model_inputs.attention_mask = cat( - [model_inputs.attention_mask, ones([model_inputs.attention_mask.dims[0], 1])], - 1, - ); - } else if ('decoder_attention_mask' in model_inputs) { - // TODO: update decoder attention mask if the model requires it - } - - // force recreate position_ids in next iteration - model_inputs['position_ids'] = null; - - return model_inputs; - } - - /** - * This function extracts the model-specific `inputs` for generation. - * @param {Object} params - * @param {Tensor} [params.inputs=null] - * @param {number} [params.bos_token_id=null] - * @param {Record} [params.model_kwargs] - * @returns {{inputs_tensor: Tensor, model_inputs: Record, model_input_name: string}} The model-specific inputs for generation. - */ - _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) { - const model_inputs = pick(model_kwargs, this.forward_params); - const input_name = this.main_input_name; - if (input_name in model_inputs) { - if (inputs) { - throw new Error( - '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' + - 'Make sure to either pass {inputs} or {input_name}=...', - ); - } - } else { - model_inputs[input_name] = inputs; - } - - const inputs_tensor = model_inputs[input_name]; - - return { inputs_tensor, model_inputs, model_input_name: input_name }; - } - - async _prepare_encoder_decoder_kwargs_for_generation({ - inputs_tensor, - model_inputs, - model_input_name, - generation_config, - }) { - if ( - this.sessions['model'].inputNames.includes('inputs_embeds') && - !model_inputs.inputs_embeds && - '_prepare_inputs_embeds' in this - ) { - // Encoder expects `inputs_embeds` instead of `input_ids` - const { input_ids, pixel_values, attention_mask, ...kwargs } = model_inputs; - // @ts-ignore - const prepared_inputs = await this._prepare_inputs_embeds(model_inputs); - model_inputs = { - ...kwargs, - ...pick(prepared_inputs, ['inputs_embeds', 'attention_mask']), - }; - } - let { last_hidden_state } = await encoderForward(this, model_inputs); - - // for classifier free guidance we need to add a 'null' input to our encoder hidden states - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - last_hidden_state = cat([last_hidden_state, full_like(last_hidden_state, 0.0)], 0); - - if ('attention_mask' in model_inputs) { - model_inputs['attention_mask'] = cat( - [model_inputs['attention_mask'], zeros_like(model_inputs['attention_mask'])], - 0, - ); - } - } else if (model_inputs.decoder_input_ids) { - // Ensure that the encoder outputs have the same batch size as the decoder inputs, - // allowing for more efficient batched generation for single inputs - const decoder_input_ids_batch_size = toI64Tensor(model_inputs.decoder_input_ids).dims[0]; - if (decoder_input_ids_batch_size !== last_hidden_state.dims[0]) { - if (last_hidden_state.dims[0] !== 1) { - throw new Error( - `The encoder outputs have a different batch size (${last_hidden_state.dims[0]}) than the decoder inputs (${decoder_input_ids_batch_size}).`, - ); - } - last_hidden_state = cat( - Array.from({ length: 
decoder_input_ids_batch_size }, () => last_hidden_state), - 0, - ); - } - } - model_inputs['encoder_outputs'] = last_hidden_state; - - return model_inputs; - } - - /** - * Prepares `decoder_input_ids` for generation with encoder-decoder models - * @param {*} param0 - */ - _prepare_decoder_input_ids_for_generation({ - batch_size, - model_input_name, - model_kwargs, - decoder_start_token_id, - bos_token_id, - generation_config, - }) { - let { decoder_input_ids, ...model_inputs } = model_kwargs; - - // Prepare input ids if the user has not defined `decoder_input_ids` manually. - if (!(decoder_input_ids instanceof Tensor)) { - if (!decoder_input_ids) { - decoder_start_token_id ??= bos_token_id; - - if (this.config.model_type === 'musicgen') { - // Custom logic (TODO: move to Musicgen class) - decoder_input_ids = Array.from( - { - // @ts-expect-error TS2339 - length: batch_size * this.config.decoder.num_codebooks, - }, - () => [decoder_start_token_id], - ); - } else if (Array.isArray(decoder_start_token_id)) { - if (decoder_start_token_id.length !== batch_size) { - throw new Error( - `\`decoder_start_token_id\` expcted to have length ${batch_size} but got ${decoder_start_token_id.length}`, - ); - } - decoder_input_ids = decoder_start_token_id; - } else { - decoder_input_ids = Array.from( - { - length: batch_size, - }, - () => [decoder_start_token_id], - ); - } - } else if (!Array.isArray(decoder_input_ids[0])) { - // Correct batch size - decoder_input_ids = Array.from( - { - length: batch_size, - }, - () => decoder_input_ids, - ); - } - decoder_input_ids = toI64Tensor(decoder_input_ids); - } - - model_kwargs['decoder_attention_mask'] = ones_like(decoder_input_ids); - - return { input_ids: decoder_input_ids, model_inputs }; - } - - /** - * Generates sequences of token ids for models with a language modeling head. - * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. - */ - async generate({ - inputs = null, - generation_config = null, - logits_processor = null, - stopping_criteria = null, - streamer = null, - - // inputs_attention_mask = null, - ...kwargs - }) { - this._validate_model_class(); - - // Update generation config with defaults and kwargs - generation_config = this._prepare_generation_config(generation_config, kwargs); - - // 3. Define model inputs - let { inputs_tensor, model_inputs, model_input_name } = this._prepare_model_inputs({ - inputs, - model_kwargs: kwargs, - }); - - const is_encoder_decoder = this.config.is_encoder_decoder; - - // 4. Define other model kwargs - if (!is_encoder_decoder) { - // decoder-only models should use left-padding for generation - } else if (!('encoder_outputs' in model_inputs)) { - // if model is encoder decoder encoder_outputs are created - // and added to `model_kwargs` - model_inputs = await this._prepare_encoder_decoder_kwargs_for_generation({ - inputs_tensor, - model_inputs, - model_input_name, - generation_config, - }); - } - - // 5. 
Prepare `input_ids` which will be used for auto-regressive generation - // TODO: Update to align with HF transformers' implementation - let input_ids; - if (is_encoder_decoder) { - // Generating from the encoder outputs - ({ input_ids, model_inputs } = this._prepare_decoder_input_ids_for_generation({ - batch_size: model_inputs[model_input_name].dims.at(0), - model_input_name, - model_kwargs: model_inputs, - decoder_start_token_id: generation_config.decoder_start_token_id, - bos_token_id: generation_config.bos_token_id, - generation_config, - })); - } else { - input_ids = model_inputs[model_input_name]; - } - - // 6. Prepare `max_length` depending on other stopping criteria. - let input_ids_length = input_ids.dims.at(-1); - - if (generation_config.max_new_tokens !== null) { - generation_config.max_length = input_ids_length + generation_config.max_new_tokens; - } - - // input_ids_length = model_inputs[model_input_name].dims.at(1); - // // inputs instanceof Tensor ? : inputs.length; - - // // decoder-only - // if (input_ids_length === 0) { - // throw Error("Must supply a non-empty array of input token ids.") - // } - - // let decoder_input_ids = - // generation_config.decoder_input_ids - // ?? generation_config.decoder_start_token_id - // ?? generation_config.bos_token_id - // ?? generation_config.eos_token_id; - - // Update logits processor - // 8. prepare distribution pre_processing samplers - const prepared_logits_processor = this._get_logits_processor( - generation_config, - input_ids_length, - logits_processor, - ); - - // 9. prepare stopping criteria - const prepared_stopping_criteria = this._get_stopping_criteria(generation_config, stopping_criteria); - - // /** @type {number[]} */ - // let eos_token_ids = generation_config.eos_token_id; - // if (eos_token_ids !== null && !Array.isArray(eos_token_ids)) { - // eos_token_ids = [eos_token_ids]; - // } - - const numInputs = model_inputs[model_input_name].dims.at(0); - - // TODO: - // done is a list of booleans to keep track of which inputs are done - // const done = new Array(numInputs).fill(false); - // For efficiency purposes, we remove completed rows from model_inputs - // when the beam is complete, and we keep track of the row index - // const rowIndexToBatchIndex = new Map(); - - const sampler = LogitsSampler.getSampler(generation_config); - - // TODO make > numInputs - const scores = new Array(numInputs).fill(0); - /** @type {bigint[][]} */ - const all_input_ids = input_ids.tolist(); - if (streamer) { - streamer.put(all_input_ids); - } - // const all_generated_input_ids = Array.from({ length: numInputs }, () => []); - - // NOTE: For now, we don't support spawning new beams - // TODO: when we do, we simply copy past key values and accumulate into single large tensor - - //////////////////////////////////////////////////// - // Generic search which handles 4 generation modes: - // - GenerationMode.GREEDY_SEARCH - // - GenerationMode.SAMPLE - // - GenerationMode.BEAM_SEARCH - // - GenerationMode.BEAM_SAMPLE - //////////////////////////////////////////////////// - let outputs; - let attentions = {}; - let return_dict_items = {}; - while (true) { - // prepare model inputs - model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config); - outputs = await this.forward(model_inputs); - - if (generation_config.return_dict_in_generate) { - if (generation_config.output_attentions) { - // Get attentions if they are present - const token_attentions = this.getAttentions(outputs); - for (const key in 
token_attentions) { - if (!(key in attentions)) { - attentions[key] = []; - } - attentions[key].push(token_attentions[key]); - } - } else if (this._return_dict_in_generate_keys) { - Object.assign(return_dict_items, pick(outputs, this._return_dict_in_generate_keys)); - } - } - - // Logits are of the form [batch_size, out_seq_length, vocab_size] - // In most cases, this will be [batch_size, 1, vocab_size] - // So, we select the last token's logits: - // (equivalent to `logits = outputs.logits[:, -1, :]`) - // The `.to('float32')` is necessary for models with float16 logits, - // and is a no-op for float32 logits. - // TODO: Support float16 sampling in the sampler directly - const logits = outputs.logits.slice(null, -1, null).to('float32'); - - const next_tokens_scores = prepared_logits_processor(all_input_ids, logits); - - /** @type {[bigint][]} */ - const generated_input_ids = []; - // const new_kv_cache = [];// NOTE: Only used for beam search when concatenating new kv - // Loop over each batch - for (let batch_idx = 0; batch_idx < next_tokens_scores.dims.at(0); ++batch_idx) { - const logs = next_tokens_scores[batch_idx]; - - const sampledTokens = await sampler(logs); - for (const [newTokenId, logProb] of sampledTokens) { - const bigint = BigInt(newTokenId); - // TODO: If branching, use previous beam as a starting point - // update generated ids, model inputs, and length for next step - scores[batch_idx] += logProb; - all_input_ids[batch_idx].push(bigint); - generated_input_ids.push([bigint]); - - // TODO: Support beam search - break; - } - } - if (streamer) { - streamer.put(generated_input_ids); - } - - const stop = prepared_stopping_criteria(all_input_ids); - if (stop.every((x) => x)) { - break; - } - - model_inputs = this._update_model_kwargs_for_generation({ - generated_input_ids, - outputs, - model_inputs, - is_encoder_decoder, - }); - } - - if (streamer) { - streamer.end(); - } - - // Retrieve and dispose all final past key values (including encoder attentions) - const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true); - - // TODO: ensure all_input_ids is padded correctly... - const sequences = new Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]); - - if (generation_config.return_dict_in_generate) { - return { - sequences, - past_key_values, - ...attentions, - ...return_dict_items, - // TODO: - // scores, - // logits, - }; - } else { - // Dispose all remaining tensors - for (const tensor of Object.values(outputs)) { - if (tensor.location === 'gpu-buffer') { - tensor.dispose(); - } - } - return sequences; - } - } - - /** - * Returns an object containing past key values from the given decoder results object. - * - * @param {Object} decoderResults The decoder results object. - * @param {Object} pastKeyValues The previous past key values. - * @returns {Object} An object containing past key values. - */ - getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) { - const pkvs = Object.create(null); - - for (const name in decoderResults) { - if (name.startsWith('present')) { - const newName = name - // Hybrid cache architecture - .replace('present_ssm', 'past_ssm') // Mamba - .replace('present_conv', 'past_conv') // LFM2 - - // Standard cache architecture - .replace('present', 'past_key_values'); - const is_encoder_pkv = name.includes('encoder'); - if (is_encoder_pkv && pastKeyValues) { - // Optimization introduced by optimum to reuse past key values. 
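The generic generation loop shown in this hunk is normally driven through the public `generate()` API rather than called step by step. A minimal sketch, assuming the library is consumed as `@huggingface/transformers` and using an illustrative Hub model id ('Xenova/flan-t5-small'); this snippet is not part of the diff itself:

import { AutoTokenizer, AutoModelForSeq2SeqLM, TextStreamer } from '@huggingface/transformers';

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/flan-t5-small');
const model = await AutoModelForSeq2SeqLM.from_pretrained('Xenova/flan-t5-small');

// Tokenize the prompt; for encoder-decoder models, `generate()` builds
// `encoder_outputs` and `decoder_input_ids` itself (steps 4-5 above).
const inputs = tokenizer('Translate English to German: How old are you?');

// The streamer receives each batch of newly sampled tokens via `streamer.put(...)`
// and is closed with `streamer.end()` once the stopping criteria are met.
const streamer = new TextStreamer(tokenizer);

const output_ids = await model.generate({ ...inputs, max_new_tokens: 40, streamer });
console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true }));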
- // So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values. - // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704 - pkvs[newName] = pastKeyValues[newName]; - } else { - // decoder or using first encoder PKVs - pkvs[newName] = decoderResults[name]; - } - - if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) { - // - Always dispose decoder PKVs - // - Only dispose encoder past key values when requested (after generation) - const t = pastKeyValues[newName]; - if (t.location === 'gpu-buffer') { - t.dispose(); - } - } - } - } - return pkvs; - } - - /** - * Returns an object containing attentions from the given model output object. - * - * @param {Object} model_output The output of the model. - * @returns {{cross_attentions?: Tensor[]}} An object containing attentions. - */ - getAttentions(model_output) { - const attentions = {}; - - for (const attnName of ['cross_attentions', 'encoder_attentions', 'decoder_attentions']) { - for (const name in model_output) { - if (name.startsWith(attnName)) { - if (!(attnName in attentions)) { - attentions[attnName] = []; - } - attentions[attnName].push(model_output[name]); - } - } - } - return attentions; - } - - /** - * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. - * - * @param {Object} decoderFeeds The decoder feeds object to add past key values to. - * @param {Object} pastKeyValues An object containing past key values. - */ - addPastKeyValues(decoderFeeds, pastKeyValues) { - if (pastKeyValues) { - Object.assign(decoderFeeds, pastKeyValues); - } else { - const session = this.sessions['decoder_model_merged'] ?? this.sessions['model']; - const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1; - - const dtype = session?.config?.kv_cache_dtype ?? 'float32'; - const cls = dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32; - const shapes = getCacheShapes(this.config, { batch_size }); - for (const name in shapes) { - const size = shapes[name].reduce((a, b) => a * b, 1); - decoderFeeds[name] = new Tensor(dtype, new cls(size), shapes[name]); - } - } - } - - async encode_image({ pixel_values }) { - // image_inputs === { pixel_values } - return (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features; - } - - async encode_text({ input_ids }) { - // text_inputs === { input_ids, attention_mask } - return (await sessionRun(this.sessions['embed_tokens'], { input_ids })).inputs_embeds; - } - - async encode_audio({ audio_values }) { - // audio_inputs === { audio_values } - return (await sessionRun(this.sessions['audio_encoder'], { audio_values })).audio_features; - } -} - -////////////////////////////////////////////////// -// Base model output class -export class ModelOutput {} - -/** - * Base class for model's outputs, with potential hidden states and attentions. - */ -export class BaseModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. - * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
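The KV-cache helpers in this hunk (`getPastKeyValues`, `addPastKeyValues`) are what make incremental decoding work: the first forward pass is fed empty past tensors, and later passes reuse the `present_*` outputs. A hedged usage sketch for a decoder-only model; the package name and model id are illustrative and not taken from this diff:

import { AutoTokenizer, AutoModelForCausalLM } from '@huggingface/transformers';

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');
const model = await AutoModelForCausalLM.from_pretrained('Xenova/gpt2');

const inputs = tokenizer('Once upon a time,');
const output_ids = await model.generate({
  ...inputs,
  max_new_tokens: 30, // turned into `max_length = input_length + max_new_tokens` above
  do_sample: true,    // selects the multinomial sampler instead of greedy search
  top_k: 50,
  temperature: 0.8,
});
console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true }));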
- */ - constructor({ last_hidden_state, hidden_states = null, attentions = null }) { - super(); - this.last_hidden_state = last_hidden_state; - this.hidden_states = hidden_states; - this.attentions = attentions; - } -} -////////////////////////////////////////////////// -// Bert models -export class BertPreTrainedModel extends PreTrainedModel {} -export class BertModel extends BertPreTrainedModel {} - -/** - * BertForMaskedLM is a class representing a BERT model for masked language modeling. - */ -export class BertForMaskedLM extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * BertForSequenceClassification is a class representing a BERT model for sequence classification. - */ -export class BertForSequenceClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * BertForTokenClassification is a class representing a BERT model for token classification. - */ -export class BertForTokenClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * BertForQuestionAnswering is a class representing a BERT model for question answering. - */ -export class BertForQuestionAnswering extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// NeoBert models -export class NeoBertPreTrainedModel extends PreTrainedModel {} -export class NeoBertModel extends NeoBertPreTrainedModel {} - -export class NeoBertForMaskedLM extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -export class NeoBertForSequenceClassification extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class NeoBertForTokenClassification extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
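The Bert*/NeoBert* task heads defined in this hunk are usually reached through the `pipeline` factory rather than constructed directly. An illustrative sketch; the model ids are examples from the Hub and are not referenced by this diff:

import { pipeline } from '@huggingface/transformers';

// Routed to BertForMaskedLM -> MaskedLMOutput
const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-uncased');
console.log(await unmasker('The goal of life is [MASK].'));

// Routed to a *ForQuestionAnswering head -> QuestionAnsweringModelOutput
const answerer = await pipeline('question-answering', 'Xenova/distilbert-base-cased-distilled-squad');
console.log(await answerer('Who wrote the play?', 'The play was written by Shakespeare.'));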
- * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -export class NeoBertForQuestionAnswering extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ModernBert models -export class ModernBertPreTrainedModel extends PreTrainedModel {} -export class ModernBertModel extends ModernBertPreTrainedModel {} - -export class ModernBertForMaskedLM extends ModernBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -export class ModernBertForSequenceClassification extends ModernBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class ModernBertForTokenClassification extends ModernBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ModernBERT Decoder models -export class ModernBertDecoderPreTrainedModel extends PreTrainedModel {} -export class ModernBertDecoderModel extends ModernBertDecoderPreTrainedModel {} -export class ModernBertDecoderForCausalLM extends ModernBertDecoderPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// NomicBert models -export class NomicBertPreTrainedModel extends PreTrainedModel {} -export class NomicBertModel extends NomicBertPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// RoFormer models -export class RoFormerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. - */ -export class RoFormerModel extends RoFormerPreTrainedModel {} - -/** - * RoFormer Model with a `language modeling` head on top. - */ -export class RoFormerForMaskedLM extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. 
- */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class RoFormerForSequenceClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) - * e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class RoFormerForTokenClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD - * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class RoFormerForQuestionAnswering extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -// TODO: Add RoFormerForCausalLM and RoFormerForMultipleChoice -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ConvBert models -export class ConvBertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top. - */ -export class ConvBertModel extends ConvBertPreTrainedModel {} - -/** - * ConvBERT Model with a language modeling head on top. - */ -export class ConvBertForMaskedLM extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class ConvBertForSequenceClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) - * e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class ConvBertForTokenClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD - * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`) - */ -export class ConvBertForQuestionAnswering extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Electra models -export class ElectraPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Electra Model transformer outputting raw hidden-states without any specific head on top. - * Identical to the BERT model except that it uses an additional linear layer between the embedding - * layer and the encoder if the hidden size and embedding size are different. - */ -export class ElectraModel extends ElectraPreTrainedModel {} -// TODO add ElectraForPreTraining -/** - * Electra model with a language modeling head on top. - */ -export class ElectraForMaskedLM extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class ElectraForSequenceClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * Electra model with a token classification head on top. - */ -export class ElectraForTokenClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * LECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD - * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class ElectraForQuestionAnswering extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. 
- */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CamemBERT models -export class CamembertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top. - */ -export class CamembertModel extends CamembertPreTrainedModel {} - -/** - * CamemBERT Model with a `language modeling` head on top. - */ -export class CamembertForMaskedLM extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. - */ -export class CamembertForSequenceClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class CamembertForTokenClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * CamemBERT Model with a span classification head on top for extractive question-answering tasks - */ -export class CamembertForQuestionAnswering extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// DeBERTa models -export class DebertaPreTrainedModel extends PreTrainedModel {} - -/** - * The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top. - */ -export class DebertaModel extends DebertaPreTrainedModel {} - -/** - * DeBERTa Model with a `language modeling` head on top. - */ -export class DebertaForMaskedLM extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. 
- */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class DebertaForSequenceClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class DebertaForTokenClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class DebertaForQuestionAnswering extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// DeBERTa-v2 models -export class DebertaV2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare DeBERTa-V2 Model transformer outputting raw hidden-states without any specific head on top. - */ -export class DebertaV2Model extends DebertaV2PreTrainedModel {} - -/** - * DeBERTa-V2 Model with a `language modeling` head on top. - */ -export class DebertaV2ForMaskedLM extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa-V2 Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class DebertaV2ForSequenceClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa-V2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class DebertaV2ForTokenClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa-V2 Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class DebertaV2ForQuestionAnswering extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// DistilBert models -export class DistilBertPreTrainedModel extends PreTrainedModel {} -export class DistilBertModel extends DistilBertPreTrainedModel {} - -/** - * DistilBertForSequenceClassification is a class representing a DistilBERT model for sequence classification. - */ -export class DistilBertForSequenceClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DistilBertForTokenClassification is a class representing a DistilBERT model for token classification. - */ -export class DistilBertForTokenClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DistilBertForQuestionAnswering is a class representing a DistilBERT model for question answering. - */ -export class DistilBertForQuestionAnswering extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} - -/** - * DistilBertForMaskedLM is a class representing a DistilBERT model for masking task. - */ -export class DistilBertForMaskedLM extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ESM models -export class EsmPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ESM Model transformer outputting raw hidden-states without any specific head on top. - */ -export class EsmModel extends EsmPreTrainedModel {} - -/** - * ESM Model with a `language modeling` head on top. 
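For the sequence- and token-classification heads in this stretch (DeBERTa, DistilBERT, and friends), the typical entry points are the text-classification and token-classification pipelines. A hedged sketch with example model ids:

import { pipeline } from '@huggingface/transformers';

// SequenceClassifierOutput: one label per input sequence
const classifier = await pipeline('text-classification', 'Xenova/distilbert-base-uncased-finetuned-sst-2-english');
console.log(await classifier('Running transformers in the browser is great!'));

// TokenClassifierOutput: one label per token (e.g. NER)
const ner = await pipeline('token-classification', 'Xenova/bert-base-NER');
console.log(await ner('My name is Sarah and I live in London.'));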
- */ -export class EsmForMaskedLM extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class EsmForSequenceClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * ESM Model with a token classification head on top (a linear layer on top of the hidden-states output) - * e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class EsmForTokenClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileBert models -export class MobileBertPreTrainedModel extends PreTrainedModel {} -export class MobileBertModel extends MobileBertPreTrainedModel {} - -/** - * MobileBertForMaskedLM is a class representing a MobileBERT model for masking task. - */ -export class MobileBertForMaskedLM extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class MobileBertForSequenceClassification extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * MobileBert Model with a span classification head on top for extractive question-answering tasks - */ -export class MobileBertForQuestionAnswering extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MPNet models -export class MPNetPreTrainedModel extends PreTrainedModel {} - -/** - * The bare MPNet Model transformer outputting raw hidden-states without any specific head on top. - */ -export class MPNetModel extends MPNetPreTrainedModel {} - -/** - * MPNetForMaskedLM is a class representing a MPNet model for masked language modeling. 
- */ -export class MPNetForMaskedLM extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * MPNetForSequenceClassification is a class representing a MPNet model for sequence classification. - */ -export class MPNetForSequenceClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * MPNetForTokenClassification is a class representing a MPNet model for token classification. - */ -export class MPNetForTokenClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * MPNetForQuestionAnswering is a class representing a MPNet model for question answering. - */ -export class MPNetForQuestionAnswering extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// SqueezeBert models -export class SqueezeBertPreTrainedModel extends PreTrainedModel {} -export class SqueezeBertModel extends SqueezeBertPreTrainedModel {} -export class SqueezeBertForMaskedLM extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} -export class SqueezeBertForSequenceClassification extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class SqueezeBertForQuestionAnswering extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Albert models -export class AlbertPreTrainedModel extends PreTrainedModel {} -export class AlbertModel extends AlbertPreTrainedModel {} -export class AlbertForSequenceClassification extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class AlbertForQuestionAnswering extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -export class AlbertForMaskedLM extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// T5 models -export class T5PreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'attention_mask', - 'encoder_outputs', - 'decoder_input_ids', - 'decoder_attention_mask', - 'past_key_values', - ]; -} - -export class T5Model extends T5PreTrainedModel {} - -/** - * T5Model is a class representing a T5 model for conditional generation. - */ -export class T5ForConditionalGeneration extends T5PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// LONGT5 models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class LongT5PreTrainedModel extends PreTrainedModel {} - -/** - * The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top. - */ -export class LongT5Model extends LongT5PreTrainedModel {} - -/** - * LONGT5 Model with a `language modeling` head on top. - */ -export class LongT5ForConditionalGeneration extends LongT5PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MT5 models -export class MT5PreTrainedModel extends PreTrainedModel {} - -export class MT5Model extends MT5PreTrainedModel {} - -/** - * A class representing a conditional sequence-to-sequence model based on the MT5 architecture. - */ -export class MT5ForConditionalGeneration extends MT5PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Bart models -export class BartPretrainedModel extends PreTrainedModel {} - -/** - * The bare BART Model outputting raw hidden-states without any specific head on top. - */ -export class BartModel extends BartPretrainedModel {} - -/** - * The BART Model with a language modeling head. Can be used for summarization. - */ -export class BartForConditionalGeneration extends BartPretrainedModel {} - -/** - * Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) - */ -export class BartForSequenceClassification extends BartPretrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. 
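The T5/Bart `*ForConditionalGeneration` classes above back the seq2seq pipelines (summarization, translation, text2text-generation). A minimal sketch with an illustrative model id; generation options are forwarded to `generate()` as shown earlier in this file:

import { pipeline } from '@huggingface/transformers';

const summarizer = await pipeline('summarization', 'Xenova/distilbart-cnn-6-6');
const article = 'Paste a long news article here...';
console.log(await summarizer(article, { max_new_tokens: 60 }));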
- */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MBart models -export class MBartPreTrainedModel extends PreTrainedModel {} - -/** - * The bare MBART Model outputting raw hidden-states without any specific head on top. - */ -export class MBartModel extends MBartPreTrainedModel {} - -/** - * The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models. - */ -export class MBartForConditionalGeneration extends MBartPreTrainedModel {} - -/** - * MBart model with a sequence classification/head on top (a linear layer on top of the pooled output). - */ -export class MBartForSequenceClassification extends MBartPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class MBartForCausalLM extends MBartPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Blenderbot models -export class BlenderbotPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Blenderbot Model outputting raw hidden-states without any specific head on top. - */ -export class BlenderbotModel extends BlenderbotPreTrainedModel {} - -/** - * The Blenderbot Model with a language modeling head. Can be used for summarization. - */ -export class BlenderbotForConditionalGeneration extends BlenderbotPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Blenderbot models -export class BlenderbotSmallPreTrainedModel extends PreTrainedModel {} - -/** - * The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top. - */ -export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel {} - -/** - * The BlenderbotSmall Model with a language modeling head. Can be used for summarization. - */ -export class BlenderbotSmallForConditionalGeneration extends BlenderbotSmallPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Roberta models -export class RobertaPreTrainedModel extends PreTrainedModel {} -export class RobertaModel extends RobertaPreTrainedModel {} - -/** - * RobertaForMaskedLM class for performing masked language modeling on Roberta models. - */ -export class RobertaForMaskedLM extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * RobertaForSequenceClassification class for performing sequence classification on Roberta models. - */ -export class RobertaForSequenceClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RobertaForTokenClassification class for performing token classification on Roberta models. - */ -export class RobertaForTokenClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RobertaForQuestionAnswering class for performing question answering on Roberta models. - */ -export class RobertaForQuestionAnswering extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// XLM models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class XLMPreTrainedModel extends PreTrainedModel {} - -/** - * The bare XLM Model transformer outputting raw hidden-states without any specific head on top. - */ -export class XLMModel extends XLMPreTrainedModel {} - -/** - * The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class XLMWithLMHeadModel extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class XLMForSequenceClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) - */ -export class XLMForTokenClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLM Model with a span classification head on top for extractive question-answering tasks - */ -export class XLMForQuestionAnswering extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// XLMRoberta models -export class XLMRobertaPreTrainedModel extends PreTrainedModel {} -export class XLMRobertaModel extends XLMRobertaPreTrainedModel {} - -/** - * XLMRobertaForMaskedLM class for performing masked language modeling on XLMRoberta models. - */ -export class XLMRobertaForMaskedLM extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * XLMRobertaForSequenceClassification class for performing sequence classification on XLMRoberta models. - */ -export class XLMRobertaForSequenceClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLMRobertaForTokenClassification class for performing token classification on XLMRoberta models. - */ -export class XLMRobertaForTokenClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLMRobertaForQuestionAnswering class for performing question answering on XLMRoberta models. - */ -export class XLMRobertaForQuestionAnswering extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Audio Spectrogram Transformer (AST) models -export class ASTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare AST Model transformer outputting raw hidden-states without any specific head on top. - */ -export class ASTModel extends ASTPreTrainedModel {} - -/** - * Audio Spectrogram Transformer model with an audio classification head on top - * (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2. - */ -export class ASTForAudioClassification extends ASTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Whisper models -export class WhisperPreTrainedModel extends PreTrainedModel { - requires_attention_mask = false; - main_input_name = 'input_features'; - forward_params = [ - 'input_features', - 'attention_mask', - 'decoder_input_ids', - 'decoder_attention_mask', - 'past_key_values', - ]; -} - -/** - * WhisperModel class for training Whisper models without a language model head. 
- */ -export class WhisperModel extends WhisperPreTrainedModel {} - -/** - * WhisperForConditionalGeneration class for generating conditional outputs from Whisper models. - */ -export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { - _prepare_generation_config(generation_config, kwargs) { - return /** @type {WhisperGenerationConfig} */ ( - super._prepare_generation_config(generation_config, kwargs, WhisperGenerationConfig) - ); - } - - /** - * - * @param {WhisperGenerationConfig} generation_config - */ - _retrieve_init_tokens(generation_config) { - // prefix tokens are of the form: - // - Multilingual: <|startoftranscript|> <|lang_id|> <|task|> [<|notimestamps|>] - // - English-only: <|startoftranscript|> [<|notimestamps|>] - - // 1. Handle <|startoftranscript|> token - const init_tokens = [generation_config.decoder_start_token_id]; - - // 2. Handle <|lang_id|> and <|task> tokens - let language = generation_config.language; - const task = generation_config.task; - if (generation_config.is_multilingual) { - if (!language) { - // TODO: Implement language detection - console.warn('No language specified - defaulting to English (en).'); - language = 'en'; - } - - // Add language token - const language_code = whisper_language_to_code(language); - const language_token = `<|${language_code}|>`; - init_tokens.push(generation_config.lang_to_id[language_token]); - - // Add task token - // NOTE: Defaults to 'transcribe' if no task is specified - init_tokens.push(generation_config.task_to_id[task ?? 'transcribe']); - } else if (language || task) { - throw new Error( - 'Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.', - ); - } - - // 3. Handle <|notimestamps|> token - if ( - !generation_config.return_timestamps && - generation_config.no_timestamps_token_id && - init_tokens.at(-1) !== generation_config.no_timestamps_token_id - ) { - init_tokens.push(generation_config.no_timestamps_token_id); - } else if ( - generation_config.return_timestamps && - init_tokens.at(-1) === generation_config.no_timestamps_token_id - ) { - console.warn( - '<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`.', - ); - init_tokens.pop(); - } - - // let's make sure we don't pass `null` tokens as prompt tokens - return init_tokens.filter((token) => token != null); - } - - /** - * Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids. - * @param {import('./models/whisper/generation_whisper.js').WhisperGenerationFunctionParameters} options - * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. - */ - async generate({ - inputs = null, - generation_config = null, - logits_processor = null, - stopping_criteria = null, - - // Whisper-specific options (passed to kwargs) - // prompt_ids = null, - // language = null, - // task = null, - - ...kwargs - }) { - generation_config = this._prepare_generation_config(generation_config, kwargs); - - const init_tokens = kwargs.decoder_input_ids ?? 
this._retrieve_init_tokens(generation_config); - - if (generation_config.return_timestamps) { - logits_processor ??= new LogitsProcessorList(); - logits_processor.push(new WhisperTimeStampLogitsProcessor(generation_config, init_tokens)); - } - - if (generation_config.begin_suppress_tokens) { - logits_processor ??= new LogitsProcessorList(); - logits_processor.push( - new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, init_tokens.length), - ); - } - - if (generation_config.return_token_timestamps) { - if (!generation_config.alignment_heads) { - throw new Error( - 'Model generation config has no `alignment_heads`, token-level timestamps not available. ' + - 'See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.', - ); - } - - if (generation_config.task === 'translate') { - console.warn("Token-level timestamps may not be reliable for task 'translate'."); - } - - generation_config.output_attentions = true; - generation_config.return_dict_in_generate = true; - } - - const outputs = await super.generate({ - inputs, - generation_config, - logits_processor, - decoder_input_ids: init_tokens, - ...kwargs, - }); - - if (generation_config.return_token_timestamps) { - outputs['token_timestamps'] = this._extract_token_timestamps( - // @ts-expect-error TS2345 - outputs, - generation_config.alignment_heads, - generation_config.num_frames, - ); - } - - return outputs; - } - - /** - * Calculates token-level timestamps using the encoder-decoder cross-attentions and - * dynamic time-warping (DTW) to map each output token to a position in the input audio. - * If `num_frames` is specified, the encoder-decoder cross-attentions will be cropped before applying DTW. - * @param {Object} generate_outputs Outputs generated by the model - * @param {Tensor[][]} generate_outputs.cross_attentions The cross attentions output by the model - * @param {Tensor} generate_outputs.sequences The sequences output by the model - * @param {number[][]} alignment_heads Alignment heads of the model - * @param {number} [num_frames=null] Number of frames in the input audio. - * @param {number} [time_precision=0.02] Precision of the timestamps in seconds - * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token - */ - _extract_token_timestamps(generate_outputs, alignment_heads, num_frames = null, time_precision = 0.02) { - if (!generate_outputs.cross_attentions) { - throw new Error( - 'Model outputs must contain cross attentions to extract timestamps. ' + - 'This is most likely because the model was not exported with `output_attentions=True`.', - ); - } - if (num_frames == null) { - console.warn( - '`num_frames` has not been set, meaning the entire audio will be analyzed. ' + - 'This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).', - ); - } - - // @ts-expect-error TS2339 - let median_filter_width = this.config.median_filter_width; - if (median_filter_width === undefined) { - console.warn('Model config has no `median_filter_width`, using default value of 7.'); - median_filter_width = 7; - } - - // TODO: Improve batch processing - const batch = generate_outputs.cross_attentions; - // Create a list with `decoder_layers` elements, each a tensor of shape - // (batch size, attention_heads, output length, input length). 
- const cross_attentions = Array.from( - // @ts-expect-error TS2339 - { length: this.config.decoder_layers }, - // Concatenate the cross attentions for each layer across sequence length dimension. - (_, i) => - cat( - batch.map((x) => x[i]), - 2, - ), - ); - - const weights = stack( - alignment_heads.map(([l, h]) => { - if (l >= cross_attentions.length) { - throw new Error( - `Layer index ${l} is out of bounds for cross attentions (length ${cross_attentions.length}).`, - ); - } - return num_frames - ? cross_attentions[l].slice(null, h, null, [0, num_frames]) - : cross_attentions[l].slice(null, h); - }), - ).transpose(1, 0, 2, 3); - - const [std, calculatedMean] = std_mean(weights, -2, 0, true); - - // Normalize and smoothen the weights. - const smoothedWeights = weights.clone(); // [1, 8, seqLength, 1500] - - for (let a = 0; a < smoothedWeights.dims[0]; ++a) { - const aTensor = smoothedWeights[a]; // [8, seqLength, 1500] - - for (let b = 0; b < aTensor.dims[0]; ++b) { - const bTensor = aTensor[b]; // [seqLength, 1500] - - const stdTensorData = std[a][b][0].data; // [1500] - const meanTensorData = calculatedMean[a][b][0].data; // [1500] - - for (let c = 0; c < bTensor.dims[0]; ++c) { - let cTensorData = bTensor[c].data; // [1500] - for (let d = 0; d < cTensorData.length; ++d) { - cTensorData[d] = (cTensorData[d] - meanTensorData[d]) / stdTensorData[d]; - } - - // Apply median filter. - cTensorData.set(medianFilter(cTensorData, median_filter_width)); - } - } - } - - // Average the different cross-attention heads. - const batchedMatrices = [mean(smoothedWeights, 1)]; - - const timestampsShape = generate_outputs.sequences.dims; - - const timestamps = new Tensor( - 'float32', - new Float32Array(timestampsShape[0] * timestampsShape[1]), - timestampsShape, - ); - - // Perform dynamic time warping on each element of the batch. - for (let batch_idx = 0; batch_idx < timestampsShape[0]; ++batch_idx) { - // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions - // as the python implementation - const matrix = batchedMatrices[batch_idx].neg().squeeze_(0); - const [text_indices, time_indices] = dynamic_time_warping(matrix.tolist()); - - const diffs = Array.from( - { length: text_indices.length - 1 }, - (v, i) => text_indices[i + 1] - text_indices[i], - ); - const jumps = mergeArrays([1], diffs).map((x) => !!x); // convert to boolean - - const jump_times = []; - for (let i = 0; i < jumps.length; ++i) { - if (jumps[i]) { - // NOTE: No point in rounding here, since we set to Float32Array later - jump_times.push(time_indices[i] * time_precision); - } - } - timestamps[batch_idx].data.set(jump_times, 1); - } - - return timestamps; - } -} -////////////////////////////////////////////////// - -export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration {} - -////////////////////////////////////////////////// -// Moonshine models -export class MoonshinePreTrainedModel extends PreTrainedModel { - requires_attention_mask = false; - main_input_name = 'input_values'; - forward_params = ['input_values', 'decoder_input_ids', 'past_key_values']; -} - -/** - * MoonshineModel class for training Moonshine models without a language model head. 
- */
-export class MoonshineModel extends MoonshinePreTrainedModel {}
-
-export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-/**
- * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
- */
-export class VisionEncoderDecoderModel extends PreTrainedModel {
-    main_input_name = 'pixel_values';
-    forward_params = [
-        // Encoder inputs
-        'pixel_values',
-
-        // Decoder inputs
-        'decoder_input_ids',
-        'encoder_hidden_states',
-        'past_key_values',
-    ];
-}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// LLaVa Models
-export class LlavaPreTrainedModel extends PreTrainedModel {
-    forward_params = ['input_ids', 'attention_mask', 'pixel_values', 'position_ids', 'past_key_values'];
-}
-
-/**
- * The LLAVA model which consists of a vision backbone and a language model.
- */
-export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
-    _merge_input_ids_with_image_features(kwargs) {
-        const vision_hidden_size = kwargs.image_features.dims.at(-1);
-        const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
-
-        return default_merge_input_ids_with_image_features({
-            // @ts-ignore
-            image_token_id: this.config.image_token_index,
-            ...kwargs,
-            image_features: reshaped_image_hidden_states,
-        });
-    }
-}
-//////////////////////////////////////////////////
-
-export class LlavaOnevisionForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration
-export class Moondream1ForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration
-
-export class Florence2PreTrainedModel extends PreTrainedModel {
-    forward_params = [
-        // Encoder inputs
-        'input_ids',
-        'inputs_embeds',
-        'attention_mask',
-        'pixel_values',
-
-        // Decoder inputs
-        'encoder_outputs',
-        'decoder_input_ids',
-        'decoder_inputs_embeds',
-        'decoder_attention_mask',
-        'past_key_values',
-    ];
-    main_input_name = 'inputs_embeds';
-}
-
-export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel {
-    _merge_input_ids_with_image_features({ inputs_embeds, image_features, input_ids, attention_mask }) {
-        return {
-            inputs_embeds: cat(
-                [
-                    image_features, // image embeds
-                    inputs_embeds, // task prefix embeds
-                ],
-                1,
-            ),
-            attention_mask: cat(
-                [
-                    ones(image_features.dims.slice(0, 2)), // image attention mask
-                    attention_mask, // task prefix attention mask
-                ],
-                1,
-            ),
-        };
-    }
-
-    async _prepare_inputs_embeds({ input_ids, pixel_values, inputs_embeds, attention_mask }) {
-        if (!input_ids && !pixel_values) {
-            throw new Error('Either `input_ids` or `pixel_values` should be provided.');
-        }
-
-        // 1. Possibly, extract the input embeddings
-        let text_features, image_features;
-        if (input_ids) {
-            text_features = await this.encode_text({ input_ids });
-        }
-        if (pixel_values) {
-            image_features = await this.encode_image({ pixel_values });
-        }
-
-        // 2.
Possibly, merge text and images - if (text_features && image_features) { - ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ - inputs_embeds: text_features, - image_features, - input_ids, - attention_mask, - })); - } else { - inputs_embeds = text_features || image_features; - } - - return { inputs_embeds, attention_mask }; - } - - async forward({ - input_ids, - pixel_values, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - encoder_outputs, - past_key_values, - - inputs_embeds, - decoder_inputs_embeds, - }) { - if (!inputs_embeds) { - ({ inputs_embeds, attention_mask } = await this._prepare_inputs_embeds({ - input_ids, - pixel_values, - inputs_embeds, - attention_mask, - })); - } - - if (!encoder_outputs) { - // Must compute encoder outputs - let { last_hidden_state } = await encoderForward(this, { inputs_embeds, attention_mask }); - encoder_outputs = last_hidden_state; - } - - if (!decoder_inputs_embeds) { - if (!decoder_input_ids) { - throw new Error('Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.'); - } - decoder_inputs_embeds = await this.encode_text({ input_ids: decoder_input_ids }); - } - - const decoderFeeds = { - inputs_embeds: decoder_inputs_embeds, - attention_mask: decoder_attention_mask, - encoder_attention_mask: attention_mask, - encoder_hidden_states: encoder_outputs, - past_key_values, - }; - const decoder_outputs = await decoderForward(this, decoderFeeds, true); - return decoder_outputs; - } -} - -export class PaliGemmaPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - // 'inputs_embeds', - 'attention_mask', - 'pixel_values', - 'position_ids', - 'past_key_values', - ]; -} - -export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel { - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_index, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } -} - -export class LlavaQwen2ForCausalLM extends LlavaPreTrainedModel { - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_index, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } -} - -export class Gemma3nPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'attention_mask', - 'inputs_embeds', - 'per_layer_inputs', - - 'position_ids', - 'pixel_values', - 'input_features', - 'input_features_mask', - 'past_key_values', - ]; -} -export class Gemma3nForConditionalGeneration extends Gemma3nPreTrainedModel { - async forward({ - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - pixel_values = null, - input_features = null, - input_features_mask = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - per_layer_inputs = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // TODO: needed? - ...kwargs - }) { - if (!inputs_embeds || !per_layer_inputs) { - // 1. Extract the text embeddings. 
- ({ inputs_embeds, per_layer_inputs } = await sessionRun(this.sessions['embed_tokens'], { - input_ids, - })); - if (input_ids.dims[1] !== 1) { - if (pixel_values) { - // Encode the image - const { image_features } = await sessionRun(this.sessions['vision_encoder'], { - pixel_values, - }); - ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ - image_features, - inputs_embeds, - input_ids, - attention_mask, - })); - } - - if (input_features) { - // Encode the audio - const { audio_features } = await sessionRun(this.sessions['audio_encoder'], { - input_features, - input_features_mask, - }); - ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_audio_features({ - audio_features, - inputs_embeds, - input_ids, - attention_mask, - })); - } - } - } - - const outputs = await decoderForward( - this, - { - inputs_embeds, - per_layer_inputs, - past_key_values, - attention_mask, - position_ids, - generation_config, - logits_processor, - }, - true, - ); - return outputs; - } - - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_id, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } - _merge_input_ids_with_audio_features(kwargs) { - const audio_hidden_size = kwargs.audio_features.dims.at(-1); - const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); - - return default_merge_input_ids_with_audio_features({ - // @ts-ignore - audio_token_id: this.config.audio_token_id, - ...kwargs, - audio_features: reshaped_audio_features, - }); - } -} - -////////////////////////////////////////////////// -// Idefics3 Models -export class Idefics3PreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'attention_mask', - 'pixel_values', - 'pixel_attention_mask', - 'position_ids', - 'past_key_values', - ]; -} - -/** - * The Idefics3 model which consists of a vision backbone and a language model. - */ -export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel { - async encode_image({ pixel_values, pixel_attention_mask }) { - const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask })) - .image_features; - return features; - } - - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_id, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } -} -////////////////////////////////////////////////// - -/** - * The SmolVLM Model with a language modeling head. - * It is made up a SigLIP vision encoder, with a language modeling head on top. 
- */ -export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {} - -////////////////////////////////////////////////// -export class Phi3VPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'inputs_embeds', - 'attention_mask', - 'position_ids', - 'pixel_values', - 'image_sizes', - 'past_key_values', - ]; -} -export class Phi3VForCausalLM extends Phi3VPreTrainedModel { - async forward({ - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - pixel_values = null, - image_sizes = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // TODO: needed? - ...kwargs - }) { - if (!inputs_embeds) { - let image_features; - if (pixel_values && input_ids.dims[1] !== 1) { - if (!image_sizes) { - throw new Error('`image_sizes` must be provided when `pixel_values` is provided.'); - } - - // Encode the image - ({ image_features } = await sessionRun(this.sessions['vision_encoder'], { - pixel_values, - image_sizes, - })); - } else { - const hidden_size = this.config.normalized_config.hidden_size; - image_features = new Tensor('float32', [], [0, hidden_size]); - } - - ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], { - input_ids, - image_features, - })); - } - - const outputs = await decoderForward( - this, - { - inputs_embeds, - past_key_values, - attention_mask, - position_ids, - generation_config, - logits_processor, - }, - false, - ); - return outputs; - } -} - -////////////////////////////////////////////////// -export class CLIPPreTrainedModel extends PreTrainedModel {} - -/** - * CLIP Text and Vision Model with a projection layers on top - * - * **Example:** Perform zero-shot image classification with a `CLIPModel`. - * - * ```javascript - * import { AutoTokenizer, AutoProcessor, CLIPModel, RawImage } from '@huggingface/transformers'; - * - * // Load tokenizer, processor, and model - * let tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); - * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); - * let model = await CLIPModel.from_pretrained('Xenova/clip-vit-base-patch16'); - * - * // Run tokenization - * let texts = ['a photo of a car', 'a photo of a football match'] - * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Read image and run processor - * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * let image_inputs = await processor(image); - * - * // Run model with both text and pixel inputs - * let output = await model({ ...text_inputs, ...image_inputs }); - * // { - * // logits_per_image: Tensor { - * // dims: [ 1, 2 ], - * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], - * // }, - * // logits_per_text: Tensor { - * // dims: [ 2, 1 ], - * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], - * // }, - * // text_embeds: Tensor { - * // dims: [ 2, 512 ], - * // data: Float32Array(1024) [ ... ], - * // }, - * // image_embeds: Tensor { - * // dims: [ 1, 512 ], - * // data: Float32Array(512) [ ... ], - * // } - * // } - * ``` - */ -export class CLIPModel extends CLIPPreTrainedModel {} - -/** - * The text model from CLIP without any head or projection on top. 
- */ -export class CLIPTextModel extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output) - * - * **Example:** Compute text embeddings with `CLIPTextModelWithProjection`. - * - * ```javascript - * import { AutoTokenizer, CLIPTextModelWithProjection } from '@huggingface/transformers'; - * - * // Load tokenizer and text model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); - * const text_model = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); - * - * // Run tokenization - * let texts = ['a photo of a car', 'a photo of a football match']; - * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Compute embeddings - * const { text_embeds } = await text_model(text_inputs); - * // Tensor { - * // dims: [ 2, 512 ], - * // type: 'float32', - * // data: Float32Array(1024) [ ... ], - * // size: 1024 - * // } - * ``` - */ -export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * The vision model from CLIP without any head or projection on top. - */ -export class CLIPVisionModel extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} - -/** - * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output) - * - * **Example:** Compute vision embeddings with `CLIPVisionModelWithProjection`. - * - * ```javascript - * import { AutoProcessor, CLIPVisionModelWithProjection, RawImage} from '@huggingface/transformers'; - * - * // Load processor and vision model - * const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); - * const vision_model = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); - * - * // Read image and run processor - * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * let image_inputs = await processor(image); - * - * // Compute embeddings - * const { image_embeds } = await vision_model(image_inputs); - * // Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [ ... 
], - * // size: 512 - * // } - * ``` - */ -export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// SigLIP models -export class SiglipPreTrainedModel extends PreTrainedModel {} - -/** - * SigLIP Text and Vision Model with a projection layers on top - * - * **Example:** Perform zero-shot image classification with a `SiglipModel`. - * - * ```javascript - * import { AutoTokenizer, AutoProcessor, SiglipModel, RawImage } from '@huggingface/transformers'; - * - * // Load tokenizer, processor, and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); - * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); - * const model = await SiglipModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * - * // Run tokenization - * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; - * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); - * - * // Read image and run processor - * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg'); - * const image_inputs = await processor(image); - * - * // Run model with both text and pixel inputs - * const output = await model({ ...text_inputs, ...image_inputs }); - * // { - * // logits_per_image: Tensor { - * // dims: [ 1, 2 ], - * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], - * // }, - * // logits_per_text: Tensor { - * // dims: [ 2, 1 ], - * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], - * // }, - * // text_embeds: Tensor { - * // dims: [ 2, 768 ], - * // data: Float32Array(1536) [ ... ], - * // }, - * // image_embeds: Tensor { - * // dims: [ 1, 768 ], - * // data: Float32Array(768) [ ... ], - * // } - * // } - * ``` - */ -export class SiglipModel extends SiglipPreTrainedModel {} - -/** - * The text model from SigLIP without any head or projection on top. - * - * **Example:** Compute text embeddings with `SiglipTextModel`. - * - * ```javascript - * import { AutoTokenizer, SiglipTextModel } from '@huggingface/transformers'; - * - * // Load tokenizer and text model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); - * const text_model = await SiglipTextModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * - * // Run tokenization - * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; - * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); - * - * // Compute embeddings - * const { pooler_output } = await text_model(text_inputs); - * // Tensor { - * // dims: [ 2, 768 ], - * // type: 'float32', - * // data: Float32Array(1536) [ ... 
], - * // size: 1536 - * // } - * ``` - */ -export class SiglipTextModel extends SiglipPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * The vision model from SigLIP without any head or projection on top. - * - * **Example:** Compute vision embeddings with `SiglipVisionModel`. - * - * ```javascript - * import { AutoProcessor, SiglipVisionModel, RawImage} from '@huggingface/transformers'; - * - * // Load processor and vision model - * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); - * const vision_model = await SiglipVisionModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * - * // Read image and run processor - * const image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * const image_inputs = await processor(image); - * - * // Compute embeddings - * const { pooler_output } = await vision_model(image_inputs); - * // Tensor { - * // dims: [ 1, 768 ], - * // type: 'float32', - * // data: Float32Array(768) [ ... ], - * // size: 768 - * // } - * ``` - */ -export class SiglipVisionModel extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} -////////////////////////////////////////////////// -// ChineseCLIP models -export class ChineseCLIPPreTrainedModel extends PreTrainedModel {} - -export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// JinaCLIP models -export class JinaCLIPPreTrainedModel extends PreTrainedModel {} - -export class JinaCLIPModel extends JinaCLIPPreTrainedModel { - async forward(model_inputs) { - const missing_text_inputs = !model_inputs.input_ids; - const missing_image_inputs = !model_inputs.pixel_values; - - if (missing_text_inputs && missing_image_inputs) { - throw new Error('Either `input_ids` or `pixel_values` should be provided.'); - } - - // If either `input_ids` or `pixel_values` aren't passed, we need to create dummy input since the model requires a value to be specified. - if (missing_text_inputs) { - // NOTE: We cannot pass zero-dimension tensor as input for input_ids. - // Fortunately, the majority of time is spent in the vision encoder, so this shouldn't significantly impact performance. - model_inputs.input_ids = ones([model_inputs.pixel_values.dims[0], 1]); - } - - if (missing_image_inputs) { - // NOTE: Since we create a zero-sized tensor, this does not increase computation time. 
- // @ts-ignore - const { image_size } = this.config.vision_config; - model_inputs.pixel_values = full([0, 3, image_size, image_size], 0.0); // (pass zero-dimension tensor) - } - - const { text_embeddings, image_embeddings, l2norm_text_embeddings, l2norm_image_embeddings } = - await super.forward(model_inputs); - - const result = {}; - if (!missing_text_inputs) { - result.text_embeddings = text_embeddings; - result.l2norm_text_embeddings = l2norm_text_embeddings; - } - if (!missing_image_inputs) { - result.image_embeddings = image_embeddings; - result.l2norm_image_embeddings = l2norm_image_embeddings; - } - return result; - } -} - -export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CLIPSeg models -export class CLIPSegPreTrainedModel extends PreTrainedModel {} - -export class CLIPSegModel extends CLIPSegPreTrainedModel {} - -/** - * CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation. - * - * **Example:** Perform zero-shot image segmentation with a `CLIPSegForImageSegmentation` model. - * - * ```javascript - * import { AutoTokenizer, AutoProcessor, CLIPSegForImageSegmentation, RawImage } from '@huggingface/transformers'; - * - * // Load tokenizer, processor, and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clipseg-rd64-refined'); - * const processor = await AutoProcessor.from_pretrained('Xenova/clipseg-rd64-refined'); - * const model = await CLIPSegForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined'); - * - * // Run tokenization - * const texts = ['a glass', 'something to fill', 'wood', 'a jar']; - * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Read image and run processor - * const image = await RawImage.read('https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true'); - * const image_inputs = await processor(image); - * - * // Run model with both text and pixel inputs - * const { logits } = await model({ ...text_inputs, ...image_inputs }); - * // logits: Tensor { - * // dims: [4, 352, 352], - * // type: 'float32', - * // data: Float32Array(495616) [ ... 
], - * // size: 495616 - * // } - * ``` - * - * You can visualize the predictions as follows: - * ```javascript - * const preds = logits - * .unsqueeze_(1) - * .sigmoid_() - * .mul_(255) - * .round_() - * .to('uint8'); - * - * for (let i = 0; i < preds.dims[0]; ++i) { - * const img = RawImage.fromTensor(preds[i]); - * img.save(`prediction_${i}.png`); - * } - * ``` - */ -export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPT2 models -export class GPT2PreTrainedModel extends PreTrainedModel {} - -export class GPT2Model extends GPT2PreTrainedModel {} - -/** - * GPT-2 language model head on top of the GPT-2 base model. This model is suitable for text generation tasks. - */ -export class GPT2LMHeadModel extends GPT2PreTrainedModel {} -// export class GPT2ForSequenceClassification extends GPT2PreTrainedModel { -// TODO -// } -////////////////////////////////////////////////// - - -////////////////////////////////////////////////// -// GPT OSS models -export class GptOssPreTrainedModel extends PreTrainedModel {} -export class GptOssModel extends GptOssPreTrainedModel {} -export class GptOssForCausalLM extends GptOssPreTrainedModel {} -////////////////////////////////////////////////// - - -////////////////////////////////////////////////// -// JAIS models -export class JAISPreTrainedModel extends PreTrainedModel {} - -/** - * The bare JAIS Model transformer outputting raw hidden-states without any specific head on top. - */ -export class JAISModel extends JAISPreTrainedModel {} - -/** - * The JAIS Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class JAISLMHeadModel extends JAISPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPTNeo models -export class GPTNeoPreTrainedModel extends PreTrainedModel {} -export class GPTNeoModel extends GPTNeoPreTrainedModel {} - -export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPTNeoX models -export class GPTNeoXPreTrainedModel extends PreTrainedModel {} -export class GPTNeoXModel extends GPTNeoXPreTrainedModel {} - -export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPT-J models -export class GPTJPreTrainedModel extends PreTrainedModel {} - -export class GPTJModel extends GPTJPreTrainedModel {} - -export class GPTJForCausalLM extends GPTJPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPTBigCode models -export class GPTBigCodePreTrainedModel extends PreTrainedModel {} - -export class GPTBigCodeModel extends GPTBigCodePreTrainedModel {} - -export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CodeGen models -export class CodeGenPreTrainedModel extends PreTrainedModel {} -/** - * CodeGenModel is a class representing a code generation model without a language model head. 
- */
-export class CodeGenModel extends CodeGenPreTrainedModel {}
-
-/**
- * CodeGenForCausalLM is a class that represents a code generation model based on the GPT-2 architecture. It extends the `CodeGenPreTrainedModel` class.
- */
-export class CodeGenForCausalLM extends CodeGenPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// LLama models
-
-/**
- * The bare LLama Model outputting raw hidden-states without any specific head on top.
- */
-export class LlamaPreTrainedModel extends PreTrainedModel {}
-/**
- * The bare LLaMA Model outputting raw hidden-states without any specific head on top.
- */
-export class LlamaModel extends LlamaPreTrainedModel {}
-
-export class LlamaForCausalLM extends LlamaPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-export class Llama4PreTrainedModel extends PreTrainedModel {}
-export class Llama4ForCausalLM extends Llama4PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// NanoChat models
-export class NanoChatPreTrainedModel extends PreTrainedModel {}
-export class NanoChatModel extends NanoChatPreTrainedModel {}
-export class NanoChatForCausalLM extends NanoChatPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Apertus models
-export class ApertusPreTrainedModel extends PreTrainedModel {}
-export class ApertusModel extends ApertusPreTrainedModel {}
-export class ApertusForCausalLM extends ApertusPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Arcee models
-export class ArceePreTrainedModel extends PreTrainedModel {}
-export class ArceeModel extends ArceePreTrainedModel {}
-export class ArceeForCausalLM extends ArceePreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// LFM2 models
-export class Lfm2PreTrainedModel extends PreTrainedModel {}
-export class Lfm2Model extends Lfm2PreTrainedModel {}
-export class Lfm2ForCausalLM extends Lfm2PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// SmolLM3 models
-export class SmolLM3PreTrainedModel extends PreTrainedModel {}
-export class SmolLM3Model extends SmolLM3PreTrainedModel {}
-export class SmolLM3ForCausalLM extends SmolLM3PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Helium models
-export class HeliumPreTrainedModel extends PreTrainedModel {}
-export class HeliumModel extends HeliumPreTrainedModel {}
-export class HeliumForCausalLM extends HeliumPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Glm models
-export class GlmPreTrainedModel extends PreTrainedModel {}
-export class GlmModel extends GlmPreTrainedModel {}
-export class GlmForCausalLM extends GlmPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// EXAONE models
-export class ExaonePreTrainedModel extends PreTrainedModel {}
-export class ExaoneModel extends ExaonePreTrainedModel {}
-export class ExaoneForCausalLM extends ExaonePreTrainedModel {}
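The causal-LM wrappers in this hunk (CodeGen, Llama, Llama4, NanoChat, Apertus, Arcee, LFM2, SmolLM3, Helium, Glm, EXAONE) expose no extra public methods of their own; in practice they are reached through the `pipeline` helper or `AutoModelForCausalLM` rather than instantiated directly. A minimal usage sketch, assuming an ONNX-exported checkpoint is available on the Hub; the model id below is only illustrative:

```javascript
import { pipeline } from '@huggingface/transformers';

// Load any supported decoder-only architecture as a text-generation pipeline.
// Swap in any ONNX-exported checkpoint whose architecture maps to one of the classes above.
const generator = await pipeline('text-generation', 'Xenova/llama2.c-stories15M');

// Generate a continuation for a plain-text prompt.
const output = await generator('Once upon a time,', { max_new_tokens: 40 });
console.log(output[0].generated_text);
```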
-////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileLLM models -export class MobileLLMPreTrainedModel extends PreTrainedModel {} -export class MobileLLMModel extends MobileLLMPreTrainedModel {} -export class MobileLLMForCausalLM extends MobileLLMPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Olmo models -export class OlmoPreTrainedModel extends PreTrainedModel {} -export class OlmoModel extends OlmoPreTrainedModel {} -export class OlmoForCausalLM extends OlmoPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Olmo2 models -export class Olmo2PreTrainedModel extends PreTrainedModel {} -export class Olmo2Model extends Olmo2PreTrainedModel {} -export class Olmo2ForCausalLM extends Olmo2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Olmo3 models -export class Olmo3PreTrainedModel extends PreTrainedModel {} -export class Olmo3Model extends Olmo3PreTrainedModel {} -export class Olmo3ForCausalLM extends Olmo3PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Granite models -export class GranitePreTrainedModel extends PreTrainedModel {} -export class GraniteModel extends GranitePreTrainedModel {} -export class GraniteForCausalLM extends GranitePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GraniteMoeHybrid models -export class GraniteMoeHybridPreTrainedModel extends PreTrainedModel {} -export class GraniteMoeHybridModel extends GraniteMoeHybridPreTrainedModel {} -export class GraniteMoeHybridForCausalLM extends GraniteMoeHybridPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Cohere models - -/** - * The bare Cohere Model outputting raw hidden-states without any specific head on top. - */ -export class CoherePreTrainedModel extends PreTrainedModel {} -export class CohereModel extends CoherePreTrainedModel {} - -export class CohereForCausalLM extends CoherePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Gemma models - -/** - * The bare Gemma Model outputting raw hidden-states without any specific head on top. - */ -export class GemmaPreTrainedModel extends PreTrainedModel {} -/** - * The bare Gemma Model outputting raw hidden-states without any specific head on top. - */ -export class GemmaModel extends GemmaPreTrainedModel {} - -export class GemmaForCausalLM extends GemmaPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Gemma2 models - -/** - * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. - */ -export class Gemma2PreTrainedModel extends PreTrainedModel {} -/** - * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. 
- */
-export class Gemma2Model extends Gemma2PreTrainedModel {}
-
-export class Gemma2ForCausalLM extends Gemma2PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// VaultGemma models
-export class VaultGemmaPreTrainedModel extends PreTrainedModel {}
-export class VaultGemmaModel extends VaultGemmaPreTrainedModel {}
-export class VaultGemmaForCausalLM extends VaultGemmaPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Gemma3 models
-
-/**
- * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Gemma3PreTrainedModel extends PreTrainedModel {}
-/**
- * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Gemma3Model extends Gemma3PreTrainedModel {}
-
-export class Gemma3ForCausalLM extends Gemma3PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-export class OpenELMPreTrainedModel extends PreTrainedModel {}
-export class OpenELMModel extends OpenELMPreTrainedModel {}
-
-export class OpenELMForCausalLM extends OpenELMPreTrainedModel {}
-
-//////////////////////////////////////////////////
-// Qwen2 models
-
-/**
- * The bare Qwen2 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen2PreTrainedModel extends PreTrainedModel {}
-/**
- * The bare Qwen2 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen2Model extends Qwen2PreTrainedModel {}
-
-export class Qwen2ForCausalLM extends Qwen2PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Qwen3 models
-
-/**
- * The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen3PreTrainedModel extends PreTrainedModel {}
-/**
- * The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen3Model extends Qwen3PreTrainedModel {}
-
-export class Qwen3ForCausalLM extends Qwen3PreTrainedModel {}
-//////////////////////////////////////////////////
-
-export class Qwen2VLPreTrainedModel extends PreTrainedModel {
-    forward_params = [
-        // Text inputs
-        'input_ids',
-        'attention_mask',
-        'position_ids',
-        'past_key_values',
-
-        // Vision inputs
-        'pixel_values',
-        'image_grid_thw',
-    ];
-}
-export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
-    /**
-     * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
-     *
-     * Explanation:
-     * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
-     *
-     * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
-     * Examples:
-     * input_ids: [T T T T T], here T is for text.
-     * temporal position_ids: [0, 1, 2, 3, 4]
-     * height position_ids: [0, 1, 2, 3, 4]
-     * width position_ids: [0, 1, 2, 3, 4]
-     *
-     * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
-     * and 1D rotary position embedding for text part.
-     * Examples:
-     * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
-     * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] - * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] - * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] - * text temporal position_ids: [3, 4, 5, 6, 7] - * text height position_ids: [3, 4, 5, 6, 7] - * text width position_ids: [3, 4, 5, 6, 7] - * Here we calculate the text start position_ids as the max vision position_ids plus 1. - * - * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`. - * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`. - * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`. - * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`: - * - 1 for tokens that are **not masked**, - * - 0 for tokens that are **masked**. - * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with: - * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`. - * - mrope_position_deltas: Tensor of shape `(batch_size)`. - */ - get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) { - // @ts-ignore - const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config; - const spatial_merge_size = vision_config.spatial_merge_size ?? 2; - - const mrope_position_deltas = []; - if (image_grid_thw || video_grid_thw) { - let total_input_ids = input_ids.tolist(); - if (!attention_mask) { - attention_mask = ones_like(input_ids); - } - - const attention_mask_list = attention_mask.tolist(); - const position_ids_list = Array.from({ length: 3 }, (_) => - Array.from({ length: input_ids.dims[0] }, (_) => Array.from({ length: input_ids.dims[1] }, (_) => 1)), - ); - - const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : []; - const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : []; - - let image_index = 0; - let video_index = 0; - for (let i = 0; i < total_input_ids.length; ++i) { - const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1); - - const vision_start_indices = ids.reduce((acc, x, idx) => { - if (x == vision_start_token_id) acc.push(idx); - return acc; - }, []); - - const vision_tokens = vision_start_indices.map((x) => ids[x + 1]); - const image_nums = vision_tokens.filter((x) => x == image_token_id).length; - const video_nums = vision_tokens.filter((x) => x == video_token_id).length; - - /** @type {number[][]} */ - let llm_pos_ids_list = []; - let st = 0; - let remain_images = image_nums; - let remain_videos = video_nums; - for (let j = 0; j < vision_tokens.length; ++j) { - const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id); - const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id); - - const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1; - - const ed_video = remain_videos > 0 && next_video_token !== -1 ? 
next_video_token : ids.length + 1; - - let ed; - let t, h, w; - if (ed_image < ed_video) { - [t, h, w] = image_grid_thw_list[image_index]; - ++image_index; - --remain_images; - ed = ed_image; - } else { - [t, h, w] = video_grid_thw_list[video_index]; - ++video_index; - --remain_videos; - ed = ed_video; - } - - const [llm_grid_t, llm_grid_h, llm_grid_w] = [ - Number(t), - Math.floor(Number(h) / spatial_merge_size), - Math.floor(Number(w) / spatial_merge_size), - ]; - const text_len = ed - st; - const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0; - - llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); - - const offset = text_len + st_idx; - const grid_size = llm_grid_t * llm_grid_h * llm_grid_w; - const t_index = Array.from( - { length: grid_size }, - (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)), - ); - const h_index = Array.from( - { length: grid_size }, - (_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h), - ); - const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w)); - - llm_pos_ids_list.push([t_index, h_index, w_index].flat()); - - st = ed + grid_size; - } - - if (st < ids.length) { - const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0; - const text_len = ids.length - st; - - llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); - } - - // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len), - // meaning to perform concatenation along dim=1, we can do the following: - const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0); - /** @type {number[]} */ - const llm_positions = new Array(num_items); - let index = 0; - for (let x = 0; x < 3; ++x) { - for (let y = 0; y < llm_pos_ids_list.length; ++y) { - const val = llm_pos_ids_list[y]; - const text_len = val.length / 3; - for (let z = x * text_len; z < (x + 1) * text_len; ++z) { - llm_positions[index++] = val[z]; - } - } - } - - let count = 0; - const attn_mask = attention_mask_list[i]; - for (let y = 0; y < attn_mask.length; ++y) { - if (attn_mask[y] == 1) { - for (let x = 0; x < 3; ++x) { - position_ids_list[x][i][y] = llm_positions[(x * num_items) / 3 + count]; - } - ++count; - } - } - - const max_llm_positions = max(llm_positions)[0]; - mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length); - } - - return [ - new Tensor('int64', position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]), - new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), - ]; - } else { - // Text-only - if (attention_mask) { - const { data, dims } = cumsum_masked_fill(attention_mask); - - const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]); - /** @type {bigint[]} */ - const mrope_position_deltas = Array.from( - { length: dims[0] }, - (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]), - ); - - return [ - new Tensor('int64', position_ids, [3, ...dims]), - new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), - ]; - } else { - const [batch_size, seq_length] = input_ids.dims; - const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) => - BigInt(Math.floor((i % seq_length) / batch_size)), - ); - - return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])]; - } - } - } - - async encode_image({ 
pixel_values, image_grid_thw }) { - const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, grid_thw: image_grid_thw })) - .image_features; - return features; - } - - _merge_input_ids_with_image_features(kwargs) { - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_id, - ...kwargs, - }); - } - - prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { - // Overwritten -- in specific circumstances we don't want to forward image inputs to the model - if (model_inputs.attention_mask && !model_inputs.position_ids) { - // Calculate position_ids and rope_deltas - if (!model_inputs.past_key_values) { - [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index( - model_inputs.input_ids, - model_inputs.image_grid_thw, - model_inputs.video_grid_thw, - model_inputs.attention_mask, - ); - } else { - model_inputs.pixel_values = null; - // model_inputs.pixel_values_videos = null; - - const delta = BigInt(Object.values(model_inputs.past_key_values)[0].dims.at(-2)); - const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x); - model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0); - } - } - - return model_inputs; - } -} - -////////////////////////////////////////////////// -// Phi models -export class PhiPreTrainedModel extends PreTrainedModel {} -/** - * The bare Phi Model outputting raw hidden-states without any specific head on top. - */ -export class PhiModel extends PhiPreTrainedModel {} - -export class PhiForCausalLM extends PhiPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Phi3 models -export class Phi3PreTrainedModel extends PreTrainedModel {} - -/** - * The bare Phi3 Model outputting raw hidden-states without any specific head on top. - */ -export class Phi3Model extends Phi3PreTrainedModel {} - -export class Phi3ForCausalLM extends Phi3PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Bloom models -/** - * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class BloomPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Bloom Model transformer outputting raw hidden-states without any specific head on top. - */ -export class BloomModel extends BloomPreTrainedModel {} - -/** - * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class BloomForCausalLM extends BloomPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MPT models -export class MptPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Mpt Model transformer outputting raw hidden-states without any specific head on top. - */ -export class MptModel extends MptPreTrainedModel {} - -/** - * The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class MptForCausalLM extends MptPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// OPT models -export class OPTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare OPT Model outputting raw hidden-states without any specific head on top. 
- */ -export class OPTModel extends OPTPreTrainedModel {} - -/** - * The OPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class OPTForCausalLM extends OPTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ViTPreTrainedModel extends PreTrainedModel {} -export class ViTModel extends ViTPreTrainedModel {} -export class ViTForImageClassification extends ViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class IJepaPreTrainedModel extends PreTrainedModel {} -export class IJepaModel extends IJepaPreTrainedModel {} -export class IJepaForImageClassification extends IJepaPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class VitPosePreTrainedModel extends PreTrainedModel {} - -/** - * The VitPose model with a pose estimation head on top. - */ -export class VitPoseForPoseEstimation extends VitPosePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class PvtPreTrainedModel extends PreTrainedModel {} -export class PvtModel extends PvtPreTrainedModel {} -export class PvtForImageClassification extends PvtPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ViTMAEPreTrainedModel extends PreTrainedModel {} -export class ViTMAEModel extends ViTMAEPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ViTMSNPreTrainedModel extends PreTrainedModel {} -export class ViTMSNModel extends ViTMSNPreTrainedModel {} -export class ViTMSNForImageClassification extends ViTMSNPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class GroupViTPreTrainedModel extends PreTrainedModel {} -export class GroupViTModel extends GroupViTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class FastViTPreTrainedModel extends PreTrainedModel {} -export class FastViTModel extends FastViTPreTrainedModel {} -export class FastViTForImageClassification extends FastViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class VitMattePreTrainedModel extends PreTrainedModel {} - -/** - * ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes. 
- * - * **Example:** Perform image matting with a `VitMatteForImageMatting` model. - * ```javascript - * import { AutoProcessor, VitMatteForImageMatting, RawImage } from '@huggingface/transformers'; - * - * // Load processor and model - * const processor = await AutoProcessor.from_pretrained('Xenova/vitmatte-small-distinctions-646'); - * const model = await VitMatteForImageMatting.from_pretrained('Xenova/vitmatte-small-distinctions-646'); - * - * // Load image and trimap - * const image = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_image.png'); - * const trimap = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_trimap.png'); - * - * // Prepare image + trimap for the model - * const inputs = await processor(image, trimap); - * - * // Predict alpha matte - * const { alphas } = await model(inputs); - * // Tensor { - * // dims: [ 1, 1, 640, 960 ], - * // type: 'float32', - * // size: 614400, - * // data: Float32Array(614400) [ 0.9894027709960938, 0.9970508813858032, ... ] - * // } - * ``` - * - * You can visualize the alpha matte as follows: - * ```javascript - * import { Tensor, cat } from '@huggingface/transformers'; - * - * // Visualize predicted alpha matte - * const imageTensor = image.toTensor(); - * - * // Convert float (0-1) alpha matte to uint8 (0-255) - * const alphaChannel = alphas - * .squeeze(0) - * .mul_(255) - * .clamp_(0, 255) - * .round_() - * .to('uint8'); - * - * // Concatenate original image with predicted alpha - * const imageData = cat([imageTensor, alphaChannel], 0); - * - * // Save output image - * const outputImage = RawImage.fromTensor(imageData); - * outputImage.save('output.png'); - * ``` - */ -export class VitMatteForImageMatting extends VitMattePreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new ImageMattingOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class MobileViTPreTrainedModel extends PreTrainedModel {} -export class MobileViTModel extends MobileViTPreTrainedModel {} -export class MobileViTForImageClassification extends MobileViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -// TODO: MobileViTForSemanticSegmentation - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class MobileViTV2PreTrainedModel extends PreTrainedModel {} -export class MobileViTV2Model extends MobileViTV2PreTrainedModel {} -export class MobileViTV2ForImageClassification extends MobileViTV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -// TODO: MobileViTV2ForSemanticSegmentation - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class OwlViTPreTrainedModel extends PreTrainedModel {} -export class OwlViTModel extends OwlViTPreTrainedModel {} -export class OwlViTForObjectDetection extends OwlViTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Owlv2PreTrainedModel extends PreTrainedModel {} -export class Owlv2Model extends Owlv2PreTrainedModel {} -export 
class Owlv2ForObjectDetection extends Owlv2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Beit Models -export class BeitPreTrainedModel extends PreTrainedModel {} -export class BeitModel extends BeitPreTrainedModel {} -export class BeitForImageClassification extends BeitPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DetrPreTrainedModel extends PreTrainedModel {} -export class DetrModel extends DetrPreTrainedModel {} -export class DetrForObjectDetection extends DetrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new DetrObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class DetrForSegmentation extends DetrPreTrainedModel { - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new DetrSegmentationOutput(await super._call(model_inputs)); - } -} - -export class DetrObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } -} - -export class DetrSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. - * @param {Tensor} output.pred_boxes Predicted boxes. - * @param {Tensor} output.pred_masks Predicted masks. - */ - constructor({ logits, pred_boxes, pred_masks }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - this.pred_masks = pred_masks; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class RTDetrPreTrainedModel extends PreTrainedModel {} -export class RTDetrModel extends RTDetrPreTrainedModel {} -export class RTDetrForObjectDetection extends RTDetrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class RTDetrObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). 
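The raw `logits` and `pred_boxes` documented here still need thresholding, label mapping, and rescaling to pixel coordinates; the object-detection pipeline wraps that post-processing. A minimal sketch, assuming the `Xenova/detr-resnet-50` checkpoint:

```javascript
import { pipeline } from '@huggingface/transformers';

const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50');

const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
const output = await detector(url, { threshold: 0.9 });
// e.g. [{ label: 'cat', score: 0.99, box: { xmin, ymin, xmax, ymax } }, ...] (illustrative values)
```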
- */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class RTDetrV2PreTrainedModel extends PreTrainedModel {} -export class RTDetrV2Model extends RTDetrV2PreTrainedModel {} -export class RTDetrV2ForObjectDetection extends RTDetrV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RTDetrV2ObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class RTDetrV2ObjectDetectionOutput extends RTDetrObjectDetectionOutput {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class RFDetrPreTrainedModel extends PreTrainedModel {} -export class RFDetrModel extends RFDetrPreTrainedModel {} -export class RFDetrForObjectDetection extends RFDetrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RFDetrObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class RFDetrObjectDetectionOutput extends RTDetrObjectDetectionOutput {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DFinePreTrainedModel extends PreTrainedModel {} -export class DFineModel extends DFinePreTrainedModel {} -export class DFineForObjectDetection extends DFinePreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class TableTransformerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) - * outputting raw hidden-states without any specific head on top. - */ -export class TableTransformerModel extends TableTransformerPreTrainedModel {} - -/** - * Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) - * with object detection heads on top, for tasks such as COCO detection. 
- */ -export class TableTransformerForObjectDetection extends TableTransformerPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new TableTransformerObjectDetectionOutput(await super._call(model_inputs)); - } -} -export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DeiTPreTrainedModel extends PreTrainedModel {} -export class DeiTModel extends DeiTPreTrainedModel {} -export class DeiTForImageClassification extends DeiTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class HieraPreTrainedModel extends PreTrainedModel {} -export class HieraModel extends HieraPreTrainedModel {} -export class HieraForImageClassification extends HieraPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class ResNetPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ResNet model outputting raw features without any specific head on top. - */ -export class ResNetModel extends ResNetPreTrainedModel {} - -/** - * ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. - */ -export class ResNetForImageClassification extends ResNetPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class SwinPreTrainedModel extends PreTrainedModel {} -export class SwinModel extends SwinPreTrainedModel {} -export class SwinForImageClassification extends SwinPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class SwinForSemanticSegmentation extends SwinPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Swin2SRPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top. - */ -export class Swin2SRModel extends Swin2SRPreTrainedModel {} - -/** - * Swin2SR Model transformer with an upsampler head on top for image super resolution and restoration. - * - * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64`. 
- * - * ```javascript - * import { AutoProcessor, Swin2SRForImageSuperResolution, RawImage } from '@huggingface/transformers'; - * - * // Load processor and model - * const model_id = 'Xenova/swin2SR-classical-sr-x2-64'; - * const processor = await AutoProcessor.from_pretrained(model_id); - * const model = await Swin2SRForImageSuperResolution.from_pretrained(model_id); - * - * // Prepare model inputs - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/butterfly.jpg'; - * const image = await RawImage.fromURL(url); - * const inputs = await processor(image); - * - * // Run model - * const outputs = await model(inputs); - * - * // Convert Tensor to RawImage - * const output = outputs.reconstruction.squeeze().clamp_(0, 1).mul_(255).round_().to('uint8'); - * const outputImage = RawImage.fromTensor(output); - * // RawImage { - * // data: Uint8Array(786432) [ 41, 31, 24, ... ], - * // width: 512, - * // height: 512, - * // channels: 3 - * // } - * ``` - */ -export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DPTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare DPT Model transformer outputting raw hidden-states without any specific head on top. - */ -export class DPTModel extends DPTPreTrainedModel {} - -/** - * DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. - * - * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`. - * ```javascript - * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; - * - * // Load model and processor - * const model_id = 'Xenova/dpt-hybrid-midas'; - * const model = await DPTForDepthEstimation.from_pretrained(model_id); - * const processor = await AutoProcessor.from_pretrained(model_id); - * - * // Load image from URL - * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; - * const image = await RawImage.read(url); - * - * // Prepare image for the model - * const inputs = await processor(image); - * - * // Run model - * const { predicted_depth } = await model(inputs); - * - * // Interpolate to original size - * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { - * size: image.size.reverse(), - * mode: 'bilinear', - * })).squeeze(1); - * - * // Visualize the prediction - * const min = prediction.min().item(); - * const max = prediction.max().item(); - * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); - * const depth = RawImage.fromTensor(formatted); - * // RawImage { - * // data: Uint8Array(307200) [ 85, 85, 84, ... ], - * // width: 640, - * // height: 480, - * // channels: 1 - * // } - * ``` - */ -export class DPTForDepthEstimation extends DPTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DepthAnythingPreTrainedModel extends PreTrainedModel {} - -/** - * Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. 
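`DepthAnythingForDepthEstimation` follows the same calling convention as `DPTForDepthEstimation` above; a minimal sketch using the depth-estimation pipeline, which performs the interpolation and visualization steps from the DPT example internally. The `Xenova/depth-anything-small-hf` checkpoint id is an assumption.

```javascript
import { pipeline } from '@huggingface/transformers';

const depth_estimator = await pipeline('depth-estimation', 'Xenova/depth-anything-small-hf');

const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
const { predicted_depth, depth } = await depth_estimator(url);
// `predicted_depth` is the raw Tensor; `depth` is a RawImage visualization of it
```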
- */ -export class DepthAnythingForDepthEstimation extends DepthAnythingPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class SapiensPreTrainedModel extends PreTrainedModel {} -export class SapiensForSemanticSegmentation extends SapiensPreTrainedModel {} -export class SapiensForDepthEstimation extends SapiensPreTrainedModel {} -export class SapiensForNormalEstimation extends SapiensPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DepthProPreTrainedModel extends PreTrainedModel {} -export class DepthProForDepthEstimation extends DepthProPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Metric3DPreTrainedModel extends PreTrainedModel {} -export class Metric3DForDepthEstimation extends Metric3DPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Metric3Dv2PreTrainedModel extends PreTrainedModel {} -export class Metric3Dv2ForDepthEstimation extends Metric3Dv2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class MaskFormerPreTrainedModel extends PreTrainedModel {} -export class MaskFormerModel extends MaskFormerPreTrainedModel {} -export class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class GLPNPreTrainedModel extends PreTrainedModel {} - -/** - * The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. - */ -export class GLPNModel extends GLPNPreTrainedModel {} - -/** - * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2. - * - * **Example:** Depth estimation w/ `Xenova/glpn-kitti`. - * ```javascript - * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; - * - * // Load model and processor - * const model_id = 'Xenova/glpn-kitti'; - * const model = await GLPNForDepthEstimation.from_pretrained(model_id); - * const processor = await AutoProcessor.from_pretrained(model_id); - * - * // Load image from URL - * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; - * const image = await RawImage.read(url); - * - * // Prepare image for the model - * const inputs = await processor(image); - * - * // Run model - * const { predicted_depth } = await model(inputs); - * - * // Interpolate to original size - * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { - * size: image.size.reverse(), - * mode: 'bilinear', - * })).squeeze(1); - * - * // Visualize the prediction - * const min = prediction.min().item(); - * const max = prediction.max().item(); - * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); - * const depth = RawImage.fromTensor(formatted); - * // RawImage { - * // data: Uint8Array(307200) [ 85, 85, 84, ... ], - * // width: 640, - * // height: 480, - * // channels: 1 - * // } - * ``` - */ -export class GLPNForDepthEstimation extends GLPNPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DonutSwinPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top. - * - * **Example:** Step-by-step Document Parsing.
- * - * ```javascript - * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; - * - * // Choose model to use - * const model_id = 'Xenova/donut-base-finetuned-cord-v2'; - * - * // Prepare image inputs - * const processor = await AutoProcessor.from_pretrained(model_id); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/receipt.png'; - * const image = await RawImage.read(url); - * const image_inputs = await processor(image); - * - * // Prepare decoder inputs - * const tokenizer = await AutoTokenizer.from_pretrained(model_id); - * const task_prompt = '<s_cord-v2>'; - * const decoder_input_ids = tokenizer(task_prompt, { - * add_special_tokens: false, - * }).input_ids; - * - * // Create the model - * const model = await AutoModelForVision2Seq.from_pretrained(model_id); - * - * // Run inference - * const output = await model.generate(image_inputs.pixel_values, { - * decoder_input_ids, - * max_length: model.config.decoder.max_position_embeddings, - * }); - * - * // Decode output - * const decoded = tokenizer.batch_decode(output)[0]; - * // CINNAMON SUGAR 17,000 1 x 17,000 17,000 17,000 20,000 3,000 - * ``` - * - * **Example:** Step-by-step Document Visual Question Answering (DocVQA) - * - * ```javascript - * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; - * - * // Choose model to use - * const model_id = 'Xenova/donut-base-finetuned-docvqa'; - * - * // Prepare image inputs - * const processor = await AutoProcessor.from_pretrained(model_id); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/invoice.png'; - * const image = await RawImage.read(url); - * const image_inputs = await processor(image); - * - * // Prepare decoder inputs - * const tokenizer = await AutoTokenizer.from_pretrained(model_id); - * const question = 'What is the invoice number?'; - * const task_prompt = `<s_docvqa><s_question>${question}</s_question><s_answer>`; - * const decoder_input_ids = tokenizer(task_prompt, { - * add_special_tokens: false, - * }).input_ids; - * - * // Create the model - * const model = await AutoModelForVision2Seq.from_pretrained(model_id); - * - * // Run inference - * const output = await model.generate(image_inputs.pixel_values, { - * decoder_input_ids, - * max_length: model.config.decoder.max_position_embeddings, - * }); - * - * // Decode output - * const decoded = tokenizer.batch_decode(output)[0]; - * // What is the invoice number? us-001 - * ``` - */ -export class DonutSwinModel extends DonutSwinPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ConvNextPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ConvNext model outputting raw features without any specific head on top. - */ -export class ConvNextModel extends ConvNextPreTrainedModel {} - -/** - * ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet.
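All of the `*ForImageClassification` classes in this file share the same low-level calling convention: they return a `SequenceClassifierOutput` whose `logits` can be mapped to labels via the model config. A minimal sketch of that path (the checkpoint id is the same assumption as in the pipeline sketch above):

```javascript
import { AutoProcessor, AutoModelForImageClassification, RawImage } from '@huggingface/transformers';

const model_id = 'Xenova/vit-base-patch16-224'; // assumption: any image-classification checkpoint works the same way
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModelForImageClassification.from_pretrained(model_id);

const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
const inputs = await processor(image);

// SequenceClassifierOutput: logits of shape [batch_size, num_labels]
const { logits } = await model(inputs);

// Pick the highest-scoring class for the single image in the batch
const scores = logits.data;
let best = 0;
for (let i = 1; i < scores.length; ++i) if (scores[i] > scores[best]) best = i;
console.log(model.config.id2label[best]);
```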
- */ -export class ConvNextForImageClassification extends ConvNextPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ConvNextV2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare ConvNextV2 model outputting raw features without any specific head on top. - */ -export class ConvNextV2Model extends ConvNextV2PreTrainedModel {} - -/** - * ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. - */ -export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Dinov2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top. - */ -export class Dinov2Model extends Dinov2PreTrainedModel {} - -/** - * Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. - */ -export class Dinov2ForImageClassification extends Dinov2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Dinov2WithRegistersPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top. - */ -export class Dinov2WithRegistersModel extends Dinov2WithRegistersPreTrainedModel {} - -/** - * Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. 
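Beyond classification, the bare DINOv2-family backbones defined above are commonly used as plain feature extractors. A sketch of pulling patch-level features out of the bare model; the `Xenova/dinov2-small` checkpoint id is an assumption (substitute any ONNX conversion of a DINOv2 checkpoint).

```javascript
import { AutoProcessor, AutoModel, RawImage } from '@huggingface/transformers';

const model_id = 'Xenova/dinov2-small'; // assumption: any DINOv2 ONNX conversion
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModel.from_pretrained(model_id);

const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
const inputs = await processor(image);

// Patch-level features: [batch_size, num_patches + 1, hidden_size], CLS token first
const { last_hidden_state } = await model(inputs);
console.log(last_hidden_state.dims);
```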
- */ -export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DINOv3ViTPreTrainedModel extends PreTrainedModel {} -export class DINOv3ViTModel extends DINOv3ViTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DINOv3ConvNextPreTrainedModel extends PreTrainedModel {} -export class DINOv3ConvNextModel extends DINOv3ConvNextPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class GroundingDinoPreTrainedModel extends PreTrainedModel {} -export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel {} - -////////////////////////////////////////////////// -export class YolosPreTrainedModel extends PreTrainedModel {} -export class YolosModel extends YolosPreTrainedModel {} -export class YolosForObjectDetection extends YolosPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new YolosObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class YolosObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class SamPreTrainedModel extends PreTrainedModel {} - -/** - * Segment Anything Model (SAM) for generating segmentation masks, given an input image - * and optional 2D location and bounding boxes. - * - * **Example:** Perform mask generation w/ `Xenova/sam-vit-base`. - * ```javascript - * import { SamModel, AutoProcessor, RawImage } from '@huggingface/transformers'; - * - * const model = await SamModel.from_pretrained('Xenova/sam-vit-base'); - * const processor = await AutoProcessor.from_pretrained('Xenova/sam-vit-base'); - * - * const img_url = 'https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png'; - * const raw_image = await RawImage.read(img_url); - * const input_points = [[[450, 600]]] // 2D localization of a window - * - * const inputs = await processor(raw_image, { input_points }); - * const outputs = await model(inputs); - * - * const masks = await processor.post_process_masks(outputs.pred_masks, inputs.original_sizes, inputs.reshaped_input_sizes); - * // [ - * // Tensor { - * // dims: [ 1, 3, 1764, 2646 ], - * // type: 'bool', - * // data: Uint8Array(14002632) [ ... 
], - * // size: 14002632 - * // } - * // ] - * const scores = outputs.iou_scores; - * // Tensor { - * // dims: [ 1, 1, 3 ], - * // type: 'float32', - * // data: Float32Array(3) [ - * // 0.8892380595207214, - * // 0.9311248064041138, - * // 0.983696699142456 - * // ], - * // size: 3 - * // } - * ``` - */ -export class SamModel extends SamPreTrainedModel { - /** - * Compute image embeddings and positional image embeddings, given the pixel values of an image. - * @param {Object} model_inputs Object containing the model inputs. - * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. - * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. - */ - async get_image_embeddings({ pixel_values }) { - // in: - // - pixel_values: tensor.float32[batch_size,3,1024,1024] - // - // out: - // - image_embeddings: tensor.float32[batch_size,256,64,64] - // - image_positional_embeddings: tensor.float32[batch_size,256,64,64] - return await encoderForward(this, { pixel_values }); - } - - /** - * @typedef {Object} SamModelInputs Object containing the model inputs. - * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. - * These can be obtained using a `SamProcessor`. - * @property {Tensor} [input_points] Input 2D spatial points with shape `(batch_size, num_points, 2)`. - * This is used by the prompt encoder to encode the prompt. - * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. - * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: - * - `1`: the point is a point that contains the object of interest - * - `0`: the point is a point that does not contain the object of interest - * - `-1`: the point corresponds to the background - * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder - * @property {Tensor} [input_boxes] Input bounding boxes with shape `(batch_size, num_boxes, 4)`. - * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. - * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. - */ - - /** - * @param {SamModelInputs} model_inputs Object containing the model inputs. - * @returns {Promise} The output of the model. 
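Because the image encoder is by far the most expensive part of SAM, `get_image_embeddings` (documented above) can be run once and its outputs passed back in through these inputs to decode any number of prompts. A sketch reusing the `Xenova/sam-vit-base` checkpoint from the example above; the second click coordinate is arbitrary and purely illustrative.

```javascript
import { SamModel, AutoProcessor, RawImage } from '@huggingface/transformers';

const model = await SamModel.from_pretrained('Xenova/sam-vit-base');
const processor = await AutoProcessor.from_pretrained('Xenova/sam-vit-base');

const image = await RawImage.read('https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png');

// Encode the image once (the expensive part)...
const image_inputs = await processor(image);
const embeddings = await model.get_image_embeddings(image_inputs);

// ...then decode masks for several point prompts without re-running the encoder
for (const input_points of [[[[450, 600]]], [[[800, 500]]]]) {
  const prompt_inputs = await processor(image, { input_points });
  const outputs = await model({ ...prompt_inputs, ...embeddings });
  // outputs.pred_masks: [1, 1, 3, 256, 256], outputs.iou_scores: [1, 1, 3]
}
```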
- */ - async forward(model_inputs) { - if (!model_inputs.image_embeddings || !model_inputs.image_positional_embeddings) { - // Compute the image embeddings if they are missing - model_inputs = { - ...model_inputs, - ...(await this.get_image_embeddings(model_inputs)), - }; - } else { - model_inputs = { ...model_inputs }; - } - - // Set default input labels if they are missing - model_inputs.input_labels ??= ones(model_inputs.input_points.dims.slice(0, -1)); - - const decoder_inputs = { - image_embeddings: model_inputs.image_embeddings, - image_positional_embeddings: model_inputs.image_positional_embeddings, - }; - if (model_inputs.input_points) { - decoder_inputs.input_points = model_inputs.input_points; - } - if (model_inputs.input_labels) { - decoder_inputs.input_labels = model_inputs.input_labels; - } - if (model_inputs.input_boxes) { - decoder_inputs.input_boxes = model_inputs.input_boxes; - } - - // Returns: - // - iou_scores: tensor.float32[batch_size,point_batch_size,3] - // - pred_masks: tensor.float32[batch_size,point_batch_size,3,256,256] - return await sessionRun(this.sessions['prompt_encoder_mask_decoder'], decoder_inputs); - } - - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new SamImageSegmentationOutput(await super._call(model_inputs)); - } -} - -/** - * Base class for Segment-Anything model's output. - */ -export class SamImageSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.iou_scores The output logits of the model. - * @param {Tensor} output.pred_masks Predicted boxes. - */ - constructor({ iou_scores, pred_masks }) { - super(); - this.iou_scores = iou_scores; - this.pred_masks = pred_masks; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Sam2ImageSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.iou_scores The output logits of the model. - * @param {Tensor} output.pred_masks Predicted boxes. - * @param {Tensor} output.object_score_logits Logits for the object score, indicating if an object is present. - */ - constructor({ iou_scores, pred_masks, object_score_logits }) { - super(); - this.iou_scores = iou_scores; - this.pred_masks = pred_masks; - this.object_score_logits = object_score_logits; - } -} - -export class Sam2PreTrainedModel extends PreTrainedModel {} -export class Sam2Model extends Sam2PreTrainedModel { - /** - * Compute image embeddings and positional image embeddings, given the pixel values of an image. - * @param {Object} model_inputs Object containing the model inputs. - * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `Sam2Processor`. - * @returns {Promise>} The image embeddings. 
- */ - async get_image_embeddings({ pixel_values }) { - // in: - // - pixel_values: tensor.float32[batch_size,3,1024,1024] - // - // out: - // - image_embeddings.0: tensor.float32[batch_size,32,256,256] - // - image_embeddings.1: tensor.float32[batch_size,64,128,128] - // - image_embeddings.2: tensor.float32[batch_size,256,64,64] - return await encoderForward(this, { pixel_values }); - } - - async forward(model_inputs) { - // @ts-expect-error ts(2339) - const { num_feature_levels } = this.config.vision_config; - const image_embeddings_name = Array.from({ length: num_feature_levels }, (_, i) => `image_embeddings.${i}`); - - if (image_embeddings_name.some((name) => !model_inputs[name])) { - // Compute the image embeddings if they are missing - model_inputs = { - ...model_inputs, - ...(await this.get_image_embeddings(model_inputs)), - }; - } else { - model_inputs = { ...model_inputs }; - } - - if (model_inputs.input_points) { - if (model_inputs.input_boxes && model_inputs.input_boxes.dims[1] !== 1) { - throw new Error( - 'When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.', - ); - } - const shape = model_inputs.input_points.dims; - model_inputs.input_labels ??= ones(shape.slice(0, -1)); - model_inputs.input_boxes ??= full([shape[0], 0, 4], 0.0); - } else if (model_inputs.input_boxes) { - // only boxes - const shape = model_inputs.input_boxes.dims; - model_inputs.input_labels = full([shape[0], shape[1], 0], -1n); - model_inputs.input_points = full([shape[0], 1, 0, 2], 0.0); - } else { - throw new Error('At least one of `input_points` or `input_boxes` must be provided.'); - } - - const prompt_encoder_mask_decoder_session = this.sessions['prompt_encoder_mask_decoder']; - const decoder_inputs = pick(model_inputs, prompt_encoder_mask_decoder_session.inputNames); - - // Returns: - // - iou_scores: tensor.float32[batch_size,num_boxes_or_points,3] - // - pred_masks: tensor.float32[batch_size,num_boxes_or_points,3,256,256] - // - object_score_logits: tensor.float32[batch_size,num_boxes_or_points,1] - return await sessionRun(prompt_encoder_mask_decoder_session, decoder_inputs); - } - - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new Sam2ImageSegmentationOutput(await super._call(model_inputs)); - } -} -export class EdgeTamModel extends Sam2Model {} // NOTE: extends Sam2Model -export class Sam3TrackerModel extends Sam2Model {} // NOTE: extends Sam2Model -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MarianMT models -export class MarianPreTrainedModel extends PreTrainedModel {} - -export class MarianModel extends MarianPreTrainedModel {} - -export class MarianMTModel extends MarianPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// M2M100 models -export class M2M100PreTrainedModel extends PreTrainedModel {} - -export class M2M100Model extends M2M100PreTrainedModel {} - -export class M2M100ForConditionalGeneration extends M2M100PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Wav2Vec2 models -export class Wav2Vec2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top. 
- * - * **Example:** Load and run a `Wav2Vec2Model` for feature extraction. - * - * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/mms-300m'); - * const audio = await read_audio('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac', 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/mms-300m'); - * const output = await model(inputs); - * // { - * // last_hidden_state: Tensor { - * // dims: [ 1, 1144, 1024 ], - * // type: 'float32', - * // data: Float32Array(1171456) [ ... ], - * // size: 1171456 - * // } - * // } - * ``` - */ -export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel {} - -export class Wav2Vec2ForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization. - */ -export class Wav2Vec2ForAudioFrameClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Parakeet models -export class ParakeetPreTrainedModel extends PreTrainedModel {} -export class ParakeetForCTC extends ParakeetPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// PyAnnote models -export class PyAnnotePreTrainedModel extends PreTrainedModel {} - -/** - * The bare PyAnnote Model transformer outputting raw hidden-states without any specific head on top. - */ -export class PyAnnoteModel extends PyAnnotePreTrainedModel {} - -/** - * PyAnnote Model with a frame classification head on top for tasks like Speaker Diarization. - * - * **Example:** Load and run a `PyAnnoteForAudioFrameClassification` for speaker diarization. 
- * - * ```javascript - * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; - * - * // Load model and processor - * const model_id = 'onnx-community/pyannote-segmentation-3.0'; - * const model = await AutoModelForAudioFrameClassification.from_pretrained(model_id); - * const processor = await AutoProcessor.from_pretrained(model_id); - * - * // Read and preprocess audio - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.wav'; - * const audio = await read_audio(url, processor.feature_extractor.config.sampling_rate); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const { logits } = await model(inputs); - * // { - * // logits: Tensor { - * // dims: [ 1, 767, 7 ], // [batch_size, num_frames, num_classes] - * // type: 'float32', - * // data: Float32Array(5369) [ ... ], - * // size: 5369 - * // } - * // } - * - * const result = processor.post_process_speaker_diarization(logits, audio.length); - * // [ - * // [ - * // { id: 0, start: 0, end: 1.0512535626298245, confidence: 0.8220156481664611 }, - * // { id: 2, start: 1.0512535626298245, end: 2.3398869619825127, confidence: 0.9008811707860472 }, - * // ... - * // ] - * // ] - * - * // Display result - * console.table(result[0], ['start', 'end', 'id', 'confidence']); - * // ┌─────────┬────────────────────┬────────────────────┬────┬─────────────────────┐ - * // │ (index) │ start │ end │ id │ confidence │ - * // ├─────────┼────────────────────┼────────────────────┼────┼─────────────────────┤ - * // │ 0 │ 0 │ 1.0512535626298245 │ 0 │ 0.8220156481664611 │ - * // │ 1 │ 1.0512535626298245 │ 2.3398869619825127 │ 2 │ 0.9008811707860472 │ - * // │ 2 │ 2.3398869619825127 │ 3.5946089560890773 │ 0 │ 0.7521651315796233 │ - * // │ 3 │ 3.5946089560890773 │ 4.578039708226655 │ 2 │ 0.8491978128022479 │ - * // │ 4 │ 4.578039708226655 │ 4.594995410849717 │ 0 │ 0.2935352600416393 │ - * // │ 5 │ 4.594995410849717 │ 6.121008646925269 │ 3 │ 0.6788051309866024 │ - * // │ 6 │ 6.121008646925269 │ 6.256654267909762 │ 0 │ 0.37125512393851134 │ - * // │ 7 │ 6.256654267909762 │ 8.630452635138397 │ 2 │ 0.7467035186353542 │ - * // │ 8 │ 8.630452635138397 │ 10.088643060721703 │ 0 │ 0.7689364814666032 │ - * // │ 9 │ 10.088643060721703 │ 12.58113134631177 │ 2 │ 0.9123324509131324 │ - * // │ 10 │ 12.58113134631177 │ 13.005023911888312 │ 0 │ 0.4828358177572041 │ - * // └─────────┴────────────────────┴────────────────────┴────┴─────────────────────┘ - * ``` - */ -export class PyAnnoteForAudioFrameClassification extends PyAnnotePreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// WeSpeakerResNet models -export class WeSpeakerResNetPreTrainedModel extends PreTrainedModel {} -export class WeSpeakerResNetModel extends WeSpeakerResNetPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// UniSpeech models -export class UniSpeechPreTrainedModel extends PreTrainedModel {} - -/** - * The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top. 
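CTC heads such as `UniSpeechForCTC` below (and `Wav2Vec2ForCTC` / `HubertForCTC` elsewhere in this file) are normally driven through the automatic-speech-recognition pipeline, which handles feature extraction and greedy CTC decoding of the per-frame logits. A minimal sketch; the `Xenova/wav2vec2-base-960h` checkpoint id is an assumption.

```javascript
import { pipeline } from '@huggingface/transformers';

const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/wav2vec2-base-960h');

const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
const output = await transcriber(url);
// e.g. { text: 'AND SO MY FELLOW AMERICANS ...' } (illustrative)
```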
- */ -export class UniSpeechModel extends UniSpeechPreTrainedModel {} - -/** - * UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class UniSpeechForCTC extends UniSpeechPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class UniSpeechForSequenceClassification extends UniSpeechPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// UniSpeechSat models -export class UniSpeechSatPreTrainedModel extends PreTrainedModel {} - -/** - * The bare UniSpeechSat Model transformer outputting raw hidden-states without any specific head on top. - */ -export class UniSpeechSatModel extends UniSpeechSatPreTrainedModel {} - -/** - * UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class UniSpeechSatForCTC extends UniSpeechSatPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * UniSpeechSat Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class UniSpeechSatForSequenceClassification extends UniSpeechSatPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * UniSpeechSat Model with a frame classification head on top for tasks like Speaker Diarization. - */ -export class UniSpeechSatForAudioFrameClassification extends UniSpeechSatPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Wav2Vec2Bert models -export class Wav2Vec2BertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Wav2Vec2Bert Model transformer outputting raw hidden-states without any specific head on top. 
- */ -export class Wav2Vec2BertModel extends Wav2Vec2BertPreTrainedModel {} - -/** - * Wav2Vec2Bert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class Wav2Vec2BertForCTC extends Wav2Vec2BertPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_features Float values of input mel-spectrogram. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * Wav2Vec2Bert Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class Wav2Vec2BertForSequenceClassification extends Wav2Vec2BertPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Hubert models -export class HubertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Hubert Model transformer outputting raw hidden-states without any specific head on top. - * - * **Example:** Load and run a `HubertModel` for feature extraction. - * - * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/hubert-base-ls960'); - * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); - * const inputs = await processor(audio); - * - * // Load and run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/hubert-base-ls960'); - * const output = await model(inputs); - * // { - * // last_hidden_state: Tensor { - * // dims: [ 1, 549, 768 ], - * // type: 'float32', - * // data: Float32Array(421632) [0.0682469978928566, 0.08104046434164047, -0.4975186586380005, ...], - * // size: 421632 - * // } - * // } - * ``` - */ -export class HubertModel extends Wav2Vec2PreTrainedModel {} - -/** - * Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class HubertForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB Keyword Spotting. - */ -export class HubertForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. 
- */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// WavLM models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class WavLMPreTrainedModel extends PreTrainedModel {} - -/** - * The bare WavLM Model transformer outputting raw hidden-states without any specific head on top. - * - * **Example:** Load and run a `WavLMModel` for feature extraction. - * - * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base'); - * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/wavlm-base'); - * const output = await model(inputs); - * // { - * // last_hidden_state: Tensor { - * // dims: [ 1, 549, 768 ], - * // type: 'float32', - * // data: Float32Array(421632) [-0.349443256855011, -0.39341306686401367, 0.022836603224277496, ...], - * // size: 421632 - * // } - * // } - * ``` - */ -export class WavLMModel extends WavLMPreTrainedModel {} - -/** - * WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class WavLMForCTC extends WavLMPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * WavLM Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class WavLMForSequenceClassification extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification. - * - * **Example:** Extract speaker embeddings with `WavLMForXVector`. 
- * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sv'); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; - * const audio = await read_audio(url, 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/wavlm-base-plus-sv'); - * const outputs = await model(inputs); - * // { - * // logits: Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [0.5847219228744507, ...], - * // size: 512 - * // }, - * // embeddings: Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [-0.09079201519489288, ...], - * // size: 512 - * // } - * // } - * ``` - */ -export class WavLMForXVector extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits and speaker embeddings. - */ - async _call(model_inputs) { - return new XVectorOutput(await super._call(model_inputs)); - } -} - -/** - * WavLM Model with a frame classification head on top for tasks like Speaker Diarization. - * - * **Example:** Perform speaker diarization with `WavLMForAudioFrameClassification`. - * ```javascript - * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sd'); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; - * const audio = await read_audio(url, 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModelForAudioFrameClassification.from_pretrained('Xenova/wavlm-base-plus-sd'); - * const { logits } = await model(inputs); - * // { - * // logits: Tensor { - * // dims: [ 1, 549, 2 ], // [batch_size, num_frames, num_speakers] - * // type: 'float32', - * // data: Float32Array(1098) [-3.5301010608673096, ...], - * // size: 1098 - * // } - * // } - * - * const labels = logits[0].sigmoid().tolist().map( - * frames => frames.map(speaker => speaker > 0.5 ? 1 : 0) - * ); - * console.log(labels); // labels is a one-hot array of shape (num_frames, num_speakers) - * // [ - * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], - * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], - * // [0, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], - * // ... - * // ] - * ``` - */ -export class WavLMForAudioFrameClassification extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -export class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel {} -export class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel {} - -////////////////////////////////////////////////// -// SpeechT5 models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
- */ -export class SpeechT5PreTrainedModel extends PreTrainedModel {} - -/** - * The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets. - */ -export class SpeechT5Model extends SpeechT5PreTrainedModel {} - -/** - * SpeechT5 Model with a speech encoder and a text decoder. - * - * **Example:** Generate speech from text with `SpeechT5ForTextToSpeech`. - * ```javascript - * import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@huggingface/transformers'; - * - * // Load the tokenizer and processor - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts'); - * const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts'); - * - * // Load the models - * // NOTE: We use the full-precision versions as they are more accurate - * const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { dtype: 'fp32' }); - * const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { dtype: 'fp32' }); - * - * // Load speaker embeddings from URL - * const speaker_embeddings_data = new Float32Array( - * await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer() - * ); - * const speaker_embeddings = new Tensor( - * 'float32', - * speaker_embeddings_data, - * [1, speaker_embeddings_data.length] - * ) - * - * // Run tokenization - * const { input_ids } = tokenizer('Hello, my dog is cute'); - * - * // Generate waveform - * const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder }); - * console.log(waveform) - * // Tensor { - * // dims: [ 26112 ], - * // type: 'float32', - * // size: 26112, - * // data: Float32Array(26112) [ -0.00043630177970044315, -0.00018082228780258447, ... ], - * // } - * ``` - */ -export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel {} - -/** - * SpeechT5 Model with a text encoder and a speech decoder. - */ -export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { - /** - * @typedef {Object} SpeechOutput - * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape - * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided. - * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. - * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape - * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. Returned when `output_cross_attentions` is `true`. - */ - - /** - * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. - * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. - * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. - * @param {Object} options Optional parameters for generating speech. - * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. - * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. - * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence.
- * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. - * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. - * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. - */ - async generate_speech( - input_values, - speaker_embeddings, - { - threshold = 0.5, - minlenratio = 0.0, - maxlenratio = 20.0, - vocoder = null, - // output_cross_attentions = false, // TODO add - } = {}, - ) { - const model_inputs = { - input_ids: input_values, - }; - - const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs); - - // @ts-expect-error TS2339 - const r = encoder_outputs.dims[1] / this.config.reduction_factor; - const maxlen = Math.floor(r * maxlenratio); - const minlen = Math.floor(r * minlenratio); - - // @ts-expect-error TS2339 - const num_mel_bins = this.config.num_mel_bins; - - let spectrogramParts = []; - let past_key_values = null; - let decoder_outputs = null; - let idx = 0; - - while (true) { - ++idx; - - const use_cache_branch = boolTensor(!!decoder_outputs); - let output_sequence; - if (decoder_outputs) { - output_sequence = decoder_outputs.output_sequence_out; - } else { - output_sequence = new Tensor('float32', new Float32Array(num_mel_bins), [1, 1, num_mel_bins]); - } - let decoderFeeds = { - use_cache_branch, - output_sequence, - encoder_attention_mask: encoder_attention_mask, - speaker_embeddings: speaker_embeddings, - encoder_hidden_states: encoder_outputs, - }; - - this.addPastKeyValues(decoderFeeds, past_key_values); - decoder_outputs = await sessionRun(this.sessions['decoder_model_merged'], decoderFeeds); - past_key_values = this.getPastKeyValues(decoder_outputs, past_key_values); - - const { prob, spectrum } = decoder_outputs; - spectrogramParts.push(spectrum); - - if ( - idx >= minlen && - // Finished when stop token or maximum length is reached. - (Array.from(prob.data).filter((p) => p >= threshold).length > 0 || idx >= maxlen) - ) { - break; - } - } - - const spectrogram = cat(spectrogramParts); - const { waveform } = await sessionRun(vocoder.sessions['model'], { spectrogram }); - - return { - spectrogram, - waveform, - // cross_attentions: null, // TODO add - }; - } -} - -/** - * HiFi-GAN vocoder. - * - * See [SpeechT5ForSpeechToText](./models#module_models.SpeechT5ForSpeechToText) for example usage. - */ -export class SpeechT5HifiGan extends PreTrainedModel { - main_input_name = 'spectrogram'; -} -////////////////////////////////////////////////// - -export class SupertonicPreTrainedModel extends PreTrainedModel {} -export class SupertonicForConditionalGeneration extends SupertonicPreTrainedModel { - async generate_speech({ - // Required inputs - input_ids, - attention_mask, - style, - - // Optional inputs - num_inference_steps = 5, - speed = 1.05, - }) { - // @ts-expect-error TS2339 - const { sampling_rate, chunk_compress_factor, base_chunk_size, latent_dim } = this.config; - - // 1. Text Encoder - const { last_hidden_state, durations } = await sessionRun(this.sessions['text_encoder'], { - input_ids, - attention_mask, - style, - }); - durations.div_(speed); // Apply speed factor to duration - - // 2. 
Latent Denoiser - const wav_len_max = durations.max().item() * sampling_rate; - const chunk_size = base_chunk_size * chunk_compress_factor; - const latent_len = Math.floor((wav_len_max + chunk_size - 1) / chunk_size); - const batch_size = input_ids.dims[0]; - const latent_mask = ones([batch_size, latent_len]); - const num_steps = full([batch_size], num_inference_steps); - - let noisy_latents = randn([batch_size, latent_dim * chunk_compress_factor, latent_len]); - for (let step = 0; step < num_inference_steps; ++step) { - const timestep = full([batch_size], step); - ({ denoised_latents: noisy_latents } = await sessionRun(this.sessions['latent_denoiser'], { - style, - noisy_latents, - latent_mask, - encoder_outputs: last_hidden_state, - attention_mask, - timestep, - num_inference_steps: num_steps, - })); - } - - // 3. Voice Decoder - const { waveform } = await sessionRun(this.sessions['voice_decoder'], { - latents: noisy_latents, - }); - return { - waveform, - durations, - }; - } -} - -////////////////////////////////////////////////// -// TrOCR models -export class TrOCRPreTrainedModel extends PreTrainedModel {} - -/** - * The TrOCR Decoder with a language modeling head. - */ -export class TrOCRForCausalLM extends TrOCRPreTrainedModel {} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Mistral models -/** - * The bare Mistral Model outputting raw hidden-states without any specific head on top. - */ -export class MistralPreTrainedModel extends PreTrainedModel {} - -export class MistralModel extends MistralPreTrainedModel {} - -export class MistralForCausalLM extends MistralPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ERNIE-4.5 models -export class Ernie4_5_PretrainedModel extends PreTrainedModel {} - -export class Ernie4_5_Model extends Ernie4_5_PretrainedModel {} - -export class Ernie4_5_ForCausalLM extends Ernie4_5_PretrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Starcoder2 models -/** - * The bare Starcoder2 Model outputting raw hidden-states without any specific head on top. - */ -export class Starcoder2PreTrainedModel extends PreTrainedModel {} - -export class Starcoder2Model extends Starcoder2PreTrainedModel {} - -export class Starcoder2ForCausalLM extends Starcoder2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Falcon models -/** - * The bare Falcon Model outputting raw hidden-states without any specific head on top. - */ -export class FalconPreTrainedModel extends PreTrainedModel {} - -export class FalconModel extends FalconPreTrainedModel {} - -export class FalconForCausalLM extends FalconPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CLAP models -export class ClapPreTrainedModel extends PreTrainedModel {} - -export class ClapModel extends ClapPreTrainedModel {} - -/** - * CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). - * - * **Example:** Compute text embeddings with `ClapTextModelWithProjection`. 
- * - * ```javascript - * import { AutoTokenizer, ClapTextModelWithProjection } from '@huggingface/transformers'; - * - * // Load tokenizer and text model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused'); - * const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); - * - * // Run tokenization - * const texts = ['a sound of a cat', 'a sound of a dog']; - * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Compute embeddings - * const { text_embeds } = await text_model(text_inputs); - * // Tensor { - * // dims: [ 2, 512 ], - * // type: 'float32', - * // data: Float32Array(1024) [ ... ], - * // size: 1024 - * // } - * ``` - */ -export class ClapTextModelWithProjection extends ClapPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). - * - * **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`. - * - * ```javascript - * import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@huggingface/transformers'; - * - * // Load processor and audio model - * const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused'); - * const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); - * - * // Read audio and run processor - * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav'); - * const audio_inputs = await processor(audio); - * - * // Compute embeddings - * const { audio_embeds } = await audio_model(audio_inputs); - * // Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [ ... ], - * // size: 512 - * // } - * ``` - */ -export class ClapAudioModelWithProjection extends ClapPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'audio_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// VITS models -export class VitsPreTrainedModel extends PreTrainedModel {} - -/** - * The complete VITS model, for text-to-speech synthesis. - * - * **Example:** Generate speech from text with `VitsModel`. - * ```javascript - * import { AutoTokenizer, VitsModel } from '@huggingface/transformers'; - * - * // Load the tokenizer and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/mms-tts-eng'); - * const model = await VitsModel.from_pretrained('Xenova/mms-tts-eng'); - * - * // Run tokenization - * const inputs = tokenizer('I love transformers'); - * - * // Generate waveform - * const { waveform } = await model(inputs); - * // Tensor { - * // dims: [ 1, 35328 ], - * // type: 'float32', - * // data: Float32Array(35328) [ ... 
], - * // size: 35328, - * // } - * ``` - */ -export class VitsModel extends VitsPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} The outputs for the VITS model. - */ - async _call(model_inputs) { - return new VitsModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Segformer models -export class SegformerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. - */ -export class SegformerModel extends SegformerPreTrainedModel {} - -/** - * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet. - */ -export class SegformerForImageClassification extends SegformerPreTrainedModel {} - -/** - * SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes. - */ -export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel {} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// StableLm models -export class StableLmPreTrainedModel extends PreTrainedModel {} - -/** - * The bare StableLm Model transformer outputting raw hidden-states without any specific head on top. - */ -export class StableLmModel extends StableLmPreTrainedModel {} - -/** - * StableLm Model with a `language modeling` head on top for Causal Language Modeling (with past). - */ -export class StableLmForCausalLM extends StableLmPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class EfficientNetPreTrainedModel extends PreTrainedModel {} - -/** - * The bare EfficientNet model outputting raw features without any specific head on top. - */ -export class EfficientNetModel extends EfficientNetPreTrainedModel {} - -/** - * EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features). - */ -export class EfficientNetForImageClassification extends EfficientNetPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Musicgen models -export class MusicgenPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Musicgen decoder model outputting raw hidden-states without any specific head on top. - */ -export class MusicgenModel extends MusicgenPreTrainedModel {} - -/** - * The MusicGen decoder model with a language modelling head on top. - */ -export class MusicgenForCausalLM extends MusicgenPreTrainedModel {} - -/** - * The composite MusicGen model with a text encoder, audio encoder and Musicgen decoder, - * for music generation tasks with one or both of text and audio prompts. - * - * **Example:** Generate music from text with `Xenova/musicgen-small`. 
- * ```javascript - * import { AutoTokenizer, MusicgenForConditionalGeneration } from '@huggingface/transformers'; - * - * // Load tokenizer and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/musicgen-small'); - * const model = await MusicgenForConditionalGeneration.from_pretrained( - * 'Xenova/musicgen-small', { dtype: 'fp32' } - * ); - * - * // Prepare text input - * const prompt = '80s pop track with bassy drums and synth'; - * const inputs = tokenizer(prompt); - * - * // Generate audio - * const audio_values = await model.generate({ - * ...inputs, - * max_new_tokens: 512, - * do_sample: true, - * guidance_scale: 3, - * }); - * - * // (Optional) Write the output to a WAV file - * import wavefile from 'wavefile'; - * import fs from 'fs'; - * - * const wav = new wavefile.WaveFile(); - * wav.fromScratch(1, model.config.audio_encoder.sampling_rate, '32f', audio_values.data); - * fs.writeFileSync('musicgen_out.wav', wav.toBuffer()); - * ``` - */ -export class MusicgenForConditionalGeneration extends PreTrainedModel { - // NOTE: not MusicgenPreTrainedModel - forward_params = [ - 'input_ids', - 'attention_mask', - 'encoder_outputs', - 'decoder_input_ids', - 'decoder_attention_mask', - 'past_key_values', - ]; - - /** - * Apply the pattern mask to the final ids, - * then revert the pattern delay mask by filtering the pad token id in a single step. - * @param {Tensor} outputs The output tensor from the model. - * @returns {Tensor} The filtered output tensor. - */ - _apply_and_filter_by_delay_pattern_mask(outputs) { - const [bs_x_codebooks, seqLength] = outputs.dims; - // @ts-expect-error TS2339 - const num_codebooks = this.config.decoder.num_codebooks; - const upperBound = seqLength - num_codebooks; - - let newDataSize = 0; - for (let i = 0; i < outputs.size; ++i) { - // @ts-expect-error TS2339 - if (outputs.data[i] === this.config.decoder.pad_token_id) { - continue; - } - - const row = i % seqLength; - const col = Math.floor(i / seqLength) % num_codebooks; - - const diff = row - col; - if (diff > 0 && diff <= upperBound) { - outputs.data[newDataSize++] = outputs.data[i]; - } - } - - const batch_size = Math.floor(bs_x_codebooks / num_codebooks); - const inferred = newDataSize / (batch_size * num_codebooks); - // TODO: assert `inferred` is an integer - return new Tensor(outputs.type, outputs.data.slice(0, newDataSize), [batch_size, num_codebooks, inferred]); - } - - prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { - // apply the delay pattern mask - let clonedInputIds = structuredClone(input_ids); - for (let i = 0; i < clonedInputIds.length; ++i) { - for (let j = 0; j < clonedInputIds[i].length; ++j) { - // @ts-expect-error TS2339 - if (i % this.config.decoder.num_codebooks >= j) { - // @ts-expect-error TS2339 - clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id); - } - } - } - // for classifier free guidance we need to replicate the decoder args across the batch dim - // (we'll split these before sampling) - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - // [batch, seqLength] -> [2 * batch, seqLength] - clonedInputIds = clonedInputIds.concat(clonedInputIds); - } - - const prepped = super.prepare_inputs_for_generation(clonedInputIds, model_inputs, generation_config); - return prepped; - } - - /** - * Generates sequences of token ids for models with a language modeling head. 
- * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. - */ - async generate(options) { - const output_ids = await super.generate(options); - - // apply the pattern mask to the final ids - // tensor: int64[1,batch_size,4,chunk_length] - const audio_codes = this._apply_and_filter_by_delay_pattern_mask(/** @type {Tensor} */ (output_ids)).unsqueeze_( - 0, - ); // append the frame dimension back to the audio codes - - const { audio_values } = await sessionRun(this.sessions['encodec_decode'], { audio_codes }); - - return audio_values; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV1 models -export class MobileNetV1PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV1 model outputting raw hidden-states without any specific head on top. - */ -export class MobileNetV1Model extends MobileNetV1PreTrainedModel {} - -/** - * MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV2 models -export class MobileNetV2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV2 model outputting raw hidden-states without any specific head on top. - */ -export class MobileNetV2Model extends MobileNetV2PreTrainedModel {} - -/** - * MobileNetV2 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV3 models -export class MobileNetV3PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV3 model outputting raw hidden-states without any specific head on top. - */ -export class MobileNetV3Model extends MobileNetV3PreTrainedModel {} - -/** - * MobileNetV3 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV4 models -export class MobileNetV4PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV4 model outputting raw hidden-states without any specific head on top. 
- */ -export class MobileNetV4Model extends MobileNetV4PreTrainedModel {} - -/** - * MobileNetV4 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Decision Transformer models -export class DecisionTransformerPreTrainedModel extends PreTrainedModel {} - -/** - * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting. - * Refer to the paper for more details: https://huggingface.co/papers/2106.01345 - */ -export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel {} - -////////////////////////////////////////////////// - -export class MultiModalityPreTrainedModel extends PreTrainedModel {} -export class MultiModalityCausalLM extends MultiModalityPreTrainedModel { - forward_params = [ - // prepare_inputs_embeds - 'input_ids', - 'pixel_values', - 'images_seq_mask', - 'images_emb_mask', - - // language_model - 'attention_mask', - 'position_ids', - 'past_key_values', - ]; - - /** - * @param {ConstructorParameters} args - */ - constructor(...args) { - super(...args); - - // State-based approach to switch out which heads to use during generation - this._generation_mode = 'text'; - } - - async forward(model_inputs) { - const mode = this._generation_mode ?? 'text'; - - // TODO support re-using PKVs for input_ids.dims[1] !== 1 - // if (model_inputs.past_key_values) { - // // && model_inputs.input_ids.dims[1] === 1 - // } - - let output_1; - if (mode === 'text' || !model_inputs.past_key_values) { - const session = this.sessions['prepare_inputs_embeds']; - const prep_inputs = pick(model_inputs, session.inputNames); - output_1 = await sessionRun(session, prep_inputs); - } else { - const session = this.sessions['gen_img_embeds']; - const prep_inputs = pick( - { - image_ids: model_inputs.input_ids, - }, - session.inputNames, - ); - output_1 = await sessionRun(session, prep_inputs); - } - - const input_2 = { ...model_inputs, ...output_1 }; - const output_2 = await decoderForward(this, input_2); - - const head = this.sessions[mode === 'text' ? 'lm_head' : 'gen_head']; - if (!head) { - throw new Error(`Unable to find "${head}" generation head`); - } - - const output_3 = await sessionRun(head, pick(output_2, head.inputNames)); - - return { - ...output_1, - ...output_2, - ...output_3, - }; - } - - /** - * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - */ - async generate(options) { - this._generation_mode = 'text'; - return super.generate(options); - } - - /** - * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - */ - async generate_images(options) { - this._generation_mode = 'image'; - - const start_num_tokens = (options.inputs ?? 
options[this.main_input_name]).dims[1]; - const all_tokens = await super.generate(options); - - const generated_tokens = /** @type {Tensor} */ (all_tokens).slice(null, [start_num_tokens, null]); - - const image_decode = this.sessions['image_decode']; - const { decoded_image } = await sessionRun(image_decode, { - generated_tokens, - }); - - // Equivalent to `np.clip((dec + 1) / 2 * 255, 0, 255)` - const clamped = decoded_image - .add_(1) - .mul_(255 / 2) - .clamp_(0, 255) - .to('uint8'); - - // Return as a list of images - const images = []; - for (const tensor of clamped) { - const img = RawImage.fromTensor(tensor); - images.push(img); - } - return images; - } -} - -export class MgpstrModelOutput extends ModelOutput { - constructor({ char_logits, bpe_logits, wp_logits }) { - super(); - this.char_logits = char_logits; - this.bpe_logits = bpe_logits; - this.wp_logits = wp_logits; - } - - get logits() { - return [this.char_logits, this.bpe_logits, this.wp_logits]; - } -} - -export class MgpstrPreTrainedModel extends PreTrainedModel {} - -/** - * MGP-STR Model transformer with three classification heads on top - * (three A^3 modules and three linear layer on top of the transformer encoder output) for scene text recognition (STR). - */ -export class MgpstrForSceneTextRecognition extends MgpstrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new MgpstrModelOutput(await super._call(model_inputs)); - } -} - -////////////////////////////////////////////////// -// PatchTST Transformer models -export class PatchTSTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare PatchTST Model outputting raw hidden-states without any specific head. - */ -export class PatchTSTModel extends PatchTSTPreTrainedModel {} - -/** - * The PatchTST for prediction model. - */ -export class PatchTSTForPrediction extends PatchTSTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// PatchTSMixer Transformer models -export class PatchTSMixerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare PatchTSMixer Model outputting raw hidden-states without any specific head. - */ -export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel {} - -/** - * The PatchTSMixer for prediction model. - */ -export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class UltravoxPreTrainedModel extends PreTrainedModel { - forward_params = ['input_ids', 'attention_mask', 'position_ids', 'audio_values', 'past_key_values']; -} - -export class UltravoxModel extends UltravoxPreTrainedModel { - _merge_input_ids_with_audio_features(kwargs) { - const audio_hidden_size = kwargs.audio_features.dims.at(-1); - const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); - - return default_merge_input_ids_with_audio_features({ - // @ts-ignore - audio_token_id: this.config.ignore_index ?? 
this.config.audio_token_id, - ...kwargs, - audio_features: reshaped_audio_features, - }); - } -} -////////////////////////////////////////////////// - -export class VoxtralForConditionalGeneration extends UltravoxModel {} - -////////////////////////////////////////////////// -// Mimi models -export class MimiPreTrainedModel extends PreTrainedModel { - main_input_name = 'input_values'; - forward_params = ['input_values']; -} - -export class MimiEncoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. - */ - constructor({ audio_codes }) { - super(); - this.audio_codes = audio_codes; - } -} - -export class MimiDecoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. - */ - constructor({ audio_values }) { - super(); - this.audio_values = audio_values; - } -} - -/** - * The Mimi neural audio codec model. - */ -export class MimiModel extends MimiPreTrainedModel { - /** - * Encodes the input audio waveform into discrete codes. - * @param {Object} inputs Model inputs - * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). - * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. - */ - async encode(inputs) { - return new MimiEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); - } - - /** - * Decodes the given frames into an output audio waveform. - * @param {MimiEncoderOutput} inputs The encoded audio codes. - * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. - */ - async decode(inputs) { - return new MimiDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); - } -} - -export class MimiEncoderModel extends MimiPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'encoder_model', - }); - } -} -export class MimiDecoderModel extends MimiPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'decoder_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Dac models -export class DacPreTrainedModel extends PreTrainedModel { - main_input_name = 'input_values'; - forward_params = ['input_values']; -} - -export class DacEncoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. - */ - constructor({ audio_codes }) { - super(); - this.audio_codes = audio_codes; - } -} - -export class DacDecoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. 
- * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. - */ - constructor({ audio_values }) { - super(); - this.audio_values = audio_values; - } -} - -/** - * The DAC (Descript Audio Codec) model. - */ -export class DacModel extends DacPreTrainedModel { - /** - * Encodes the input audio waveform into discrete codes. - * @param {Object} inputs Model inputs - * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). - * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. - */ - async encode(inputs) { - return new DacEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); - } - - /** - * Decodes the given frames into an output audio waveform. - * @param {DacEncoderOutput} inputs The encoded audio codes. - * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. - */ - async decode(inputs) { - return new DacDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); - } -} - -export class DacEncoderModel extends DacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'encoder_model', - }); - } -} -export class DacDecoderModel extends DacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'decoder_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Snac models -export class SnacPreTrainedModel extends PreTrainedModel { - main_input_name = 'input_values'; - forward_params = ['input_values']; -} - -/** - * The SNAC (Multi-Scale Neural Audio Codec) model. - */ -export class SnacModel extends SnacPreTrainedModel { - /** - * Encodes the input audio waveform into discrete codes. - * @param {Object} inputs Model inputs - * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). - * @returns {Promise>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`. - */ - async encode(inputs) { - return await sessionRun(this.sessions['encoder_model'], inputs); - } - - /** - * Decodes the given frames into an output audio waveform. - * @param {Record} inputs The encoded audio codes. - * @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`. - */ - async decode(inputs) { - return await sessionRun(this.sessions['decoder_model'], inputs); - } -} - -export class SnacEncoderModel extends SnacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 
'encoder_model', - }); - } -} -export class SnacDecoderModel extends SnacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'decoder_model', - }); - } -} -////////////////////////////////////////////////// - -export class ChatterboxPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'inputs_embeds', - 'attention_mask', - 'position_ids', - 'audio_values', - 'exaggeration', - 'audio_features', - 'audio_tokens', - 'speaker_embeddings', - 'speaker_features', - 'past_key_values', - ]; - main_input_name = 'input_ids'; - - _return_dict_in_generate_keys = ['audio_tokens', 'speaker_embeddings', 'speaker_features']; -} -export class ChatterboxModel extends ChatterboxPreTrainedModel { - /** - * @param {Tensor} audio_values - * @returns {Promise<{audio_features: Tensor, audio_tokens: Tensor, speaker_embeddings: Tensor, speaker_features: Tensor}>} - */ - async encode_speech(audio_values) { - return sessionRun(this.sessions['speech_encoder'], { - audio_values, - }); - } - - async forward({ - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - audio_values = null, - exaggeration = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // Speaker embeddings/features (useful for re-using pre-computed speaker data) - audio_features = null, // float32[batch_size,sequence_length,1024] - audio_tokens = null, // int64[batch_size,audio_sequence_length] - speaker_embeddings = null, // float32[batch_size,192] - speaker_features = null, // float32[batch_size,feature_dim,80] - - // TODO: needed? - ...kwargs - }) { - let speech_encoder_outputs; - if (!inputs_embeds) { - const expected_inputs = this.sessions['embed_tokens'].inputNames; - const embed_model_inputs = { input_ids }; - if (expected_inputs.includes('exaggeration')) { - // Support the following types for exaggeration: - // 1. null/undefined (no exaggeration): use the default of 0.5 - // 2. number: broadcast to (batch_size,) - // 3. number[]: convert to Tensor of shape (batch_size,) - // 4. 
Tensor of shape (batch_size, 1) - if (!(exaggeration instanceof Tensor)) { - const batch_size = input_ids.dims[0]; - if (exaggeration == null) { - exaggeration = full([batch_size], 0.5); - } else if (typeof exaggeration === 'number') { - exaggeration = full([batch_size], exaggeration); - } else if (Array.isArray(exaggeration)) { - exaggeration = new Tensor('float32', exaggeration, [batch_size]); - } else { - throw new Error('Unsupported type for `exaggeration` input'); - } - } - embed_model_inputs.exaggeration = exaggeration; - } - if (expected_inputs.includes('position_ids')) { - embed_model_inputs.position_ids = position_ids; - } - - ({ inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], embed_model_inputs)); - - if (audio_features && audio_tokens && speaker_embeddings && speaker_features) { - // Use pre-computed speech encoder outputs - speech_encoder_outputs = { audio_features, audio_tokens, speaker_embeddings, speaker_features }; - } - - if (speech_encoder_outputs || audio_values) { - speech_encoder_outputs ??= await this.encode_speech(audio_values); - - // Update LLM inputs - inputs_embeds = cat([speech_encoder_outputs.audio_features, inputs_embeds], 1); - attention_mask = ones([inputs_embeds.dims[0], inputs_embeds.dims[1]]); - } else { - const target_length = inputs_embeds.dims[1]; - if (!past_key_values || target_length !== 1) { - throw new Error('Incorrect state encountered during generation.'); - } - const past_length = Object.values(past_key_values)[0].dims.at(-2); - attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]); - } - } - - const outputs = await decoderForward( - this, - { - inputs_embeds, - past_key_values, - attention_mask, - generation_config, - logits_processor, - }, - false, - ); - return { - ...outputs, - ...speech_encoder_outputs, - }; - } - - /** @type {PreTrainedModel['generate']} */ - async generate(params) { - const { sequences, audio_tokens, speaker_embeddings, speaker_features } = /** @type {any} */ ( - await super.generate({ - ...params, - return_dict_in_generate: true, - }) - ); - - const new_tokens = sequences.slice(null, [ - params.input_ids.dims[1], // Exclude start of speech token - -1, // Exclude end of speech token - ]); - - const SILENCE_TOKEN = 4299n; - const silence_tokens = full([new_tokens.dims[0], 3], SILENCE_TOKEN); // Add 3 silence tokens - const speech_tokens = cat([audio_tokens, new_tokens, silence_tokens], 1); - - const { waveform } = await sessionRun(this.sessions['conditional_decoder'], { - speech_tokens, - speaker_features, - speaker_embeddings, - }); - return waveform; - } -} +// Re-export all model classes from registry +export * from './models/registry.js'; -////////////////////////////////////////////////// -// AutoModels, used to simplify construction of PreTrainedModels -// (uses config to instantiate correct class) +import { + CUSTOM_ARCHITECTURES, + MODEL_CLASS_TYPE_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_MASKED_LM_MAPPING_NAMES, + MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, + 
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, + MODEL_FOR_CTC_MAPPING_NAMES, + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, + MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, + MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, + MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, +} from './models/registry.js'; /** * Base class of all AutoModels. Contains the `from_pretrained` function @@ -8174,583 +157,6 @@ export class PretrainedMixin { } } -const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ - ['bert', ['BertModel', BertModel]], - ['neobert', ['NeoBertModel', NeoBertModel]], - ['modernbert', ['ModernBertModel', ModernBertModel]], - ['nomic_bert', ['NomicBertModel', NomicBertModel]], - ['roformer', ['RoFormerModel', RoFormerModel]], - ['electra', ['ElectraModel', ElectraModel]], - ['esm', ['EsmModel', EsmModel]], - ['convbert', ['ConvBertModel', ConvBertModel]], - ['camembert', ['CamembertModel', CamembertModel]], - ['deberta', ['DebertaModel', DebertaModel]], - ['deberta-v2', ['DebertaV2Model', DebertaV2Model]], - ['mpnet', ['MPNetModel', MPNetModel]], - ['albert', ['AlbertModel', AlbertModel]], - ['distilbert', ['DistilBertModel', DistilBertModel]], - ['roberta', ['RobertaModel', RobertaModel]], - ['xlm', ['XLMModel', XLMModel]], - ['xlm-roberta', ['XLMRobertaModel', XLMRobertaModel]], - ['clap', ['ClapModel', ClapModel]], - ['clip', ['CLIPModel', CLIPModel]], - ['clipseg', ['CLIPSegModel', CLIPSegModel]], - ['chinese_clip', ['ChineseCLIPModel', ChineseCLIPModel]], - ['siglip', ['SiglipModel', SiglipModel]], - ['jina_clip', ['JinaCLIPModel', JinaCLIPModel]], - ['mobilebert', ['MobileBertModel', MobileBertModel]], - ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]], - ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]], - ['wav2vec2-bert', ['Wav2Vec2BertModel', Wav2Vec2BertModel]], - ['unispeech', ['UniSpeechModel', UniSpeechModel]], - ['unispeech-sat', ['UniSpeechSatModel', UniSpeechSatModel]], - ['hubert', ['HubertModel', HubertModel]], - ['wavlm', ['WavLMModel', WavLMModel]], - ['audio-spectrogram-transformer', ['ASTModel', ASTModel]], - ['vits', ['VitsModel', VitsModel]], - ['pyannote', ['PyAnnoteModel', PyAnnoteModel]], - ['wespeaker-resnet', ['WeSpeakerResNetModel', WeSpeakerResNetModel]], - - ['detr', ['DetrModel', DetrModel]], - ['rt_detr', ['RTDetrModel', RTDetrModel]], - ['rt_detr_v2', ['RTDetrV2Model', RTDetrV2Model]], - ['rf_detr', ['RFDetrModel', RFDetrModel]], - ['d_fine', ['DFineModel', DFineModel]], - ['table-transformer', ['TableTransformerModel', TableTransformerModel]], - ['vit', ['ViTModel', ViTModel]], - ['ijepa', ['IJepaModel', IJepaModel]], - ['pvt', ['PvtModel', PvtModel]], - ['vit_msn', ['ViTMSNModel', ViTMSNModel]], - ['vit_mae', ['ViTMAEModel', ViTMAEModel]], - ['groupvit', ['GroupViTModel', GroupViTModel]], - ['fastvit', ['FastViTModel', FastViTModel]], - ['mobilevit', ['MobileViTModel', MobileViTModel]], - ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]], - ['owlvit', ['OwlViTModel', OwlViTModel]], - ['owlv2', ['Owlv2Model', Owlv2Model]], - ['beit', ['BeitModel', 
BeitModel]], - ['deit', ['DeiTModel', DeiTModel]], - ['hiera', ['HieraModel', HieraModel]], - ['convnext', ['ConvNextModel', ConvNextModel]], - ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]], - ['dinov2', ['Dinov2Model', Dinov2Model]], - ['dinov2_with_registers', ['Dinov2WithRegistersModel', Dinov2WithRegistersModel]], - ['dinov3_vit', ['DINOv3ViTModel', DINOv3ViTModel]], - ['dinov3_convnext', ['DINOv3ConvNextModel', DINOv3ConvNextModel]], - ['resnet', ['ResNetModel', ResNetModel]], - ['swin', ['SwinModel', SwinModel]], - ['swin2sr', ['Swin2SRModel', Swin2SRModel]], - ['donut-swin', ['DonutSwinModel', DonutSwinModel]], - ['yolos', ['YolosModel', YolosModel]], - ['dpt', ['DPTModel', DPTModel]], - ['glpn', ['GLPNModel', GLPNModel]], - - ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]], - ['efficientnet', ['EfficientNetModel', EfficientNetModel]], - - ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]], - ['patchtst', ['PatchTSTForPrediction', PatchTSTModel]], - ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerModel]], - - ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]], - ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]], - ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]], - ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]], - - ['maskformer', ['MaskFormerModel', MaskFormerModel]], - ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]], - - ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]], -]); - -const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([ - ['t5', ['T5Model', T5Model]], - ['longt5', ['LongT5Model', LongT5Model]], - ['mt5', ['MT5Model', MT5Model]], - ['bart', ['BartModel', BartModel]], - ['mbart', ['MBartModel', MBartModel]], - ['marian', ['MarianModel', MarianModel]], - ['whisper', ['WhisperModel', WhisperModel]], - ['m2m_100', ['M2M100Model', M2M100Model]], - ['blenderbot', ['BlenderbotModel', BlenderbotModel]], - ['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]], -]); - -const MODEL_MAPPING_NAMES_AUTO_ENCODER = new Map([ - ['mimi', ['MimiModel', MimiModel]], - ['dac', ['DacModel', DacModel]], - ['snac', ['SnacModel', SnacModel]], -]); - -const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([ - ['bloom', ['BloomModel', BloomModel]], - ['jais', ['JAISModel', JAISModel]], - ['gpt2', ['GPT2Model', GPT2Model]], - ['gpt_oss', ['GptOssModel', GptOssModel]], - ['gptj', ['GPTJModel', GPTJModel]], - ['gpt_bigcode', ['GPTBigCodeModel', GPTBigCodeModel]], - ['gpt_neo', ['GPTNeoModel', GPTNeoModel]], - ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]], - ['codegen', ['CodeGenModel', CodeGenModel]], - ['llama', ['LlamaModel', LlamaModel]], - ['apertus', ['ApertusModel', ApertusModel]], - ['nanochat', ['NanoChatModel', NanoChatModel]], - ['arcee', ['ArceeModel', ArceeModel]], - ['lfm2', ['Lfm2Model', Lfm2Model]], - ['smollm3', ['SmolLM3Model', SmolLM3Model]], - ['exaone', ['ExaoneModel', ExaoneModel]], - ['olmo', ['OlmoModel', OlmoModel]], - ['olmo2', ['Olmo2Model', Olmo2Model]], - ['olmo3', ['Olmo3Model', Olmo3Model]], - ['mobilellm', ['MobileLLMModel', MobileLLMModel]], - ['granite', ['GraniteModel', GraniteModel]], - ['granitemoehybrid', ['GraniteMoeHybridModel', GraniteMoeHybridModel]], - ['cohere', ['CohereModel', CohereModel]], - ['gemma', ['GemmaModel', GemmaModel]], - ['gemma2', ['Gemma2Model', Gemma2Model]], - ['vaultgemma', ['VaultGemmaModel', VaultGemmaModel]], - ['gemma3_text', ['Gemma3Model', Gemma3Model]], - ['helium', ['HeliumModel', 
HeliumModel]], - ['glm', ['GlmModel', GlmModel]], - ['openelm', ['OpenELMModel', OpenELMModel]], - ['qwen2', ['Qwen2Model', Qwen2Model]], - ['qwen3', ['Qwen3Model', Qwen3Model]], - ['phi', ['PhiModel', PhiModel]], - ['phi3', ['Phi3Model', Phi3Model]], - ['mpt', ['MptModel', MptModel]], - ['opt', ['OPTModel', OPTModel]], - ['mistral', ['MistralModel', MistralModel]], - ['ernie4_5', ['Ernie4_5_Model', Ernie4_5_Model]], - ['starcoder2', ['Starcoder2Model', Starcoder2Model]], - ['falcon', ['FalconModel', FalconModel]], - ['stablelm', ['StableLmModel', StableLmModel]], - ['modernbert-decoder', ['ModernBertDecoderModel', ModernBertDecoderModel]], -]); - -const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([ - ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]], - ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], - ['lite-whisper', ['LiteWhisperForConditionalGeneration', LiteWhisperForConditionalGeneration]], - ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]], -]); - -const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([ - ['speecht5', ['SpeechT5ForTextToSpeech', SpeechT5ForTextToSpeech]], -]); - -const MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = new Map([ - ['vits', ['VitsModel', VitsModel]], - ['musicgen', ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration]], - ['supertonic', ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration]], -]); - -const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['bert', ['BertForSequenceClassification', BertForSequenceClassification]], - ['neobert', ['NeoBertForSequenceClassification', NeoBertForSequenceClassification]], - ['modernbert', ['ModernBertForSequenceClassification', ModernBertForSequenceClassification]], - ['roformer', ['RoFormerForSequenceClassification', RoFormerForSequenceClassification]], - ['electra', ['ElectraForSequenceClassification', ElectraForSequenceClassification]], - ['esm', ['EsmForSequenceClassification', EsmForSequenceClassification]], - ['convbert', ['ConvBertForSequenceClassification', ConvBertForSequenceClassification]], - ['camembert', ['CamembertForSequenceClassification', CamembertForSequenceClassification]], - ['deberta', ['DebertaForSequenceClassification', DebertaForSequenceClassification]], - ['deberta-v2', ['DebertaV2ForSequenceClassification', DebertaV2ForSequenceClassification]], - ['mpnet', ['MPNetForSequenceClassification', MPNetForSequenceClassification]], - ['albert', ['AlbertForSequenceClassification', AlbertForSequenceClassification]], - ['distilbert', ['DistilBertForSequenceClassification', DistilBertForSequenceClassification]], - ['roberta', ['RobertaForSequenceClassification', RobertaForSequenceClassification]], - ['xlm', ['XLMForSequenceClassification', XLMForSequenceClassification]], - ['xlm-roberta', ['XLMRobertaForSequenceClassification', XLMRobertaForSequenceClassification]], - ['bart', ['BartForSequenceClassification', BartForSequenceClassification]], - ['mbart', ['MBartForSequenceClassification', MBartForSequenceClassification]], - ['mobilebert', ['MobileBertForSequenceClassification', MobileBertForSequenceClassification]], - ['squeezebert', ['SqueezeBertForSequenceClassification', SqueezeBertForSequenceClassification]], -]); - -const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['bert', ['BertForTokenClassification', BertForTokenClassification]], - ['neobert', ['NeoBertForTokenClassification', NeoBertForTokenClassification]], - ['modernbert', 
['ModernBertForTokenClassification', ModernBertForTokenClassification]], - ['roformer', ['RoFormerForTokenClassification', RoFormerForTokenClassification]], - ['electra', ['ElectraForTokenClassification', ElectraForTokenClassification]], - ['esm', ['EsmForTokenClassification', EsmForTokenClassification]], - ['convbert', ['ConvBertForTokenClassification', ConvBertForTokenClassification]], - ['camembert', ['CamembertForTokenClassification', CamembertForTokenClassification]], - ['deberta', ['DebertaForTokenClassification', DebertaForTokenClassification]], - ['deberta-v2', ['DebertaV2ForTokenClassification', DebertaV2ForTokenClassification]], - ['mpnet', ['MPNetForTokenClassification', MPNetForTokenClassification]], - ['distilbert', ['DistilBertForTokenClassification', DistilBertForTokenClassification]], - ['roberta', ['RobertaForTokenClassification', RobertaForTokenClassification]], - ['xlm', ['XLMForTokenClassification', XLMForTokenClassification]], - ['xlm-roberta', ['XLMRobertaForTokenClassification', XLMRobertaForTokenClassification]], -]); - -const MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = new Map([ - ['t5', ['T5ForConditionalGeneration', T5ForConditionalGeneration]], - ['longt5', ['LongT5ForConditionalGeneration', LongT5ForConditionalGeneration]], - ['mt5', ['MT5ForConditionalGeneration', MT5ForConditionalGeneration]], - ['bart', ['BartForConditionalGeneration', BartForConditionalGeneration]], - ['mbart', ['MBartForConditionalGeneration', MBartForConditionalGeneration]], - ['marian', ['MarianMTModel', MarianMTModel]], - ['m2m_100', ['M2M100ForConditionalGeneration', M2M100ForConditionalGeneration]], - ['blenderbot', ['BlenderbotForConditionalGeneration', BlenderbotForConditionalGeneration]], - ['blenderbot-small', ['BlenderbotSmallForConditionalGeneration', BlenderbotSmallForConditionalGeneration]], -]); - -const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([ - ['bloom', ['BloomForCausalLM', BloomForCausalLM]], - ['gpt2', ['GPT2LMHeadModel', GPT2LMHeadModel]], - ['gpt_oss', ['GptOssForCausalLM', GptOssForCausalLM]], - ['jais', ['JAISLMHeadModel', JAISLMHeadModel]], - ['gptj', ['GPTJForCausalLM', GPTJForCausalLM]], - ['gpt_bigcode', ['GPTBigCodeForCausalLM', GPTBigCodeForCausalLM]], - ['gpt_neo', ['GPTNeoForCausalLM', GPTNeoForCausalLM]], - ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]], - ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]], - ['llama', ['LlamaForCausalLM', LlamaForCausalLM]], - ['nanochat', ['NanoChatForCausalLM', NanoChatForCausalLM]], - ['apertus', ['ApertusForCausalLM', ApertusForCausalLM]], - ['llama4_text', ['Llama4ForCausalLM', Llama4ForCausalLM]], - ['arcee', ['ArceeForCausalLM', ArceeForCausalLM]], - ['lfm2', ['Lfm2ForCausalLM', Lfm2ForCausalLM]], - ['smollm3', ['SmolLM3ForCausalLM', SmolLM3ForCausalLM]], - ['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]], - ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]], - ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]], - ['olmo3', ['Olmo3ForCausalLM', Olmo3ForCausalLM]], - ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]], - ['granite', ['GraniteForCausalLM', GraniteForCausalLM]], - ['granitemoehybrid', ['GraniteMoeHybridForCausalLM', GraniteMoeHybridForCausalLM]], - ['cohere', ['CohereForCausalLM', CohereForCausalLM]], - ['gemma', ['GemmaForCausalLM', GemmaForCausalLM]], - ['gemma2', ['Gemma2ForCausalLM', Gemma2ForCausalLM]], - ['vaultgemma', ['VaultGemmaForCausalLM', VaultGemmaForCausalLM]], - ['gemma3_text', ['Gemma3ForCausalLM', Gemma3ForCausalLM]], - ['helium', ['HeliumForCausalLM', 
HeliumForCausalLM]], - ['glm', ['GlmForCausalLM', GlmForCausalLM]], - ['openelm', ['OpenELMForCausalLM', OpenELMForCausalLM]], - ['qwen2', ['Qwen2ForCausalLM', Qwen2ForCausalLM]], - ['qwen3', ['Qwen3ForCausalLM', Qwen3ForCausalLM]], - ['phi', ['PhiForCausalLM', PhiForCausalLM]], - ['phi3', ['Phi3ForCausalLM', Phi3ForCausalLM]], - ['mpt', ['MptForCausalLM', MptForCausalLM]], - ['opt', ['OPTForCausalLM', OPTForCausalLM]], - ['mbart', ['MBartForCausalLM', MBartForCausalLM]], - ['mistral', ['MistralForCausalLM', MistralForCausalLM]], - ['ernie4_5', ['Ernie4_5_ForCausalLM', Ernie4_5_ForCausalLM]], - ['starcoder2', ['Starcoder2ForCausalLM', Starcoder2ForCausalLM]], - ['falcon', ['FalconForCausalLM', FalconForCausalLM]], - ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]], - ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]], - ['modernbert-decoder', ['ModernBertDecoderForCausalLM', ModernBertDecoderForCausalLM]], - - // Also image-text-to-text - ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]], -]); - -const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([ - ['multi_modality', ['MultiModalityCausalLM', MultiModalityCausalLM]], -]); - -const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([ - ['bert', ['BertForMaskedLM', BertForMaskedLM]], - ['neobert', ['NeoBertForMaskedLM', NeoBertForMaskedLM]], - ['modernbert', ['ModernBertForMaskedLM', ModernBertForMaskedLM]], - ['roformer', ['RoFormerForMaskedLM', RoFormerForMaskedLM]], - ['electra', ['ElectraForMaskedLM', ElectraForMaskedLM]], - ['esm', ['EsmForMaskedLM', EsmForMaskedLM]], - ['convbert', ['ConvBertForMaskedLM', ConvBertForMaskedLM]], - ['camembert', ['CamembertForMaskedLM', CamembertForMaskedLM]], - ['deberta', ['DebertaForMaskedLM', DebertaForMaskedLM]], - ['deberta-v2', ['DebertaV2ForMaskedLM', DebertaV2ForMaskedLM]], - ['mpnet', ['MPNetForMaskedLM', MPNetForMaskedLM]], - ['albert', ['AlbertForMaskedLM', AlbertForMaskedLM]], - ['distilbert', ['DistilBertForMaskedLM', DistilBertForMaskedLM]], - ['roberta', ['RobertaForMaskedLM', RobertaForMaskedLM]], - ['xlm', ['XLMWithLMHeadModel', XLMWithLMHeadModel]], - ['xlm-roberta', ['XLMRobertaForMaskedLM', XLMRobertaForMaskedLM]], - ['mobilebert', ['MobileBertForMaskedLM', MobileBertForMaskedLM]], - ['squeezebert', ['SqueezeBertForMaskedLM', SqueezeBertForMaskedLM]], -]); - -const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ - ['bert', ['BertForQuestionAnswering', BertForQuestionAnswering]], - ['neobert', ['NeoBertForQuestionAnswering', NeoBertForQuestionAnswering]], - ['roformer', ['RoFormerForQuestionAnswering', RoFormerForQuestionAnswering]], - ['electra', ['ElectraForQuestionAnswering', ElectraForQuestionAnswering]], - ['convbert', ['ConvBertForQuestionAnswering', ConvBertForQuestionAnswering]], - ['camembert', ['CamembertForQuestionAnswering', CamembertForQuestionAnswering]], - ['deberta', ['DebertaForQuestionAnswering', DebertaForQuestionAnswering]], - ['deberta-v2', ['DebertaV2ForQuestionAnswering', DebertaV2ForQuestionAnswering]], - ['mpnet', ['MPNetForQuestionAnswering', MPNetForQuestionAnswering]], - ['albert', ['AlbertForQuestionAnswering', AlbertForQuestionAnswering]], - ['distilbert', ['DistilBertForQuestionAnswering', DistilBertForQuestionAnswering]], - ['roberta', ['RobertaForQuestionAnswering', RobertaForQuestionAnswering]], - ['xlm', ['XLMForQuestionAnswering', XLMForQuestionAnswering]], - ['xlm-roberta', ['XLMRobertaForQuestionAnswering', XLMRobertaForQuestionAnswering]], - ['mobilebert', ['MobileBertForQuestionAnswering', MobileBertForQuestionAnswering]], - 
['squeezebert', ['SqueezeBertForQuestionAnswering', SqueezeBertForQuestionAnswering]], -]); - -const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([ - ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], - ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], - ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], -]); - -const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ - ['llava', ['LlavaForConditionalGeneration', LlavaForConditionalGeneration]], - ['llava_onevision', ['LlavaOnevisionForConditionalGeneration', LlavaOnevisionForConditionalGeneration]], - ['moondream1', ['Moondream1ForConditionalGeneration', Moondream1ForConditionalGeneration]], - ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]], - ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]], - ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], - ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], - ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]], - ['llava_qwen2', ['LlavaQwen2ForCausalLM', LlavaQwen2ForCausalLM]], - ['gemma3n', ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration]], -]); - -const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ - ['ultravox', ['UltravoxModel', UltravoxModel]], - ['voxtral', ['VoxtralForConditionalGeneration', VoxtralForConditionalGeneration]], -]); - -const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ - ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], -]); - -const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['vit', ['ViTForImageClassification', ViTForImageClassification]], - ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]], - ['pvt', ['PvtForImageClassification', PvtForImageClassification]], - ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]], - ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]], - ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]], - ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]], - ['beit', ['BeitForImageClassification', BeitForImageClassification]], - ['deit', ['DeiTForImageClassification', DeiTForImageClassification]], - ['hiera', ['HieraForImageClassification', HieraForImageClassification]], - ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]], - ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]], - ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]], - ['dinov2_with_registers', ['Dinov2WithRegistersForImageClassification', Dinov2WithRegistersForImageClassification]], - ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]], - ['swin', ['SwinForImageClassification', SwinForImageClassification]], - ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]], - ['efficientnet', ['EfficientNetForImageClassification', EfficientNetForImageClassification]], - ['mobilenet_v1', ['MobileNetV1ForImageClassification', MobileNetV1ForImageClassification]], - ['mobilenet_v2', ['MobileNetV2ForImageClassification', MobileNetV2ForImageClassification]], - ['mobilenet_v3', ['MobileNetV3ForImageClassification', MobileNetV3ForImageClassification]], 
- ['mobilenet_v4', ['MobileNetV4ForImageClassification', MobileNetV4ForImageClassification]], -]); - -const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([ - ['detr', ['DetrForObjectDetection', DetrForObjectDetection]], - ['rt_detr', ['RTDetrForObjectDetection', RTDetrForObjectDetection]], - ['rt_detr_v2', ['RTDetrV2ForObjectDetection', RTDetrV2ForObjectDetection]], - ['rf_detr', ['RFDetrForObjectDetection', RFDetrForObjectDetection]], - ['d_fine', ['DFineForObjectDetection', DFineForObjectDetection]], - ['table-transformer', ['TableTransformerForObjectDetection', TableTransformerForObjectDetection]], - ['yolos', ['YolosForObjectDetection', YolosForObjectDetection]], -]); - -const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([ - ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]], - ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]], - ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]], -]); - -const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([ - // TODO: Do not add new models here - ['detr', ['DetrForSegmentation', DetrForSegmentation]], - ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]], -]); - -const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([ - ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]], - ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]], - - ['swin', ['SwinForSemanticSegmentation', SwinForSemanticSegmentation]], - ['mobilenet_v1', ['MobileNetV1ForSemanticSegmentation', MobileNetV1ForSemanticSegmentation]], - ['mobilenet_v2', ['MobileNetV2ForSemanticSegmentation', MobileNetV2ForSemanticSegmentation]], - ['mobilenet_v3', ['MobileNetV3ForSemanticSegmentation', MobileNetV3ForSemanticSegmentation]], - ['mobilenet_v4', ['MobileNetV4ForSemanticSegmentation', MobileNetV4ForSemanticSegmentation]], -]); - -const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([ - ['detr', ['DetrForSegmentation', DetrForSegmentation]], - ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]], -]); - -const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([ - ['sam', ['SamModel', SamModel]], - ['sam2', ['Sam2Model', Sam2Model]], - ['edgetam', ['EdgeTamModel', EdgeTamModel]], - ['sam3_tracker', ['Sam3TrackerModel', Sam3TrackerModel]], -]); - -const MODEL_FOR_CTC_MAPPING_NAMES = new Map([ - ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]], - ['wav2vec2-bert', ['Wav2Vec2BertForCTC', Wav2Vec2BertForCTC]], - ['unispeech', ['UniSpeechForCTC', UniSpeechForCTC]], - ['unispeech-sat', ['UniSpeechSatForCTC', UniSpeechSatForCTC]], - ['wavlm', ['WavLMForCTC', WavLMForCTC]], - ['hubert', ['HubertForCTC', HubertForCTC]], - ['parakeet_ctc', ['ParakeetForCTC', ParakeetForCTC]], -]); - -const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['wav2vec2', ['Wav2Vec2ForSequenceClassification', Wav2Vec2ForSequenceClassification]], - ['wav2vec2-bert', ['Wav2Vec2BertForSequenceClassification', Wav2Vec2BertForSequenceClassification]], - ['unispeech', ['UniSpeechForSequenceClassification', UniSpeechForSequenceClassification]], - ['unispeech-sat', ['UniSpeechSatForSequenceClassification', UniSpeechSatForSequenceClassification]], - ['wavlm', ['WavLMForSequenceClassification', WavLMForSequenceClassification]], - ['hubert', ['HubertForSequenceClassification', HubertForSequenceClassification]], - ['audio-spectrogram-transformer', ['ASTForAudioClassification', 
ASTForAudioClassification]], -]); - -const MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = new Map([['wavlm', ['WavLMForXVector', WavLMForXVector]]]); - -const MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['unispeech-sat', ['UniSpeechSatForAudioFrameClassification', UniSpeechSatForAudioFrameClassification]], - ['wavlm', ['WavLMForAudioFrameClassification', WavLMForAudioFrameClassification]], - ['wav2vec2', ['Wav2Vec2ForAudioFrameClassification', Wav2Vec2ForAudioFrameClassification]], - ['pyannote', ['PyAnnoteForAudioFrameClassification', PyAnnoteForAudioFrameClassification]], -]); - -const MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([ - ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]], -]); - -const MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = new Map([ - ['patchtst', ['PatchTSTForPrediction', PatchTSTForPrediction]], - ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerForPrediction]], -]); - -const MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = new Map([ - ['swin2sr', ['Swin2SRForImageSuperResolution', Swin2SRForImageSuperResolution]], -]); - -const MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = new Map([ - ['dpt', ['DPTForDepthEstimation', DPTForDepthEstimation]], - ['depth_anything', ['DepthAnythingForDepthEstimation', DepthAnythingForDepthEstimation]], - ['glpn', ['GLPNForDepthEstimation', GLPNForDepthEstimation]], - ['sapiens', ['SapiensForDepthEstimation', SapiensForDepthEstimation]], - ['depth_pro', ['DepthProForDepthEstimation', DepthProForDepthEstimation]], - ['metric3d', ['Metric3DForDepthEstimation', Metric3DForDepthEstimation]], - ['metric3dv2', ['Metric3Dv2ForDepthEstimation', Metric3Dv2ForDepthEstimation]], -]); - -const MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES = new Map([ - ['sapiens', ['SapiensForNormalEstimation', SapiensForNormalEstimation]], -]); - -const MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES = new Map([ - ['vitpose', ['VitPoseForPoseEstimation', VitPoseForPoseEstimation]], -]); - -// NOTE: This is custom to Transformers.js, and is necessary because certain models -// (e.g., CLIP) are split into vision and text components -const MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = new Map([ - ['clip', ['CLIPVisionModelWithProjection', CLIPVisionModelWithProjection]], - ['siglip', ['SiglipVisionModel', SiglipVisionModel]], - ['jina_clip', ['JinaCLIPVisionModel', JinaCLIPVisionModel]], -]); - -const MODEL_CLASS_TYPE_MAPPING = [ - // MODEL_MAPPING_NAMES: - [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly], - [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder], - [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly], - [MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_TYPES.AutoEncoder], - - [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.DecoderOnly], - [MODEL_FOR_MULTIMODALITY_MAPPING_NAMES, MODEL_TYPES.MultiModality], - [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq], - [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText], - [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.AudioTextToText], - [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, 
MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES.MaskGeneration], - [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - - // Custom: - [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], -]; - -for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) { - // @ts-ignore - for (const [name, model] of mappings.values()) { - MODEL_TYPE_MAPPING.set(name, type); - MODEL_CLASS_TO_NAME_MAPPING.set(model, name); - MODEL_NAME_TO_CLASS_MAPPING.set(name, model); - } -} - -const CUSTOM_MAPPING = [ - // OVERRIDE: - // TODO: Refactor to allow class to specify model - ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen], - ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V], - - ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly], - ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly], - ['JinaCLIPTextModel', JinaCLIPTextModel, MODEL_TYPES.EncoderOnly], - ['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly], - ['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly], - - ['DacEncoderModel', DacEncoderModel, MODEL_TYPES.EncoderOnly], - ['DacDecoderModel', DacDecoderModel, MODEL_TYPES.EncoderOnly], - ['MimiEncoderModel', MimiEncoderModel, MODEL_TYPES.EncoderOnly], - ['MimiDecoderModel', MimiDecoderModel, MODEL_TYPES.EncoderOnly], - ['SnacEncoderModel', SnacEncoderModel, MODEL_TYPES.EncoderOnly], - ['SnacDecoderModel', SnacDecoderModel, MODEL_TYPES.EncoderOnly], - - ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration, MODEL_TYPES.ImageAudioTextToText], - ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic], - ['ChatterboxModel', ChatterboxModel, MODEL_TYPES.Chatterbox], -]; -for (const [name, model, type] of CUSTOM_MAPPING) { - MODEL_TYPE_MAPPING.set(name, type); - MODEL_CLASS_TO_NAME_MAPPING.set(model, name); - MODEL_NAME_TO_CLASS_MAPPING.set(name, model); -} - -const CUSTOM_ARCHITECTURES = new Map([ - ['modnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], - ['birefnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], - ['isnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], - ['ben', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], -]); -for (const [name, mapping] of 
CUSTOM_ARCHITECTURES.entries()) { - mapping.set(name, ['PreTrainedModel', PreTrainedModel]); - MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly); - MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, name); - MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel); -} - /** * Helper class which is used to instantiate pretrained models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. @@ -8997,164 +403,5 @@ export class AutoModelForAudioTextToText extends PretrainedMixin { static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES]; } -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Seq2SeqLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. - * @param {Tensor} output.past_key_values An tensor of key/value pairs that represent the previous state of the model. - * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. - * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - */ - constructor({ logits, past_key_values, encoder_outputs, decoder_attentions = null, cross_attentions = null }) { - super(); - this.logits = logits; - this.past_key_values = past_key_values; - this.encoder_outputs = encoder_outputs; - this.decoder_attentions = decoder_attentions; - this.cross_attentions = cross_attentions; - } -} - -/** - * Base class for outputs of sentence classification models. - */ -export class SequenceClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). - * @param {Record} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. - * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - */ - constructor({ logits, ...attentions }) { - super(); - this.logits = logits; - const attentions_list = Object.values(attentions); - if (attentions_list.length > 0) { - // Only set attentions if they are not empty - this.attentions = attentions_list; - } - } -} - -/** - * Base class for outputs of XVector models. - */ -export class XVectorOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification hidden states before AMSoftmax, of shape `(batch_size, config.xvector_output_dim)`. - * @param {Tensor} output.embeddings Utterance embeddings used for vector similarity-based retrieval, of shape `(batch_size, config.xvector_output_dim)`. - */ - constructor({ logits, embeddings }) { - super(); - this.logits = logits; - this.embeddings = embeddings; - } -} - -/** - * Base class for outputs of token classification models. - */ -export class TokenClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification scores (before SoftMax). 
- */ - constructor({ logits }) { - super(); - this.logits = logits; - } -} - -/** - * Base class for masked language models outputs. - */ -export class MaskedLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } -} - -/** - * Base class for outputs of question answering models. - */ -export class QuestionAnsweringModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.start_logits Span-start scores (before SoftMax). - * @param {Tensor} output.end_logits Span-end scores (before SoftMax). - */ - constructor({ start_logits, end_logits }) { - super(); - this.start_logits = start_logits; - this.end_logits = end_logits; - } -} - -/** - * Base class for causal language model (or autoregressive) outputs. - */ -export class CausalLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } -} - -/** - * Base class for causal language model (or autoregressive) outputs. - */ -export class CausalLMOutputWithPast extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). - * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) - * that can be used (see `past_key_values` input) to speed up sequential decoding. - */ - constructor({ logits, past_key_values }) { - super(); - this.logits = logits; - this.past_key_values = past_key_values; - } -} - -export class ImageMattingOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. - */ - constructor({ alphas }) { - super(); - this.alphas = alphas; - } -} - -/** - * Describes the outputs for the VITS model. - */ -export class VitsModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. - * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. - * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. 
- */ - constructor({ waveform, spectrogram }) { - super(); - this.waveform = waveform; - this.spectrogram = spectrogram; - } -} +// Re-export PreTrainedModel for backwards compatibility +export { PreTrainedModel }; diff --git a/src/models/beit/image_processing_beit.js b/src/models/beit/image_processing_beit.js deleted file mode 100644 index 5b952a248..000000000 --- a/src/models/beit/image_processing_beit.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class BeitFeatureExtractor extends ImageProcessor {} diff --git a/src/models/bit/image_processing_bit.js b/src/models/bit/image_processing_bit.js deleted file mode 100644 index 7a59a3987..000000000 --- a/src/models/bit/image_processing_bit.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class BitImageProcessor extends ImageProcessor {} diff --git a/src/models/chinese_clip/image_processing_chinese_clip.js b/src/models/chinese_clip/image_processing_chinese_clip.js deleted file mode 100644 index 3feed9f62..000000000 --- a/src/models/chinese_clip/image_processing_chinese_clip.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class ChineseCLIPFeatureExtractor extends ImageProcessor {} diff --git a/src/models/dinov3_vit/image_processing_dinov3_vit.js b/src/models/dinov3_vit/image_processing_dinov3_vit.js deleted file mode 100644 index abf5fac51..000000000 --- a/src/models/dinov3_vit/image_processing_dinov3_vit.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class DINOv3ViTImageProcessor extends ImageProcessor {} diff --git a/src/models/feature-extractors.js b/src/models/feature-extractors.js new file mode 100644 index 000000000..54a071d3f --- /dev/null +++ b/src/models/feature-extractors.js @@ -0,0 +1,18 @@ +export * from './model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js'; +export * from './model-processors/encodec/feature_extraction_encodec.js'; +export * from './model-processors/chatterbox/feature_extraction_chatterbox.js'; +export * from './model-processors/clap/feature_extraction_clap.js'; +export * from './model-processors/dac/feature_extraction_dac.js'; +export * from './model-processors/gemma3n/feature_extraction_gemma3n.js'; +export * from './model-processors/moonshine/feature_extraction_moonshine.js'; +export * from './model-processors/parakeet/feature_extraction_parakeet.js'; +export * from './model-processors/pyannote/feature_extraction_pyannote.js'; +export * from './model-processors/seamless_m4t/feature_extraction_seamless_m4t.js'; +export * from './model-processors/snac/feature_extraction_snac.js'; +export * from './model-processors/speecht5/feature_extraction_speecht5.js'; +export * from './model-processors/wav2vec2/feature_extraction_wav2vec2.js'; +export * from './model-processors/wespeaker/feature_extraction_wespeaker.js'; +export * from './model-processors/whisper/feature_extraction_whisper.js'; + +// For legacy support, ImageFeatureExtractor is an alias for ImageProcessor +export { ImageProcessor as ImageFeatureExtractor } from '../base/image_processors_utils.js'; diff --git a/src/models/feature_extractors.js b/src/models/feature_extractors.js deleted file mode 100644 index 2d19945ae..000000000 --- a/src/models/feature_extractors.js +++ /dev/null @@ -1,18 +0,0 @@ -export * from 
'./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js'; -export * from './encodec/feature_extraction_encodec.js'; -export * from './chatterbox/feature_extraction_chatterbox.js'; -export * from './clap/feature_extraction_clap.js'; -export * from './dac/feature_extraction_dac.js'; -export * from './gemma3n/feature_extraction_gemma3n.js'; -export * from './moonshine/feature_extraction_moonshine.js'; -export * from './parakeet/feature_extraction_parakeet.js'; -export * from './pyannote/feature_extraction_pyannote.js'; -export * from './seamless_m4t/feature_extraction_seamless_m4t.js'; -export * from './snac/feature_extraction_snac.js'; -export * from './speecht5/feature_extraction_speecht5.js'; -export * from './wav2vec2/feature_extraction_wav2vec2.js'; -export * from './wespeaker/feature_extraction_wespeaker.js'; -export * from './whisper/feature_extraction_whisper.js'; - -// For legacy support, ImageFeatureExtractor is an alias for ImageProcessor -export { ImageProcessor as ImageFeatureExtractor } from '../base/image_processors_utils.js'; diff --git a/src/models/glpn/image_processing_glpn.js b/src/models/glpn/image_processing_glpn.js deleted file mode 100644 index dd6b0ad4f..000000000 --- a/src/models/glpn/image_processing_glpn.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class GLPNFeatureExtractor extends ImageProcessor {} diff --git a/src/models/image-processors.js b/src/models/image-processors.js new file mode 100644 index 000000000..fb60330db --- /dev/null +++ b/src/models/image-processors.js @@ -0,0 +1,42 @@ +export * from './model-processors/beit/image_processing_beit.js'; +export * from './model-processors/bit/image_processing_bit.js'; +export * from './model-processors/chinese_clip/image_processing_chinese_clip.js'; +export * from './model-processors/clip/image_processing_clip.js'; +export * from './model-processors/convnext/image_processing_convnext.js'; +export * from './model-processors/deit/image_processing_deit.js'; +export * from './model-processors/detr/image_processing_detr.js'; +export * from './model-processors/dinov3_vit/image_processing_dinov3_vit.js'; +export * from './model-processors/donut/image_processing_donut.js'; +export * from './model-processors/dpt/image_processing_dpt.js'; +export * from './model-processors/efficientnet/image_processing_efficientnet.js'; +export * from './model-processors/glpn/image_processing_glpn.js'; +export * from './model-processors/grounding_dino/image_processing_grounding_dino.js'; +export * from './model-processors/idefics3/image_processing_idefics3.js'; +export * from './model-processors/janus/image_processing_janus.js'; +export * from './model-processors/jina_clip/image_processing_jina_clip.js'; +export * from './model-processors/llava_onevision/image_processing_llava_onevision.js'; +export * from './model-processors/mask2former/image_processing_mask2former.js'; +export * from './model-processors/maskformer/image_processing_maskformer.js'; +export * from './model-processors/mobilenet_v1/image_processing_mobilenet_v1.js'; +export * from './model-processors/mobilenet_v2/image_processing_mobilenet_v2.js'; +export * from './model-processors/mobilenet_v3/image_processing_mobilenet_v3.js'; +export * from './model-processors/mobilenet_v4/image_processing_mobilenet_v4.js'; +export * from './model-processors/mobilevit/image_processing_mobilevit.js'; +export * from './model-processors/nougat/image_processing_nougat.js'; +export * from 
'./model-processors/owlv2/image_processing_owlv2.js'; +export * from './model-processors/owlvit/image_processing_owlvit.js'; +export * from './model-processors/phi3_v/image_processing_phi3_v.js'; +export * from './model-processors/pvt/image_processing_pvt.js'; +export * from './model-processors/qwen2_vl/image_processing_qwen2_vl.js'; +export * from './model-processors/rt_detr/image_processing_rt_detr.js'; +export * from './model-processors/sam/image_processing_sam.js'; +export * from './model-processors/sam2/image_processing_sam2.js'; +export * from './model-processors/sam3/image_processing_sam3.js'; +export * from './model-processors/segformer/image_processing_segformer.js'; +export * from './model-processors/siglip/image_processing_siglip.js'; +export * from './model-processors/smolvlm/image_processing_smolvlm.js'; +export * from './model-processors/swin2sr/image_processing_swin2sr.js'; +export * from './model-processors/vit/image_processing_vit.js'; +export * from './model-processors/vitmatte/image_processing_vitmatte.js'; +export * from './model-processors/vitpose/image_processing_vitpose.js'; +export * from './model-processors/yolos/image_processing_yolos.js'; diff --git a/src/models/image_processors.js b/src/models/image_processors.js deleted file mode 100644 index ef58ba41a..000000000 --- a/src/models/image_processors.js +++ /dev/null @@ -1,42 +0,0 @@ -export * from './beit/image_processing_beit.js'; -export * from './bit/image_processing_bit.js'; -export * from './chinese_clip/image_processing_chinese_clip.js'; -export * from './clip/image_processing_clip.js'; -export * from './convnext/image_processing_convnext.js'; -export * from './deit/image_processing_deit.js'; -export * from './detr/image_processing_detr.js'; -export * from './dinov3_vit/image_processing_dinov3_vit.js'; -export * from './donut/image_processing_donut.js'; -export * from './dpt/image_processing_dpt.js'; -export * from './efficientnet/image_processing_efficientnet.js'; -export * from './glpn/image_processing_glpn.js'; -export * from './grounding_dino/image_processing_grounding_dino.js'; -export * from './idefics3/image_processing_idefics3.js'; -export * from './janus/image_processing_janus.js'; -export * from './jina_clip/image_processing_jina_clip.js'; -export * from './llava_onevision/image_processing_llava_onevision.js'; -export * from './mask2former/image_processing_mask2former.js'; -export * from './maskformer/image_processing_maskformer.js'; -export * from './mobilenet_v1/image_processing_mobilenet_v1.js'; -export * from './mobilenet_v2/image_processing_mobilenet_v2.js'; -export * from './mobilenet_v3/image_processing_mobilenet_v3.js'; -export * from './mobilenet_v4/image_processing_mobilenet_v4.js'; -export * from './mobilevit/image_processing_mobilevit.js'; -export * from './nougat/image_processing_nougat.js'; -export * from './owlv2/image_processing_owlv2.js'; -export * from './owlvit/image_processing_owlvit.js'; -export * from './phi3_v/image_processing_phi3_v.js'; -export * from './pvt/image_processing_pvt.js'; -export * from './qwen2_vl/image_processing_qwen2_vl.js'; -export * from './rt_detr/image_processing_rt_detr.js'; -export * from './sam/image_processing_sam.js'; -export * from './sam2/image_processing_sam2.js'; -export * from './sam3/image_processing_sam3.js'; -export * from './segformer/image_processing_segformer.js'; -export * from './siglip/image_processing_siglip.js'; -export * from './smolvlm/image_processing_smolvlm.js'; -export * from './swin2sr/image_processing_swin2sr.js'; -export * 
from './vit/image_processing_vit.js'; -export * from './vitmatte/image_processing_vitmatte.js'; -export * from './vitpose/image_processing_vitpose.js'; -export * from './yolos/image_processing_yolos.js'; diff --git a/src/models/llava_onevision/image_processing_llava_onevision.js b/src/models/llava_onevision/image_processing_llava_onevision.js deleted file mode 100644 index 95075666a..000000000 --- a/src/models/llava_onevision/image_processing_llava_onevision.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class LlavaOnevisionImageProcessor extends ImageProcessor {} diff --git a/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js b/src/models/model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js similarity index 94% rename from src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js rename to src/models/model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js index 057e72a19..0f98d5e09 100644 --- a/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +++ b/src/models/model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class ASTFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/auto/feature_extraction_auto.js b/src/models/model-processors/auto/feature_extraction_auto.js similarity index 79% rename from src/models/auto/feature_extraction_auto.js rename to src/models/model-processors/auto/feature_extraction_auto.js index c10f5bbef..e1a76f9aa 100644 --- a/src/models/auto/feature_extraction_auto.js +++ b/src/models/model-processors/auto/feature_extraction_auto.js @@ -1,7 +1,7 @@ -import { FEATURE_EXTRACTOR_NAME, GITHUB_ISSUE_URL } from '../../utils/constants.js'; -import { getModelJSON } from '../../utils/hub.js'; -import { FeatureExtractor } from '../../base/feature_extraction_utils.js'; -import * as AllFeatureExtractors from '../feature_extractors.js'; +import { FEATURE_EXTRACTOR_NAME, GITHUB_ISSUE_URL } from '../../../utils/constants.js'; +import { getModelJSON } from '../../../utils/hub.js'; +import { FeatureExtractor } from '../../../base/feature_extraction_utils.js'; +import * as AllFeatureExtractors from '../../feature-extractors.js'; export class AutoFeatureExtractor { /** @type {typeof FeatureExtractor.from_pretrained} */ diff --git a/src/models/auto/image_processing_auto.js b/src/models/model-processors/auto/image_processing_auto.js similarity index 80% rename from src/models/auto/image_processing_auto.js rename to src/models/model-processors/auto/image_processing_auto.js index 1a9348dc3..a841d25de 100644 --- a/src/models/auto/image_processing_auto.js +++ b/src/models/model-processors/auto/image_processing_auto.js @@ -1,7 +1,7 @@ -import { GITHUB_ISSUE_URL, IMAGE_PROCESSOR_NAME } from '../../utils/constants.js'; -import { getModelJSON } from 
'../../utils/hub.js'; -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import * as AllImageProcessors from '../image_processors.js'; +import { getModelJSON } from '../../../utils/hub.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import * as AllImageProcessors from '../../image-processors.js'; +import { GITHUB_ISSUE_URL, IMAGE_PROCESSOR_NAME } from '../../../utils/constants.js'; export class AutoImageProcessor { /** @type {typeof ImageProcessor.from_pretrained} */ diff --git a/src/models/auto/processing_auto.js b/src/models/model-processors/auto/processing_auto.js similarity index 87% rename from src/models/auto/processing_auto.js rename to src/models/model-processors/auto/processing_auto.js index 75b2b7335..5d2908516 100644 --- a/src/models/auto/processing_auto.js +++ b/src/models/model-processors/auto/processing_auto.js @@ -1,13 +1,13 @@ -import { IMAGE_PROCESSOR_NAME } from '../../utils/constants.js'; -import { getModelJSON } from '../../utils/hub.js'; -import { Processor } from '../../base/processing_utils.js'; +import { IMAGE_PROCESSOR_NAME } from '../../../utils/constants.js'; +import { getModelJSON } from '../../../utils/hub.js'; +import { Processor } from '../../../base/processing_utils.js'; -import * as AllProcessors from '../processors.js'; -import * as AllImageProcessors from '../image_processors.js'; -import * as AllFeatureExtractors from '../feature_extractors.js'; +import * as AllProcessors from '../../processors.js'; +import * as AllImageProcessors from '../../image-processors.js'; +import * as AllFeatureExtractors from '../../feature-extractors.js'; /** - * @typedef {import('../../base/processing_utils.js').PretrainedProcessorOptions} PretrainedProcessorOptions + * @typedef {import('../../../base/processing_utils.js').PretrainedProcessorOptions} PretrainedProcessorOptions */ /** diff --git a/src/models/model-processors/beit/image_processing_beit.js b/src/models/model-processors/beit/image_processing_beit.js new file mode 100644 index 000000000..9eb07ef95 --- /dev/null +++ b/src/models/model-processors/beit/image_processing_beit.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class BeitFeatureExtractor extends ImageProcessor {} diff --git a/src/models/model-processors/bit/image_processing_bit.js b/src/models/model-processors/bit/image_processing_bit.js new file mode 100644 index 000000000..75925ed6c --- /dev/null +++ b/src/models/model-processors/bit/image_processing_bit.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class BitImageProcessor extends ImageProcessor {} diff --git a/src/models/chatterbox/feature_extraction_chatterbox.js b/src/models/model-processors/chatterbox/feature_extraction_chatterbox.js similarity index 82% rename from src/models/chatterbox/feature_extraction_chatterbox.js rename to src/models/model-processors/chatterbox/feature_extraction_chatterbox.js index 39cf46ffd..5675a9b27 100644 --- a/src/models/chatterbox/feature_extraction_chatterbox.js +++ b/src/models/model-processors/chatterbox/feature_extraction_chatterbox.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class ChatterboxFeatureExtractor extends 
FeatureExtractor { /** diff --git a/src/models/chatterbox/processing_chatterbox.js b/src/models/model-processors/chatterbox/processing_chatterbox.js similarity index 82% rename from src/models/chatterbox/processing_chatterbox.js rename to src/models/model-processors/chatterbox/processing_chatterbox.js index 546a3b63d..d54e43b55 100644 --- a/src/models/chatterbox/processing_chatterbox.js +++ b/src/models/model-processors/chatterbox/processing_chatterbox.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a ChatterboxProcessor that extracts features from an audio input. diff --git a/src/models/model-processors/chinese_clip/image_processing_chinese_clip.js b/src/models/model-processors/chinese_clip/image_processing_chinese_clip.js new file mode 100644 index 000000000..97f5d09a2 --- /dev/null +++ b/src/models/model-processors/chinese_clip/image_processing_chinese_clip.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class ChineseCLIPFeatureExtractor extends ImageProcessor {} diff --git a/src/models/clap/feature_extraction_clap.js b/src/models/model-processors/clap/feature_extraction_clap.js similarity index 97% rename from src/models/clap/feature_extraction_clap.js rename to src/models/model-processors/clap/feature_extraction_clap.js index 605748616..c700a6ecd 100644 --- a/src/models/clap/feature_extraction_clap.js +++ b/src/models/model-processors/clap/feature_extraction_clap.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class ClapFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/clip/image_processing_clip.js b/src/models/model-processors/clip/image_processing_clip.js similarity index 62% rename from src/models/clip/image_processing_clip.js rename to src/models/model-processors/clip/image_processing_clip.js index d40bb44c4..f649e09e9 100644 --- a/src/models/clip/image_processing_clip.js +++ b/src/models/model-processors/clip/image_processing_clip.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class CLIPImageProcessor extends ImageProcessor {} export class CLIPFeatureExtractor extends CLIPImageProcessor {} diff --git a/src/models/convnext/image_processing_convnext.js b/src/models/model-processors/convnext/image_processing_convnext.js similarity index 95% rename from src/models/convnext/image_processing_convnext.js rename to src/models/model-processors/convnext/image_processing_convnext.js index b5812c9c8..67e5de5b5 100644 --- a/src/models/convnext/image_processing_convnext.js +++ b/src/models/model-processors/convnext/image_processing_convnext.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; 
+import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class ConvNextImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/dac/feature_extraction_dac.js b/src/models/model-processors/dac/feature_extraction_dac.js similarity index 100% rename from src/models/dac/feature_extraction_dac.js rename to src/models/model-processors/dac/feature_extraction_dac.js diff --git a/src/models/deit/image_processing_deit.js b/src/models/model-processors/deit/image_processing_deit.js similarity index 62% rename from src/models/deit/image_processing_deit.js rename to src/models/model-processors/deit/image_processing_deit.js index 7313495c7..faa5c1563 100644 --- a/src/models/deit/image_processing_deit.js +++ b/src/models/model-processors/deit/image_processing_deit.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class DeiTImageProcessor extends ImageProcessor {} export class DeiTFeatureExtractor extends DeiTImageProcessor {} diff --git a/src/models/detr/image_processing_detr.js b/src/models/model-processors/detr/image_processing_detr.js similarity index 79% rename from src/models/detr/image_processing_detr.js rename to src/models/model-processors/detr/image_processing_detr.js index 7c8a653b0..cd441764c 100644 --- a/src/models/detr/image_processing_detr.js +++ b/src/models/model-processors/detr/image_processing_detr.js @@ -3,21 +3,21 @@ import { post_process_object_detection, post_process_panoptic_segmentation, post_process_instance_segmentation, -} from '../../base/image_processors_utils.js'; +} from '../../../base/image_processors_utils.js'; -import { full } from '../../utils/tensor.js'; +import { full } from '../../../utils/tensor.js'; /** * @typedef {object} DetrFeatureExtractorResultProps - * @property {import('../../utils/tensor.js').Tensor} pixel_mask - * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult + * @property {import('../../../utils/tensor.js').Tensor} pixel_mask + * @typedef {import('../../../base/image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult */ export class DetrImageProcessor extends ImageProcessor { /** * Calls the feature extraction process on an array of images, preprocesses * each image, and concatenates the resulting features into a single Tensor. - * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from. + * @param {import('../../../utils/image.js').RawImage[]} images The image(s) to extract features from. * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. 
*/ async _call(images) { diff --git a/src/models/model-processors/dinov3_vit/image_processing_dinov3_vit.js b/src/models/model-processors/dinov3_vit/image_processing_dinov3_vit.js new file mode 100644 index 000000000..534120b78 --- /dev/null +++ b/src/models/model-processors/dinov3_vit/image_processing_dinov3_vit.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class DINOv3ViTImageProcessor extends ImageProcessor {} diff --git a/src/models/donut/image_processing_donut.js b/src/models/model-processors/donut/image_processing_donut.js similarity index 93% rename from src/models/donut/image_processing_donut.js rename to src/models/model-processors/donut/image_processing_donut.js index 353c0bdd4..e778ff350 100644 --- a/src/models/donut/image_processing_donut.js +++ b/src/models/model-processors/donut/image_processing_donut.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class DonutImageProcessor extends ImageProcessor { pad_image(pixelData, imgDims, padSize, options = {}) { diff --git a/src/models/dpt/image_processing_dpt.js b/src/models/model-processors/dpt/image_processing_dpt.js similarity index 67% rename from src/models/dpt/image_processing_dpt.js rename to src/models/model-processors/dpt/image_processing_dpt.js index c854ae6fd..7789d3bf2 100644 --- a/src/models/dpt/image_processing_dpt.js +++ b/src/models/model-processors/dpt/image_processing_dpt.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class DPTImageProcessor extends ImageProcessor {} export class DPTFeatureExtractor extends DPTImageProcessor {} // NOTE: extends DPTImageProcessor diff --git a/src/models/efficientnet/image_processing_efficientnet.js b/src/models/model-processors/efficientnet/image_processing_efficientnet.js similarity index 81% rename from src/models/efficientnet/image_processing_efficientnet.js rename to src/models/model-processors/efficientnet/image_processing_efficientnet.js index 837af8840..ef39f10aa 100644 --- a/src/models/efficientnet/image_processing_efficientnet.js +++ b/src/models/model-processors/efficientnet/image_processing_efficientnet.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class EfficientNetImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/encodec/feature_extraction_encodec.js b/src/models/model-processors/encodec/feature_extraction_encodec.js similarity index 87% rename from src/models/encodec/feature_extraction_encodec.js rename to src/models/model-processors/encodec/feature_extraction_encodec.js index bb60a5d63..83a1430a1 100644 --- a/src/models/encodec/feature_extraction_encodec.js +++ b/src/models/model-processors/encodec/feature_extraction_encodec.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class EncodecFeatureExtractor extends FeatureExtractor { /** diff --git a/src/models/florence2/processing_florence2.js 
b/src/models/model-processors/florence2/processing_florence2.js similarity index 97% rename from src/models/florence2/processing_florence2.js rename to src/models/model-processors/florence2/processing_florence2.js index 13edc10a0..e817a5b4d 100644 --- a/src/models/florence2/processing_florence2.js +++ b/src/models/model-processors/florence2/processing_florence2.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class Florence2Processor extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/gemma3n/feature_extraction_gemma3n.js b/src/models/model-processors/gemma3n/feature_extraction_gemma3n.js similarity index 95% rename from src/models/gemma3n/feature_extraction_gemma3n.js rename to src/models/model-processors/gemma3n/feature_extraction_gemma3n.js index 5d77ebf24..d6fc75a95 100644 --- a/src/models/gemma3n/feature_extraction_gemma3n.js +++ b/src/models/model-processors/gemma3n/feature_extraction_gemma3n.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { full, Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { full, Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class Gemma3nAudioFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/gemma3n/processing_gemma3n.js b/src/models/model-processors/gemma3n/processing_gemma3n.js similarity index 92% rename from src/models/gemma3n/processing_gemma3n.js rename to src/models/model-processors/gemma3n/processing_gemma3n.js index a3b47741d..e8162f6f6 100644 --- a/src/models/gemma3n/processing_gemma3n.js +++ b/src/models/model-processors/gemma3n/processing_gemma3n.js @@ -1,9 +1,9 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; -import { RawAudio } from '../../utils/audio.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; +import { RawAudio } from '../../../utils/audio.js'; export class Gemma3nProcessor extends Processor { static image_processor_class = AutoImageProcessor; diff --git a/src/models/model-processors/glpn/image_processing_glpn.js b/src/models/model-processors/glpn/image_processing_glpn.js new file mode 100644 index 000000000..cf6b23967 --- /dev/null +++ b/src/models/model-processors/glpn/image_processing_glpn.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class GLPNFeatureExtractor extends ImageProcessor {} diff --git a/src/models/grounding_dino/image_processing_grounding_dino.js b/src/models/model-processors/grounding_dino/image_processing_grounding_dino.js similarity index 60% rename from 
src/models/grounding_dino/image_processing_grounding_dino.js rename to src/models/model-processors/grounding_dino/image_processing_grounding_dino.js index 0042ab763..2eceb7bcc 100644 --- a/src/models/grounding_dino/image_processing_grounding_dino.js +++ b/src/models/model-processors/grounding_dino/image_processing_grounding_dino.js @@ -1,17 +1,17 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { ones } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { ones } from '../../../utils/tensor.js'; /** * @typedef {object} GroundingDinoFeatureExtractorResultProps - * @property {import('../../utils/tensor.js').Tensor} pixel_mask - * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult + * @property {import('../../../utils/tensor.js').Tensor} pixel_mask + * @typedef {import('../../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult */ export class GroundingDinoImageProcessor extends ImageProcessor { /** * Calls the feature extraction process on an array of images, preprocesses * each image, and concatenates the resulting features into a single Tensor. - * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from. + * @param {import('../../../utils/image.js').RawImage[]} images The image(s) to extract features from. * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. */ async _call(images) { diff --git a/src/models/grounding_dino/processing_grounding_dino.js b/src/models/model-processors/grounding_dino/processing_grounding_dino.js similarity index 85% rename from src/models/grounding_dino/processing_grounding_dino.js rename to src/models/model-processors/grounding_dino/processing_grounding_dino.js index 8ad0eade0..d1dea7b05 100644 --- a/src/models/grounding_dino/processing_grounding_dino.js +++ b/src/models/model-processors/grounding_dino/processing_grounding_dino.js @@ -1,12 +1,12 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { center_to_corners_format } from '../../base/image_processors_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { center_to_corners_format } from '../../../base/image_processors_utils.js'; /** * Get token ids of phrases from posmaps and input_ids. - * @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`. - * @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`. + * @param {import('../../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`. + * @param {import('../../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`. 
*/ function get_phrases_from_posmap(posmaps, input_ids) { const left_idx = 0; @@ -28,7 +28,7 @@ export class GroundingDinoProcessor extends Processor { static image_processor_class = AutoImageProcessor; /** - * @typedef {import('../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/image.js').RawImage} RawImage */ /** * diff --git a/src/models/idefics3/image_processing_idefics3.js b/src/models/model-processors/idefics3/image_processing_idefics3.js similarity index 96% rename from src/models/idefics3/image_processing_idefics3.js rename to src/models/model-processors/idefics3/image_processing_idefics3.js index a2ada459b..709208f4c 100644 --- a/src/models/idefics3/image_processing_idefics3.js +++ b/src/models/model-processors/idefics3/image_processing_idefics3.js @@ -1,5 +1,5 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { cat, full, interpolate_4d, slice, stack } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { cat, full, interpolate_4d, slice, stack } from '../../../utils/tensor.js'; export class Idefics3ImageProcessor extends ImageProcessor { constructor(config) { @@ -10,8 +10,8 @@ export class Idefics3ImageProcessor extends ImageProcessor { } /** - * @typedef {import('../../utils/image.js').RawImage} RawImage - * @typedef {import('../../utils/tensor.js').Tensor} Tensor + * @typedef {import('../../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/tensor.js').Tensor} Tensor */ /** diff --git a/src/models/idefics3/processing_idefics3.js b/src/models/model-processors/idefics3/processing_idefics3.js similarity index 95% rename from src/models/idefics3/processing_idefics3.js rename to src/models/model-processors/idefics3/processing_idefics3.js index fb8898cf8..423d4ebf6 100644 --- a/src/models/idefics3/processing_idefics3.js +++ b/src/models/model-processors/idefics3/processing_idefics3.js @@ -1,8 +1,8 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; -import { count } from '../../utils/core.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; +import { count } from '../../../utils/core.js'; /** * Prompt with expanded image tokens for when the image is split into patches. 
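The renames above all follow the same pattern: each processor module moves one directory deeper into src/models/model-processors/, so every relative import gains one extra '../', while the new barrel files (image-processors.js, feature-extractors.js) re-export the same class names as before. A minimal sketch of imports that should still resolve after the move, assuming a consuming module that sits directly under src/models/ (the consumer path is hypothetical; the exported names and file locations are taken from the diff):

// Hypothetical consumer located at src/models/some_module.js
import { ImageFeatureExtractor } from './feature-extractors.js'; // legacy alias for ImageProcessor, re-exported by the new barrel
import { CLIPImageProcessor } from './image-processors.js'; // re-exported from model-processors/clip/image_processing_clip.js
import { AutoImageProcessor } from './model-processors/auto/image_processing_auto.js'; // auto class after the one-level-deeper move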
diff --git a/src/models/janus/image_processing_janus.js b/src/models/model-processors/janus/image_processing_janus.js similarity index 89% rename from src/models/janus/image_processing_janus.js rename to src/models/model-processors/janus/image_processing_janus.js index 96d00e795..30c4d685b 100644 --- a/src/models/janus/image_processing_janus.js +++ b/src/models/model-processors/janus/image_processing_janus.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class VLMImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/janus/processing_janus.js b/src/models/model-processors/janus/processing_janus.js similarity index 89% rename from src/models/janus/processing_janus.js rename to src/models/model-processors/janus/processing_janus.js index 54c797f9f..3a3301f14 100644 --- a/src/models/janus/processing_janus.js +++ b/src/models/model-processors/janus/processing_janus.js @@ -1,9 +1,9 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { mergeArrays } from '../../utils/core.js'; -import { Tensor } from '../../utils/tensor.js'; -import { RawImage } from '../../utils/image.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { mergeArrays } from '../../../utils/core.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { RawImage } from '../../../utils/image.js'; export class VLChatProcessor extends Processor { static image_processor_class = AutoImageProcessor; @@ -22,7 +22,7 @@ export class VLChatProcessor extends Processor { /** * @typedef {Object} MultimodalMessageProperties Additional properties for multimodal messages. * @property {(RawImage | string | URL)[]} [images] The images in the message. - * @typedef {(import('../../tokenizers.js').Message & MultimodalMessageProperties)[]} MultimodalConversation The conversation possibly containing multimodal inputs. + * @typedef {(import('../../../tokenizers.js').Message & MultimodalMessageProperties)[]} MultimodalConversation The conversation possibly containing multimodal inputs. */ /** @@ -38,7 +38,7 @@ export class VLChatProcessor extends Processor { * @param {Object} options Additional options for processing. * @param {RawImage|RawImage[]} [options.images] The images to process, if not set in the conversation. * @param {string} [options.chat_template="default"] The chat template to use. - * @returns {Promise} The processed input. + * @returns {Promise} The processed input. 
*/ async _call(conversation, { images = null, chat_template = 'default' } = {}) { if (!images) { diff --git a/src/models/jina_clip/image_processing_jina_clip.js b/src/models/model-processors/jina_clip/image_processing_jina_clip.js similarity index 91% rename from src/models/jina_clip/image_processing_jina_clip.js rename to src/models/model-processors/jina_clip/image_processing_jina_clip.js index 7b901f5ee..9613a3710 100644 --- a/src/models/jina_clip/image_processing_jina_clip.js +++ b/src/models/model-processors/jina_clip/image_processing_jina_clip.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class JinaCLIPImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/jina_clip/processing_jina_clip.js b/src/models/model-processors/jina_clip/processing_jina_clip.js similarity index 84% rename from src/models/jina_clip/processing_jina_clip.js rename to src/models/model-processors/jina_clip/processing_jina_clip.js index ef3d3ffd8..e0dadb756 100644 --- a/src/models/jina_clip/processing_jina_clip.js +++ b/src/models/model-processors/jina_clip/processing_jina_clip.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class JinaCLIPProcessor extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/llava/processing_llava.js b/src/models/model-processors/llava/processing_llava.js similarity index 87% rename from src/models/llava/processing_llava.js rename to src/models/model-processors/llava/processing_llava.js index 4f70edca8..849c82c54 100644 --- a/src/models/llava/processing_llava.js +++ b/src/models/model-processors/llava/processing_llava.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class LlavaProcessor extends Processor { static tokenizer_class = AutoTokenizer; @@ -8,7 +8,7 @@ export class LlavaProcessor extends Processor { static uses_processor_config = true; /** - * @typedef {import('../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/image.js').RawImage} RawImage */ // `images` is required, `text` is optional diff --git a/src/models/model-processors/llava_onevision/image_processing_llava_onevision.js b/src/models/model-processors/llava_onevision/image_processing_llava_onevision.js new file mode 100644 index 000000000..1caf43c6b --- /dev/null +++ b/src/models/model-processors/llava_onevision/image_processing_llava_onevision.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class LlavaOnevisionImageProcessor extends ImageProcessor {} diff --git a/src/models/mask2former/image_processing_mask2former.js b/src/models/model-processors/mask2former/image_processing_mask2former.js similarity index 100% rename from src/models/mask2former/image_processing_mask2former.js rename to src/models/model-processors/mask2former/image_processing_mask2former.js diff --git 
a/src/models/maskformer/image_processing_maskformer.js b/src/models/model-processors/maskformer/image_processing_maskformer.js similarity index 92% rename from src/models/maskformer/image_processing_maskformer.js rename to src/models/model-processors/maskformer/image_processing_maskformer.js index 95e70f6e8..5996c913f 100644 --- a/src/models/maskformer/image_processing_maskformer.js +++ b/src/models/model-processors/maskformer/image_processing_maskformer.js @@ -2,7 +2,7 @@ import { ImageProcessor, post_process_panoptic_segmentation, post_process_instance_segmentation, -} from '../../base/image_processors_utils.js'; +} from '../../../base/image_processors_utils.js'; export class MaskFormerImageProcessor extends ImageProcessor { /** @type {typeof post_process_panoptic_segmentation} */ diff --git a/src/models/mgp_str/processing_mgp_str.js b/src/models/model-processors/mgp_str/processing_mgp_str.js similarity index 89% rename from src/models/mgp_str/processing_mgp_str.js rename to src/models/model-processors/mgp_str/processing_mgp_str.js index a312b17d2..763f7d579 100644 --- a/src/models/mgp_str/processing_mgp_str.js +++ b/src/models/model-processors/mgp_str/processing_mgp_str.js @@ -1,7 +1,7 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { max, softmax } from '../../utils/maths.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { max, softmax } from '../../../utils/maths.js'; const DECODE_TYPE_MAPPING = { char: ['char_decode', 1], @@ -13,21 +13,21 @@ export class MgpstrProcessor extends Processor { static image_processor_class = AutoImageProcessor; /** - * @returns {import('../../tokenizers.js').MgpstrTokenizer} The character tokenizer. + * @returns {import('../../../tokenizers.js').MgpstrTokenizer} The character tokenizer. */ get char_tokenizer() { return this.components.char_tokenizer; } /** - * @returns {import('../../tokenizers.js').GPT2Tokenizer} The BPE tokenizer. + * @returns {import('../../../tokenizers.js').GPT2Tokenizer} The BPE tokenizer. */ get bpe_tokenizer() { return this.components.bpe_tokenizer; } /** - * @returns {import('../../tokenizers.js').BertTokenizer} The WordPiece tokenizer. + * @returns {import('../../../tokenizers.js').BertTokenizer} The WordPiece tokenizer. */ get wp_tokenizer() { return this.components.wp_tokenizer; @@ -35,7 +35,7 @@ export class MgpstrProcessor extends Processor { /** * Helper function to decode the model prediction logits. - * @param {import('../../utils/tensor.js').Tensor} pred_logits Model prediction logits. + * @param {import('../../../utils/tensor.js').Tensor} pred_logits Model prediction logits. * @param {string} format Type of model prediction. Must be one of ['char', 'bpe', 'wp']. * @returns {[string[], number[]]} The decoded sentences and their confidence scores. */ @@ -108,7 +108,7 @@ export class MgpstrProcessor extends Processor { /** * Convert a list of lists of token ids into a list of strings by calling decode. - * @param {[import('../../utils/tensor.js').Tensor, import('../../utils/tensor.js').Tensor, import('../../utils/tensor.js').Tensor]} sequences List of tokenized input ids. + * @param {[import('../../../utils/tensor.js').Tensor, import('../../../utils/tensor.js').Tensor, import('../../../utils/tensor.js').Tensor]} sequences List of tokenized input ids. 
* @returns {{generated_text: string[], scores: number[], char_preds: string[], bpe_preds: string[], wp_preds: string[]}} * Dictionary of all the outputs of the decoded results. * - generated_text: The final results after fusion of char, bpe, and wp. diff --git a/src/models/mobilenet_v1/image_processing_mobilenet_v1.js b/src/models/model-processors/mobilenet_v1/image_processing_mobilenet_v1.js similarity index 66% rename from src/models/mobilenet_v1/image_processing_mobilenet_v1.js rename to src/models/model-processors/mobilenet_v1/image_processing_mobilenet_v1.js index d11dbed3d..4913f7033 100644 --- a/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +++ b/src/models/model-processors/mobilenet_v1/image_processing_mobilenet_v1.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV1ImageProcessor extends ImageProcessor {} export class MobileNetV1FeatureExtractor extends MobileNetV1ImageProcessor {} diff --git a/src/models/mobilenet_v2/image_processing_mobilenet_v2.js b/src/models/model-processors/mobilenet_v2/image_processing_mobilenet_v2.js similarity index 66% rename from src/models/mobilenet_v2/image_processing_mobilenet_v2.js rename to src/models/model-processors/mobilenet_v2/image_processing_mobilenet_v2.js index 687d888ca..34a8a2ac2 100644 --- a/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +++ b/src/models/model-processors/mobilenet_v2/image_processing_mobilenet_v2.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV2ImageProcessor extends ImageProcessor {} export class MobileNetV2FeatureExtractor extends MobileNetV2ImageProcessor {} diff --git a/src/models/mobilenet_v3/image_processing_mobilenet_v3.js b/src/models/model-processors/mobilenet_v3/image_processing_mobilenet_v3.js similarity index 66% rename from src/models/mobilenet_v3/image_processing_mobilenet_v3.js rename to src/models/model-processors/mobilenet_v3/image_processing_mobilenet_v3.js index 0da34b58b..1859ef91d 100644 --- a/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +++ b/src/models/model-processors/mobilenet_v3/image_processing_mobilenet_v3.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV3ImageProcessor extends ImageProcessor {} export class MobileNetV3FeatureExtractor extends MobileNetV3ImageProcessor {} diff --git a/src/models/mobilenet_v4/image_processing_mobilenet_v4.js b/src/models/model-processors/mobilenet_v4/image_processing_mobilenet_v4.js similarity index 66% rename from src/models/mobilenet_v4/image_processing_mobilenet_v4.js rename to src/models/model-processors/mobilenet_v4/image_processing_mobilenet_v4.js index c838ffbdb..d7f8e3e0a 100644 --- a/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +++ b/src/models/model-processors/mobilenet_v4/image_processing_mobilenet_v4.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV4ImageProcessor extends ImageProcessor {} export class MobileNetV4FeatureExtractor extends MobileNetV4ImageProcessor {} diff --git a/src/models/mobilevit/image_processing_mobilevit.js 
b/src/models/model-processors/mobilevit/image_processing_mobilevit.js similarity index 65% rename from src/models/mobilevit/image_processing_mobilevit.js rename to src/models/model-processors/mobilevit/image_processing_mobilevit.js index df2877ca4..164d71f49 100644 --- a/src/models/mobilevit/image_processing_mobilevit.js +++ b/src/models/model-processors/mobilevit/image_processing_mobilevit.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileViTImageProcessor extends ImageProcessor {} export class MobileViTFeatureExtractor extends MobileViTImageProcessor {} diff --git a/src/models/moonshine/feature_extraction_moonshine.js b/src/models/model-processors/moonshine/feature_extraction_moonshine.js similarity index 82% rename from src/models/moonshine/feature_extraction_moonshine.js rename to src/models/model-processors/moonshine/feature_extraction_moonshine.js index 6702d9021..55912ed4e 100644 --- a/src/models/moonshine/feature_extraction_moonshine.js +++ b/src/models/model-processors/moonshine/feature_extraction_moonshine.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class MoonshineFeatureExtractor extends FeatureExtractor { /** diff --git a/src/models/moonshine/processing_moonshine.js b/src/models/model-processors/moonshine/processing_moonshine.js similarity index 84% rename from src/models/moonshine/processing_moonshine.js rename to src/models/model-processors/moonshine/processing_moonshine.js index 895c276b6..65d1d7e34 100644 --- a/src/models/moonshine/processing_moonshine.js +++ b/src/models/model-processors/moonshine/processing_moonshine.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a MoonshineProcessor that extracts features from an audio input. 
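(Editor's illustration — not part of the patch.) The renames above move every processor one directory deeper, under src/models/model-processors/, so imports of src-level modules such as base/, tokenizers.js and utils/ gain one extra '../' (now '../../../'), while imports of the auto registries keep their '../auto/...' form, which suggests those registries live alongside the processors in the new directory. A hypothetical module following the new layout might look like the sketch below; the class and file names are invented, and the static-field and _call pattern simply mirrors the Llava and OwlViT processors shown elsewhere in this diff.

// src/models/model-processors/my_model/processing_my_model.js (hypothetical, for illustration only)
import { Processor } from '../../../base/processing_utils.js'; // one extra '../' after the move
import { AutoImageProcessor } from '../auto/image_processing_auto.js'; // sibling path unchanged
import { AutoTokenizer } from '../../../tokenizers.js';

export class MyModelProcessor extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  // Preprocess the images, tokenize the text (if any), and merge the two sets of model inputs.
  async _call(images, text = null, kwargs = {}) {
    const image_inputs = await this.image_processor(images, kwargs);
    const text_inputs = text ? this.tokenizer(text, kwargs) : {};
    return { ...image_inputs, ...text_inputs };
  }
}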
diff --git a/src/models/nougat/image_processing_nougat.js b/src/models/model-processors/nougat/image_processing_nougat.js similarity index 100% rename from src/models/nougat/image_processing_nougat.js rename to src/models/model-processors/nougat/image_processing_nougat.js diff --git a/src/models/owlv2/image_processing_owlv2.js b/src/models/model-processors/owlv2/image_processing_owlv2.js similarity index 100% rename from src/models/owlv2/image_processing_owlv2.js rename to src/models/model-processors/owlv2/image_processing_owlv2.js diff --git a/src/models/owlvit/image_processing_owlvit.js b/src/models/model-processors/owlvit/image_processing_owlvit.js similarity index 89% rename from src/models/owlvit/image_processing_owlvit.js rename to src/models/model-processors/owlvit/image_processing_owlvit.js index 767c312f8..2bcd147fb 100644 --- a/src/models/owlvit/image_processing_owlvit.js +++ b/src/models/model-processors/owlvit/image_processing_owlvit.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_object_detection } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_object_detection } from '../../../base/image_processors_utils.js'; export class OwlViTImageProcessor extends ImageProcessor { /** @type {typeof post_process_object_detection} */ diff --git a/src/models/owlvit/processing_owlvit.js b/src/models/model-processors/owlvit/processing_owlvit.js similarity index 65% rename from src/models/owlvit/processing_owlvit.js rename to src/models/model-processors/owlvit/processing_owlvit.js index 6f673746e..8722b5af3 100644 --- a/src/models/owlvit/processing_owlvit.js +++ b/src/models/model-processors/owlvit/processing_owlvit.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class OwlViTProcessor extends Processor { static tokenizer_class = AutoTokenizer; static image_processor_class = AutoImageProcessor; diff --git a/src/models/paligemma/processing_paligemma.js b/src/models/model-processors/paligemma/processing_paligemma.js similarity index 93% rename from src/models/paligemma/processing_paligemma.js rename to src/models/model-processors/paligemma/processing_paligemma.js index 1aa3b1167..68dc83bfb 100644 --- a/src/models/paligemma/processing_paligemma.js +++ b/src/models/model-processors/paligemma/processing_paligemma.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; const IMAGE_TOKEN = ''; @@ -14,7 +14,7 @@ export class PaliGemmaProcessor extends Processor { static uses_processor_config = false; /** - * @typedef {import('../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/image.js').RawImage} RawImage */ // `images` is required, `text` is optional diff --git a/src/models/parakeet/feature_extraction_parakeet.js b/src/models/model-processors/parakeet/feature_extraction_parakeet.js similarity index 96% rename from src/models/parakeet/feature_extraction_parakeet.js rename to src/models/model-processors/parakeet/feature_extraction_parakeet.js index e4862270b..c986c8b3d 100644 --- 
a/src/models/parakeet/feature_extraction_parakeet.js +++ b/src/models/model-processors/parakeet/feature_extraction_parakeet.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; const EPSILON = 1e-5; diff --git a/src/models/phi3_v/image_processing_phi3_v.js b/src/models/model-processors/phi3_v/image_processing_phi3_v.js similarity index 98% rename from src/models/phi3_v/image_processing_phi3_v.js rename to src/models/model-processors/phi3_v/image_processing_phi3_v.js index 50a804e98..eb52c96c3 100644 --- a/src/models/phi3_v/image_processing_phi3_v.js +++ b/src/models/model-processors/phi3_v/image_processing_phi3_v.js @@ -1,5 +1,5 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { cat, interpolate_4d, slice, stack, Tensor } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { cat, interpolate_4d, slice, stack, Tensor } from '../../../utils/tensor.js'; const IMAGE_SIZE = 336; const SLICE_AXES = [2, 3]; // axes to slice on diff --git a/src/models/phi3_v/processing_phi3_v.js b/src/models/model-processors/phi3_v/processing_phi3_v.js similarity index 91% rename from src/models/phi3_v/processing_phi3_v.js rename to src/models/model-processors/phi3_v/processing_phi3_v.js index 18d3eb15a..5fb681330 100644 --- a/src/models/phi3_v/processing_phi3_v.js +++ b/src/models/model-processors/phi3_v/processing_phi3_v.js @@ -1,7 +1,7 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; const IMAGE_TOKEN = '<|image|>'; const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g; diff --git a/src/models/model-processors/pvt/image_processing_pvt.js b/src/models/model-processors/pvt/image_processing_pvt.js new file mode 100644 index 000000000..2aa79cd4c --- /dev/null +++ b/src/models/model-processors/pvt/image_processing_pvt.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class PvtImageProcessor extends ImageProcessor {} diff --git a/src/models/pyannote/feature_extraction_pyannote.js b/src/models/model-processors/pyannote/feature_extraction_pyannote.js similarity index 89% rename from src/models/pyannote/feature_extraction_pyannote.js rename to src/models/model-processors/pyannote/feature_extraction_pyannote.js index 3231a8ea9..dd4aaaed8 100644 --- a/src/models/pyannote/feature_extraction_pyannote.js +++ b/src/models/model-processors/pyannote/feature_extraction_pyannote.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { max, softmax } from '../../utils/maths.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; 
+import { Tensor } from '../../../utils/tensor.js'; +import { max, softmax } from '../../../utils/maths.js'; export class PyAnnoteFeatureExtractor extends FeatureExtractor { /** @@ -32,7 +32,7 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor { /** * Post-processes the speaker diarization logits output by the model. - * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model. + * @param {import('../../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model. * @param {number} num_samples Number of samples in the input audio. * @returns {Array>} The post-processed speaker diarization results. */ diff --git a/src/models/pyannote/processing_pyannote.js b/src/models/model-processors/pyannote/processing_pyannote.js similarity index 93% rename from src/models/pyannote/processing_pyannote.js rename to src/models/model-processors/pyannote/processing_pyannote.js index e6f19cdb3..d49d04142 100644 --- a/src/models/pyannote/processing_pyannote.js +++ b/src/models/model-processors/pyannote/processing_pyannote.js @@ -1,4 +1,4 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js'; export class PyAnnoteProcessor extends Processor { diff --git a/src/models/qwen2_vl/image_processing_qwen2_vl.js b/src/models/model-processors/qwen2_vl/image_processing_qwen2_vl.js similarity index 92% rename from src/models/qwen2_vl/image_processing_qwen2_vl.js rename to src/models/model-processors/qwen2_vl/image_processing_qwen2_vl.js index 809b24566..f6a77f436 100644 --- a/src/models/qwen2_vl/image_processing_qwen2_vl.js +++ b/src/models/model-processors/qwen2_vl/image_processing_qwen2_vl.js @@ -1,5 +1,5 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { cat, Tensor } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { cat, Tensor } from '../../../utils/tensor.js'; export class Qwen2VLImageProcessor extends ImageProcessor { async _call(images, ...args) { diff --git a/src/models/qwen2_vl/processing_qwen2_vl.js b/src/models/model-processors/qwen2_vl/processing_qwen2_vl.js similarity index 90% rename from src/models/qwen2_vl/processing_qwen2_vl.js rename to src/models/model-processors/qwen2_vl/processing_qwen2_vl.js index dd4775721..cf53b3807 100644 --- a/src/models/qwen2_vl/processing_qwen2_vl.js +++ b/src/models/model-processors/qwen2_vl/processing_qwen2_vl.js @@ -1,7 +1,7 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; export class Qwen2VLProcessor extends Processor { static image_processor_class = AutoImageProcessor; diff --git a/src/models/rt_detr/image_processing_rt_detr.js b/src/models/model-processors/rt_detr/image_processing_rt_detr.js similarity index 87% rename from src/models/rt_detr/image_processing_rt_detr.js rename to src/models/model-processors/rt_detr/image_processing_rt_detr.js index 1a8f8ee2d..da4b7b0e3 100644 --- a/src/models/rt_detr/image_processing_rt_detr.js +++ 
b/src/models/model-processors/rt_detr/image_processing_rt_detr.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_object_detection } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_object_detection } from '../../../base/image_processors_utils.js'; export class RTDetrImageProcessor extends ImageProcessor { /** @type {typeof post_process_object_detection} */ diff --git a/src/models/sam/image_processing_sam.js b/src/models/model-processors/sam/image_processing_sam.js similarity index 93% rename from src/models/sam/image_processing_sam.js rename to src/models/model-processors/sam/image_processing_sam.js index d4fa856ff..14c8276db 100644 --- a/src/models/sam/image_processing_sam.js +++ b/src/models/model-processors/sam/image_processing_sam.js @@ -1,13 +1,13 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { calculateDimensions } from '../../utils/core.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { calculateDimensions } from '../../../utils/core.js'; -import { interpolate_4d, Tensor } from '../../utils/tensor.js'; +import { interpolate_4d, Tensor } from '../../../utils/tensor.js'; /** * @typedef {object} SamImageProcessorResult * @property {Tensor} pixel_values - * @property {import("../../base/image_processors_utils.js").HeightWidth[]} original_sizes - * @property {import("../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes + * @property {import("../../../base/image_processors_utils.js").HeightWidth[]} original_sizes + * @property {import("../../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes * @property {Tensor} [input_points] * @property {Tensor} [input_labels] * @property {Tensor} [input_boxes] @@ -17,8 +17,8 @@ export class SamImageProcessor extends ImageProcessor { /** * * @param {any} input_points - * @param {import("../../base/image_processors_utils.js").HeightWidth[]} original_sizes - * @param {import("../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes + * @param {import("../../../base/image_processors_utils.js").HeightWidth[]} original_sizes + * @param {import("../../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes * @returns {Tensor} */ reshape_input_points(input_points, original_sizes, reshaped_input_sizes, is_bounding_box = false) { @@ -204,7 +204,7 @@ export class SamImageProcessor extends ImageProcessor { /** * Generates a list of crop boxes of different sizes. Each layer has (2**i)**2 boxes for the ith layer. - * @param {import("../../utils/image.js").RawImage} image Input original image + * @param {import("../../../utils/image.js").RawImage} image Input original image * @param {number} target_size Target size of the resized image * @param {Object} options Options for generating crop boxes * @param {number} [options.crop_n_layers] If >0, mask prediction will be run again on crops of the image. 
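(Editor's illustration — not part of the patch.) The SamImageProcessor hunk above carries over reshape_input_points, which maps user-supplied point prompts from original-image coordinates into the coordinates of the resized model input. Assuming it scales each prompt by the ratio of the reshaped to the original dimensions, and assuming the [height, width] ordering implied by the HeightWidth typedef, the arithmetic for a single point looks like the following sketch (the function name and the example sizes are illustrative, not taken from the patch):

// Rescale one (x, y) prompt from original-image coordinates into model-input coordinates.
function rescale_point([x, y], original_size, reshaped_input_size) {
  const [origH, origW] = original_size;
  const [newH, newW] = reshaped_input_size;
  return [x * (newW / origW), y * (newH / origH)];
}

// Example: a click at (500, 375) on a photo that is 2250 wide by 1500 high
// ([1500, 2250] in [height, width] terms), resized so its longest side is 1024
// ([683, 1024]), lands at roughly (227.56, 170.75) in model coordinates.
rescale_point([500, 375], [1500, 2250], [683, 1024]);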
diff --git a/src/models/sam/processing_sam.js b/src/models/model-processors/sam/processing_sam.js similarity index 89% rename from src/models/sam/processing_sam.js rename to src/models/model-processors/sam/processing_sam.js index de9e856ce..077db3458 100644 --- a/src/models/sam/processing_sam.js +++ b/src/models/model-processors/sam/processing_sam.js @@ -1,4 +1,4 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; export class SamProcessor extends Processor { diff --git a/src/models/sam2/image_processing_sam2.js b/src/models/model-processors/sam2/image_processing_sam2.js similarity index 100% rename from src/models/sam2/image_processing_sam2.js rename to src/models/model-processors/sam2/image_processing_sam2.js diff --git a/src/models/sam2/processing_sam2.js b/src/models/model-processors/sam2/processing_sam2.js similarity index 100% rename from src/models/sam2/processing_sam2.js rename to src/models/model-processors/sam2/processing_sam2.js diff --git a/src/models/sam3/image_processing_sam3.js b/src/models/model-processors/sam3/image_processing_sam3.js similarity index 100% rename from src/models/sam3/image_processing_sam3.js rename to src/models/model-processors/sam3/image_processing_sam3.js diff --git a/src/models/sapiens/image_processing_sapiens.js b/src/models/model-processors/sapiens/image_processing_sapiens.js similarity index 89% rename from src/models/sapiens/image_processing_sapiens.js rename to src/models/model-processors/sapiens/image_processing_sapiens.js index 15b755ff5..9df6582bd 100644 --- a/src/models/sapiens/image_processing_sapiens.js +++ b/src/models/model-processors/sapiens/image_processing_sapiens.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_semantic_segmentation } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_semantic_segmentation } from '../../../base/image_processors_utils.js'; export class SapiensImageProcessor extends ImageProcessor { /** @type {typeof post_process_semantic_segmentation} */ diff --git a/src/models/seamless_m4t/feature_extraction_seamless_m4t.js b/src/models/model-processors/seamless_m4t/feature_extraction_seamless_m4t.js similarity index 97% rename from src/models/seamless_m4t/feature_extraction_seamless_m4t.js rename to src/models/model-processors/seamless_m4t/feature_extraction_seamless_m4t.js index 41d9b7f4e..32c623239 100644 --- a/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +++ b/src/models/model-processors/seamless_m4t/feature_extraction_seamless_m4t.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class SeamlessM4TFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/segformer/image_processing_segformer.js b/src/models/model-processors/segformer/image_processing_segformer.js similarity index 89% rename from src/models/segformer/image_processing_segformer.js rename to src/models/model-processors/segformer/image_processing_segformer.js index 
1d23bc045..969af8eb8 100644 --- a/src/models/segformer/image_processing_segformer.js +++ b/src/models/model-processors/segformer/image_processing_segformer.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_semantic_segmentation } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_semantic_segmentation } from '../../../base/image_processors_utils.js'; export class SegformerImageProcessor extends ImageProcessor { /** @type {typeof post_process_semantic_segmentation} */ diff --git a/src/models/model-processors/siglip/image_processing_siglip.js b/src/models/model-processors/siglip/image_processing_siglip.js new file mode 100644 index 000000000..3c126f5f1 --- /dev/null +++ b/src/models/model-processors/siglip/image_processing_siglip.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class SiglipImageProcessor extends ImageProcessor {} diff --git a/src/models/smolvlm/image_processing_smolvlm.js b/src/models/model-processors/smolvlm/image_processing_smolvlm.js similarity index 100% rename from src/models/smolvlm/image_processing_smolvlm.js rename to src/models/model-processors/smolvlm/image_processing_smolvlm.js diff --git a/src/models/smolvlm/processing_smolvlm.js b/src/models/model-processors/smolvlm/processing_smolvlm.js similarity index 100% rename from src/models/smolvlm/processing_smolvlm.js rename to src/models/model-processors/smolvlm/processing_smolvlm.js diff --git a/src/models/snac/feature_extraction_snac.js b/src/models/model-processors/snac/feature_extraction_snac.js similarity index 100% rename from src/models/snac/feature_extraction_snac.js rename to src/models/model-processors/snac/feature_extraction_snac.js diff --git a/src/models/model-processors/speecht5/feature_extraction_speecht5.js b/src/models/model-processors/speecht5/feature_extraction_speecht5.js new file mode 100644 index 000000000..669d3e1f5 --- /dev/null +++ b/src/models/model-processors/speecht5/feature_extraction_speecht5.js @@ -0,0 +1,3 @@ +import { FeatureExtractor } from '../../../base/feature_extraction_utils.js'; + +export class SpeechT5FeatureExtractor extends FeatureExtractor {} diff --git a/src/models/speecht5/processing_speecht5.js b/src/models/model-processors/speecht5/processing_speecht5.js similarity index 82% rename from src/models/speecht5/processing_speecht5.js rename to src/models/model-processors/speecht5/processing_speecht5.js index 72824b2b0..2be9e8f42 100644 --- a/src/models/speecht5/processing_speecht5.js +++ b/src/models/model-processors/speecht5/processing_speecht5.js @@ -1,5 +1,5 @@ -import { Processor } from '../../base/processing_utils.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; export class SpeechT5Processor extends Processor { diff --git a/src/models/swin2sr/image_processing_swin2sr.js b/src/models/model-processors/swin2sr/image_processing_swin2sr.js similarity index 94% rename from src/models/swin2sr/image_processing_swin2sr.js rename to src/models/model-processors/swin2sr/image_processing_swin2sr.js index 7e5c810d1..c71ad06cf 100644 --- a/src/models/swin2sr/image_processing_swin2sr.js +++ b/src/models/model-processors/swin2sr/image_processing_swin2sr.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from 
'../../../base/image_processors_utils.js'; export class Swin2SRImageProcessor extends ImageProcessor { pad_image(pixelData, imgDims, padSize, options = {}) { diff --git a/src/models/ultravox/processing_ultravox.js b/src/models/model-processors/ultravox/processing_ultravox.js similarity index 94% rename from src/models/ultravox/processing_ultravox.js rename to src/models/model-processors/ultravox/processing_ultravox.js index 80bee8ec8..05fe8d16f 100644 --- a/src/models/ultravox/processing_ultravox.js +++ b/src/models/model-processors/ultravox/processing_ultravox.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a UltravoxProcessor that extracts features from an audio input. diff --git a/src/models/vit/image_processing_vit.js b/src/models/model-processors/vit/image_processing_vit.js similarity index 61% rename from src/models/vit/image_processing_vit.js rename to src/models/model-processors/vit/image_processing_vit.js index 63864507f..15b63aef2 100644 --- a/src/models/vit/image_processing_vit.js +++ b/src/models/model-processors/vit/image_processing_vit.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class ViTImageProcessor extends ImageProcessor {} export class ViTFeatureExtractor extends ViTImageProcessor {} diff --git a/src/models/vitmatte/image_processing_vitmatte.js b/src/models/model-processors/vitmatte/image_processing_vitmatte.js similarity index 72% rename from src/models/vitmatte/image_processing_vitmatte.js rename to src/models/model-processors/vitmatte/image_processing_vitmatte.js index d08b9b132..78ac2098e 100644 --- a/src/models/vitmatte/image_processing_vitmatte.js +++ b/src/models/model-processors/vitmatte/image_processing_vitmatte.js @@ -1,14 +1,14 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; -import { stack, cat } from '../../utils/tensor.js'; +import { stack, cat } from '../../../utils/tensor.js'; export class VitMatteImageProcessor extends ImageProcessor { /** * Calls the feature extraction process on an array of images, preprocesses * each image, and concatenates the resulting features into a single Tensor. - * @param {import("../../utils/image.js").RawImage[]} images The image(s) to extract features from. - * @param {import("../../utils/image.js").RawImage[]} trimaps The trimaps(s) to extract features from. - * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. + * @param {import("../../../utils/image.js").RawImage[]} images The image(s) to extract features from. + * @param {import("../../../utils/image.js").RawImage[]} trimaps The trimaps(s) to extract features from. + * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. 
*/ async _call(images, trimaps) { if (!Array.isArray(images)) { diff --git a/src/models/vitpose/image_processing_vitpose.js b/src/models/model-processors/vitpose/image_processing_vitpose.js similarity index 95% rename from src/models/vitpose/image_processing_vitpose.js rename to src/models/model-processors/vitpose/image_processing_vitpose.js index c19c486cd..cd6335534 100644 --- a/src/models/vitpose/image_processing_vitpose.js +++ b/src/models/model-processors/vitpose/image_processing_vitpose.js @@ -1,11 +1,11 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class VitPoseImageProcessor extends ImageProcessor { /** * Transform the heatmaps into keypoint predictions and transform them back to the image. * NOTE: This is a naive implementation and does not include advanced post-processing techniques, * so the results may not be as accurate as the original implementation. - * @param {import('../../utils/tensor.js').Tensor} outputs The model outputs. + * @param {import('../../../utils/tensor.js').Tensor} outputs The model outputs. * @param {[number, number, number, number][][]} boxes List or array of bounding boxes for each image. * Each box should be a list of 4 floats representing the bounding box coordinates in COCO format (top_left_x, top_left_y, width, height). * @returns {{ diff --git a/src/models/voxtral/processing_voxtral.js b/src/models/model-processors/voxtral/processing_voxtral.js similarity index 95% rename from src/models/voxtral/processing_voxtral.js rename to src/models/model-processors/voxtral/processing_voxtral.js index d002f4eca..f3d379c43 100644 --- a/src/models/voxtral/processing_voxtral.js +++ b/src/models/model-processors/voxtral/processing_voxtral.js @@ -1,7 +1,7 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; -import { cat } from '../../utils/tensor.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; +import { cat } from '../../../utils/tensor.js'; const AUDIO_TOKEN = '[AUDIO]'; const BEGIN_AUDIO_TOKEN = '[BEGIN_AUDIO]'; diff --git a/src/models/wav2vec2/feature_extraction_wav2vec2.js b/src/models/model-processors/wav2vec2/feature_extraction_wav2vec2.js similarity index 91% rename from src/models/wav2vec2/feature_extraction_wav2vec2.js rename to src/models/model-processors/wav2vec2/feature_extraction_wav2vec2.js index 194b71359..83688169c 100644 --- a/src/models/wav2vec2/feature_extraction_wav2vec2.js +++ b/src/models/model-processors/wav2vec2/feature_extraction_wav2vec2.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class Wav2Vec2FeatureExtractor extends FeatureExtractor { /** diff --git a/src/models/wav2vec2/processing_wav2vec2.js b/src/models/model-processors/wav2vec2/processing_wav2vec2.js similarity index 82% rename from src/models/wav2vec2/processing_wav2vec2.js rename to src/models/model-processors/wav2vec2/processing_wav2vec2.js index 583eee66e..553dcb67d 100644 --- a/src/models/wav2vec2/processing_wav2vec2.js +++ 
b/src/models/model-processors/wav2vec2/processing_wav2vec2.js @@ -1,6 +1,6 @@ -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; export class Wav2Vec2Processor extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js b/src/models/model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js similarity index 82% rename from src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js rename to src/models/model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js index 157aca474..b768817b1 100644 --- a/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +++ b/src/models/model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js @@ -1,6 +1,6 @@ -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; export class Wav2Vec2ProcessorWithLM extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/wespeaker/feature_extraction_wespeaker.js b/src/models/model-processors/wespeaker/feature_extraction_wespeaker.js similarity index 95% rename from src/models/wespeaker/feature_extraction_wespeaker.js rename to src/models/model-processors/wespeaker/feature_extraction_wespeaker.js index 81145c0d6..d8439ca32 100644 --- a/src/models/wespeaker/feature_extraction_wespeaker.js +++ b/src/models/model-processors/wespeaker/feature_extraction_wespeaker.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class WeSpeakerFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/whisper/common_whisper.js b/src/models/model-processors/whisper/common_whisper.js similarity index 100% rename from src/models/whisper/common_whisper.js rename to src/models/model-processors/whisper/common_whisper.js diff --git a/src/models/whisper/feature_extraction_whisper.js b/src/models/model-processors/whisper/feature_extraction_whisper.js similarity index 93% rename from src/models/whisper/feature_extraction_whisper.js rename to src/models/model-processors/whisper/feature_extraction_whisper.js index 0f8c85fc7..d049fe0f6 100644 --- a/src/models/whisper/feature_extraction_whisper.js +++ b/src/models/model-processors/whisper/feature_extraction_whisper.js @@ -1,7 +1,7 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; -import { max } from '../../utils/maths.js'; +import { FeatureExtractor, validate_audio_inputs } from 
'../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; +import { max } from '../../../utils/maths.js'; export class WhisperFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/whisper/generation_whisper.js b/src/models/model-processors/whisper/generation_whisper.js similarity index 90% rename from src/models/whisper/generation_whisper.js rename to src/models/model-processors/whisper/generation_whisper.js index 0fd1daa7d..d73f577d6 100644 --- a/src/models/whisper/generation_whisper.js +++ b/src/models/model-processors/whisper/generation_whisper.js @@ -1,4 +1,4 @@ -import { GenerationConfig } from '../../generation/configuration_utils.js'; +import { GenerationConfig } from '../../../generation/configuration_utils.js'; export class WhisperGenerationConfig extends GenerationConfig { /** @@ -84,5 +84,5 @@ export class WhisperGenerationConfig extends GenerationConfig { } /** - * @typedef {import('../../generation/parameters.js').GenerationFunctionParameters & {generation_config: WhisperGenerationConfig} & WhisperGenerationConfig} WhisperGenerationFunctionParameters + * @typedef {import('../../../generation/parameters.js').GenerationFunctionParameters & {generation_config: WhisperGenerationConfig} & WhisperGenerationConfig} WhisperGenerationFunctionParameters */ diff --git a/src/models/whisper/processing_whisper.js b/src/models/model-processors/whisper/processing_whisper.js similarity index 84% rename from src/models/whisper/processing_whisper.js rename to src/models/model-processors/whisper/processing_whisper.js index ef18dd7ce..952d06257 100644 --- a/src/models/whisper/processing_whisper.js +++ b/src/models/model-processors/whisper/processing_whisper.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a WhisperProcessor that extracts features from an audio input. diff --git a/src/models/yolos/image_processing_yolos.js b/src/models/model-processors/yolos/image_processing_yolos.js similarity index 89% rename from src/models/yolos/image_processing_yolos.js rename to src/models/model-processors/yolos/image_processing_yolos.js index 2933536b6..253fe4f9f 100644 --- a/src/models/yolos/image_processing_yolos.js +++ b/src/models/model-processors/yolos/image_processing_yolos.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_object_detection } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_object_detection } from '../../../base/image_processors_utils.js'; export class YolosImageProcessor extends ImageProcessor { /** @type {typeof post_process_object_detection} */ diff --git a/src/models/output.js b/src/models/output.js new file mode 100644 index 000000000..97823d38e --- /dev/null +++ b/src/models/output.js @@ -0,0 +1,270 @@ +/** + * @typedef {import('../utils/tensor.js').Tensor} Tensor + */ + +export class ModelOutput {} + +/** + * Base class for model's outputs, with potential hidden states and attentions. + */ +export class BaseModelOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. 
+ * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. + * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + */ + constructor({ last_hidden_state, hidden_states = null, attentions = null }) { + super(); + this.last_hidden_state = last_hidden_state; + this.hidden_states = hidden_states; + this.attentions = attentions; + } +} + +/** + * Base class for Segment-Anything model's output. + */ +export class SamImageSegmentationOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.iou_scores The output logits of the model. + * @param {Tensor} output.pred_masks Predicted boxes. + */ + constructor({ iou_scores, pred_masks }) { + super(); + this.iou_scores = iou_scores; + this.pred_masks = pred_masks; + } +} + +/** + * Base class for outputs of sentence classification models. + */ +export class SequenceClassifierOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). + * @param {Record} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + */ + constructor({ logits, ...attentions }) { + super(); + this.logits = logits; + const attentions_list = Object.values(attentions); + if (attentions_list.length > 0) { + // Only set attentions if they are not empty + this.attentions = attentions_list; + } + } +} + +/** + * Base class for outputs of XVector models. + */ +export class XVectorOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification hidden states before AMSoftmax, of shape `(batch_size, config.xvector_output_dim)`. + * @param {Tensor} output.embeddings Utterance embeddings used for vector similarity-based retrieval, of shape `(batch_size, config.xvector_output_dim)`. + */ + constructor({ logits, embeddings }) { + super(); + this.logits = logits; + this.embeddings = embeddings; + } +} + +/** + * Base class for outputs of token classification models. + */ +export class TokenClassifierOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification scores (before SoftMax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } +} + +/** + * Base class for masked language models outputs. + */ +export class MaskedLMOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } +} + +/** + * Base class for outputs of question answering models. + */ +export class QuestionAnsweringModelOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.start_logits Span-start scores (before SoftMax). 
+ * @param {Tensor} output.end_logits Span-end scores (before SoftMax). + */ + constructor({ start_logits, end_logits }) { + super(); + this.start_logits = start_logits; + this.end_logits = end_logits; + } +} + +/** + * Base class for causal language model (or autoregressive) outputs. + */ +export class CausalLMOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } +} + +/** + * Base class for causal language model (or autoregressive) outputs. + */ +export class CausalLMOutputWithPast extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). + * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) + * that can be used (see `past_key_values` input) to speed up sequential decoding. + */ + constructor({ logits, past_key_values }) { + super(); + this.logits = logits; + this.past_key_values = past_key_values; + } +} + +export class Seq2SeqLMOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.past_key_values A tensor of key/value pairs that represent the previous state of the model. + * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. + * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. + * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. + */ + constructor({ logits, past_key_values, encoder_outputs, decoder_attentions = null, cross_attentions = null }) { + super(); + this.logits = logits; + this.past_key_values = past_key_values; + this.encoder_outputs = encoder_outputs; + this.decoder_attentions = decoder_attentions; + this.cross_attentions = cross_attentions; + } +} + +export class ImageMattingOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. + */ + constructor({ alphas }) { + super(); + this.alphas = alphas; + } +} + +/** + * Describes the outputs for the VITS model. + */ +export class VitsModelOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. + * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. + * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. + */ + constructor({ waveform, spectrogram }) { + super(); + this.waveform = waveform; + this.spectrogram = spectrogram; + } +} + +export class Sam2ImageSegmentationOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.iou_scores The output logits of the model.
+ * @param {Tensor} output.pred_masks Predicted boxes. + * @param {Tensor} output.object_score_logits Logits for the object score, indicating if an object is present. + */ + constructor({ iou_scores, pred_masks, object_score_logits }) { + super(); + this.iou_scores = iou_scores; + this.pred_masks = pred_masks; + this.object_score_logits = object_score_logits; + } +} + +export class MgpstrModelOutput extends ModelOutput { + constructor({ char_logits, bpe_logits, wp_logits }) { + super(); + this.char_logits = char_logits; + this.bpe_logits = bpe_logits; + this.wp_logits = wp_logits; + } + + get logits() { + return [this.char_logits, this.bpe_logits, this.wp_logits]; + } +} + +export class MimiEncoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. + */ + constructor({ audio_codes }) { + super(); + this.audio_codes = audio_codes; + } +} + +export class MimiDecoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. + */ + constructor({ audio_values }) { + super(); + this.audio_values = audio_values; + } +} + +export class DacEncoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. + */ + constructor({ audio_codes }) { + super(); + this.audio_codes = audio_codes; + } +} + +export class DacDecoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. 
+ */ + constructor({ audio_values }) { + super(); + this.audio_values = audio_values; + } +} diff --git a/src/models/pre-trained-model.js b/src/models/pre-trained-model.js new file mode 100644 index 000000000..307686e04 --- /dev/null +++ b/src/models/pre-trained-model.js @@ -0,0 +1,1277 @@ +import { Callable } from '../utils/generic.js'; +import { constructSessions, sessionRun } from './session.js'; +import { AutoConfig, getCacheShapes } from '../configs.js'; +import { Tensor, DataTypeMap, full_like, cat, zeros_like, toI64Tensor, ones_like, ones } from '../utils/tensor.js'; +import { + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, +} from './registry.js'; +import { GITHUB_ISSUE_URL } from '../utils/constants.js'; +import { + decoderForward, + decoder_prepare_inputs_for_generation, + seq2seqForward, + encoder_decoder_prepare_inputs_for_generation, + imageTextToTextForward, + multimodal_text_to_text_prepare_inputs_for_generation, + audioTextToTextForward, + multimodality_prepare_inputs_for_generation, + autoEncoderForward, + chatterbox_prepare_inputs_for_generation, + encoderForward, + getOptionalConfigs, +} from './utils.js'; +import { + LogitsProcessorList, + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + SuppressTokensAtBeginLogitsProcessor, + NoRepeatNGramLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + NoBadWordsLogitsProcessor, + MinLengthLogitsProcessor, + MinNewTokensLengthLogitsProcessor, + TemperatureLogitsWarper, + ClassifierFreeGuidanceLogitsProcessor, +} from '../generation/logits_process.js'; +import { GenerationConfig } from '../generation/configuration_utils.js'; +import { EosTokenCriteria, MaxLengthCriteria, StoppingCriteriaList } from '../generation/stopping_criteria.js'; +import { LogitsSampler } from '../generation/logits_sampler.js'; +import { pick } from '../utils/core.js'; +import { ModelOutput } from './output.js'; + +export const MODEL_TYPES = { + EncoderOnly: 0, + EncoderDecoder: 1, + Seq2Seq: 2, + Vision2Seq: 3, + DecoderOnly: 4, + MaskGeneration: 5, + ImageTextToText: 6, + Musicgen: 7, + MultiModality: 8, + Phi3V: 9, + AudioTextToText: 10, + AutoEncoder: 11, + ImageAudioTextToText: 12, + Supertonic: 13, + Chatterbox: 14, +}; + +const MODEL_TYPE_CONFIG = { + [MODEL_TYPES.DecoderOnly]: { + can_generate: true, + forward: decoderForward, + prepare_inputs: decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Seq2Seq]: { + can_generate: true, + forward: seq2seqForward, + prepare_inputs: encoder_decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Vision2Seq]: { + can_generate: true, + forward: seq2seqForward, + prepare_inputs: encoder_decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Musicgen]: { + can_generate: true, + forward: seq2seqForward, + prepare_inputs: encoder_decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.EncoderDecoder]: { + can_generate: false, + forward: seq2seqForward, + prepare_inputs: null, + }, + [MODEL_TYPES.ImageTextToText]: { + can_generate: true, + forward: imageTextToTextForward, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + [MODEL_TYPES.AudioTextToText]: { + can_generate: true, + forward: audioTextToTextForward, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Phi3V]: { + can_generate: true, + forward: null, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + 
[MODEL_TYPES.ImageAudioTextToText]: { + can_generate: true, + forward: null, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + [MODEL_TYPES.MultiModality]: { + can_generate: true, + forward: null, + prepare_inputs: multimodality_prepare_inputs_for_generation, + }, + [MODEL_TYPES.AutoEncoder]: { + can_generate: false, + forward: autoEncoderForward, + prepare_inputs: null, + }, + [MODEL_TYPES.Chatterbox]: { + can_generate: true, + forward: encoderForward, + prepare_inputs: chatterbox_prepare_inputs_for_generation, + }, + default: { + can_generate: false, + forward: encoderForward, + prepare_inputs: null, + }, +}; + +export const MODEL_TYPE_MAPPING = new Map(); +export const MODEL_NAME_TO_CLASS_MAPPING = new Map(); +export const MODEL_CLASS_TO_NAME_MAPPING = new Map(); + +/** + * A base class for pre-trained models that provides the model configuration and an ONNX session. + */ +export class PreTrainedModel extends Callable { + main_input_name = 'input_ids'; + forward_params = ['input_ids', 'attention_mask']; + + _return_dict_in_generate_keys = null; + /** + * Creates a new instance of the `PreTrainedModel` class. + * @param {import('../configs.js').PretrainedConfig} config The model configuration. + * @param {Record} sessions The inference sessions for the model. + * @param {Record} configs Additional configuration files (e.g., generation_config.json). + */ + constructor(config, sessions, configs) { + super(); + + this.config = config; + this.sessions = sessions; + this.configs = configs; + + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); + const modelType = MODEL_TYPE_MAPPING.get(modelName); + + // Get configuration for this model type + const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default; + + this.can_generate = typeConfig.can_generate; + this._forward = typeConfig.forward; + this._prepare_inputs_for_generation = typeConfig.prepare_inputs; + + if (this.can_generate) { + this.forward_params.push('past_key_values'); + } + + /** @type {import('../configs.js').TransformersJSConfig} */ + this.custom_config = this.config['transformers.js_config'] ?? {}; + } + + /** + * Disposes of all the ONNX sessions that were created during inference. + * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. + * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry + */ + async dispose() { + const promises = []; + for (const session of Object.values(this.sessions)) { + promises.push(session.release?.()); + } + return await Promise.all(promises); + } + + /** + * Instantiate one of the model classes of the library from a pretrained model. + * + * The model class to instantiate is selected based on the `model_type` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. + * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. 
+ * + * @returns {Promise} A new instance of the `PreTrainedModel` class. + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = 'main', + model_file_name = null, + subfolder = 'onnx', + device = null, + dtype = null, + use_external_data_format = null, + session_options = {}, + } = {}, + ) { + let options = { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + subfolder, + device, + dtype, + use_external_data_format, + session_options, + }; + + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this); + const modelType = MODEL_TYPE_MAPPING.get(modelName); + + config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options); + + let info; + if (modelType === MODEL_TYPES.DecoderOnly) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: options.model_file_name ?? 'model', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'encoder_model', + decoder_model_merged: 'decoder_model_merged', + }, + options, + 'decoder_model_merged', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.MaskGeneration) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'vision_encoder', + prompt_encoder_mask_decoder: 'prompt_encoder_mask_decoder', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.EncoderDecoder) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'encoder_model', + decoder_model_merged: 'decoder_model_merged', + }, + options, + 'decoder_model_merged', + ), + ]); + } else if (modelType === MODEL_TYPES.ImageTextToText) { + const sessions = { + embed_tokens: 'embed_tokens', + vision_encoder: 'vision_encoder', + decoder_model_merged: 'decoder_model_merged', + }; + if (config.is_encoder_decoder) { + sessions['model'] = 'encoder_model'; + } + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.AudioTextToText) { + const sessions = { + embed_tokens: 'embed_tokens', + audio_encoder: 'audio_encoder', + decoder_model_merged: 'decoder_model_merged', + }; + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.ImageAudioTextToText) { + const sessions = { + embed_tokens: 'embed_tokens', + audio_encoder: 'audio_encoder', + vision_encoder: 'vision_encoder', + decoder_model_merged: 'decoder_model_merged', + }; + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, sessions, options), + getOptionalConfigs( + pretrained_model_name_or_path, + { + 
generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Musicgen) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'text_encoder', + decoder_model_merged: 'decoder_model_merged', + encodec_decode: 'encodec_decode', + }, + options, + 'decoder_model_merged', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.MultiModality) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + prepare_inputs_embeds: 'prepare_inputs_embeds', + model: 'language_model', + lm_head: 'lm_head', + gen_head: 'gen_head', + gen_img_embeds: 'gen_img_embeds', + image_decode: 'image_decode', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Phi3V) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + prepare_inputs_embeds: 'prepare_inputs_embeds', + model: 'model', + vision_encoder: 'vision_encoder', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Chatterbox) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + embed_tokens: 'embed_tokens', + speech_encoder: 'speech_encoder', + model: 'language_model', + conditional_decoder: 'conditional_decoder', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.AutoEncoder) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + encoder_model: 'encoder_model', + decoder_model: 'decoder_model', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Supertonic) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + text_encoder: 'text_encoder', + latent_denoiser: 'latent_denoiser', + voice_decoder: 'voice_decoder', + }, + options, + ), + ]); + } else { + // should be MODEL_TYPES.EncoderOnly + if (modelType !== MODEL_TYPES.EncoderOnly) { + const type = modelName ?? config?.model_type; + if (type !== 'custom') { + console.warn( + `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`, + ); + } + } + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: options.model_file_name ?? 'model', + }, + options, + ), + ]); + } + + // @ts-ignore + return new this(config, ...info); + } + + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Object containing input tensors + * @returns {Promise} Object containing output tensors + */ + async _call(model_inputs) { + return await this.forward(model_inputs); + } + + /** + * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method + * will be chosen based on the model type. + * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. + * @returns {Promise} The output data from the model in the format specified in the ONNX model. 
+ * @throws {Error} This method must be implemented in subclasses. + */ + async forward(model_inputs) { + return await this._forward(this, model_inputs); + } + + /** + * Get the model's generation config, if it exists. + * @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`. + */ + get generation_config() { + return this.configs?.generation_config ?? null; + } + + /** + * @param {GenerationConfig} generation_config + * @param {number} input_ids_seq_length The starting sequence length for the input ids. + * @returns {LogitsProcessorList} + * @private + */ + _get_logits_processor( + generation_config, + input_ids_seq_length, + // encoder_input_ids, TODO + // prefix_allowed_tokens_fn, TODO + logits_processor = null, + ) { + const processors = new LogitsProcessorList(); + + // if (generation_config.diversity_penalty !== null && generation_config.diversity_penalty > 0.0) { + // processors.push(new HammingDiversityLogitsProcessor( + // generation_config.diversity_penalty, + // generation_config.num_beams, + // generation_config.num_beam_groups + // )); + // } + + // if (generation_config.encoder_repetition_penalty !== null && generation_config.encoder_repetition_penalty !== 1.0) { + // processors.push(new EncoderRepetitionPenaltyLogitsProcessor( + // generation_config.encoder_repetition_penalty, + // encoder_input_ids + // )); + // } + + if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1.0) { + processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty)); + } + + if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) { + processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)); + } + + // if (generation_config.encoder_no_repeat_ngram_size !== null && generation_config.encoder_no_repeat_ngram_size > 0) { + // if (this.config.is_encoder_decoder) { + // processors.push(new EncoderNoRepeatNGramLogitsProcessor( + // generation_config.encoder_no_repeat_ngram_size, + // encoder_input_ids + // )); + // } else { + // throw new Error("It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture"); + // } + // } + + if (generation_config.bad_words_ids !== null) { + processors.push( + new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id), + ); + } + + if ( + generation_config.min_length !== null && + generation_config.eos_token_id !== null && + generation_config.min_length > 0 + ) { + processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)); + } + + if ( + generation_config.min_new_tokens !== null && + generation_config.eos_token_id !== null && + generation_config.min_new_tokens > 0 + ) { + processors.push( + new MinNewTokensLengthLogitsProcessor( + input_ids_seq_length, + generation_config.min_new_tokens, + generation_config.eos_token_id, + ), + ); + } + + // if (prefix_allowed_tokens_fn !== null) { + // processors.push(new PrefixConstrainedLogitsProcessor( + // prefix_allowed_tokens_fn, + // generation_config.num_beams / generation_config.num_beam_groups + // )); + // } + + if (generation_config.forced_bos_token_id !== null) { + processors.push(new ForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id)); + } + + if (generation_config.forced_eos_token_id !== null) { + processors.push( + new ForcedEOSTokenLogitsProcessor(generation_config.max_length, 
generation_config.forced_eos_token_id), + ); + } + + // if (generation_config.remove_invalid_values === true) { + // processors.push(new InfNanRemoveLogitsProcessor()); + // } + + // if (generation_config.exponential_decay_length_penalty !== null) { + // processors.push(new ExponentialDecayLengthPenalty( + // generation_config.exponential_decay_length_penalty, + // generation_config.eos_token_id, + // input_ids_seq_length + // )); + // } + + // if (generation_config.suppress_tokens !== null) { + // processors.push(new SuppressTokensLogitsProcessor(generation_config.suppress_tokens)); + // } + + if (generation_config.begin_suppress_tokens !== null) { + const begin_index = + input_ids_seq_length > 1 || generation_config.forced_bos_token_id === null + ? input_ids_seq_length + : input_ids_seq_length + 1; + + processors.push( + new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index), + ); + } + + // DEPRECATED: https://github.com/huggingface/transformers/pull/29485 + // if (generation_config.forced_decoder_ids !== null) { + // processors.push(new ForceTokensLogitsProcessor(generation_config.forced_decoder_ids)); + // } + + // 8. prepare batched CFG externally + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + processors.push(new ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)); + } + + if (generation_config.temperature === 0 && generation_config.do_sample) { + console.warn( + '`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`.', + ); + generation_config.do_sample = false; + } + + if (generation_config.do_sample) { + if (generation_config.temperature !== null && generation_config.temperature !== 1.0) { + processors.push(new TemperatureLogitsWarper(generation_config.temperature)); + } + // TODO: Add TopPLogitsWarper and TopKLogitsWarper + // if (generation_config.top_k !== null && generation_config.top_k !== 0) { + // processors.push(new TopKLogitsWarper(generation_config.top_k)); + // } + // if (generation_config.top_p !== null && generation_config.top_p < 1.0) { + // processors.push(new TopPLogitsWarper(generation_config.top_p)); + // } + } + + if (logits_processor !== null) { + processors.extend(logits_processor); + } + + // `LogitNormalization` should always be the last logit processor, when present + // if (generation_config.renormalize_logits === true) { + // processors.push(new LogitNormalization()); + // } + + return processors; + } + + /** + * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. + * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. + * @param {GenerationConfig|null} generation_config A `GenerationConfig` object containing generation parameters. + * @param {Object} kwargs Additional generation parameters to be used in place of those in the `generation_config` object. + * @returns {GenerationConfig} The final generation config object to be used by the model for text generation. 
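+ * + * **Example:** Merging precedence (a sketch; this is normally invoked internally by `generate()` rather than called directly). + * + * ```javascript + * // Later sources win: model config defaults < generation_config.json < `generation_config` argument < extra kwargs. + * const cfg = model._prepare_generation_config({ max_new_tokens: 64 }, { temperature: 0.7, do_sample: true }); + * // cfg.max_new_tokens === 64, cfg.temperature === 0.7, cfg.do_sample === true + * ```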
+ */ + _prepare_generation_config(generation_config, kwargs, cls = GenerationConfig) { + // Create empty generation config (contains defaults) + // We pass `this.config` so that if `eos_token_id` or `bos_token_id` exist in the model's config, we will use them + const config = { ...this.config }; + for (const key of ['decoder', 'generator', 'text_config']) { + // Special case: some models have generation attributes set in the decoder. + // Use them if still unset in the generation config. + if (key in config) { + Object.assign(config, config[key]); + } + } + + const gen_config = new cls(config); + + // Apply model's generation config, if it exists + Object.assign(gen_config, this.generation_config ?? {}); + + // Next, use any generation config specified by the user + // when calling `generate` + if (generation_config) { + Object.assign(gen_config, generation_config); + } + + // Finally, if any kwargs were passed, use them to overwrite + if (kwargs) { + Object.assign(gen_config, pick(kwargs, Object.getOwnPropertyNames(gen_config))); + } + + return gen_config; + } + + /** + * + * @param {GenerationConfig} generation_config + * @param {StoppingCriteriaList} [stopping_criteria=null] + */ + _get_stopping_criteria(generation_config, stopping_criteria = null) { + const criteria = new StoppingCriteriaList(); + + if (generation_config.max_length !== null) { + criteria.push( + new MaxLengthCriteria(generation_config.max_length, this.config.max_position_embeddings ?? null), + ); + } + // if (generation_config.max_time !== null) { + // criteria.push(new MaxTimeCriteria(generation_config.max_time)); + // } + if (generation_config.eos_token_id !== null) { + criteria.push(new EosTokenCriteria(generation_config.eos_token_id)); + } + + if (stopping_criteria) { + criteria.extend(stopping_criteria); + } + return criteria; + } + + /** + * Confirms that the model class is compatible with generation. + * If not, raises an exception that points to the right class to use. + */ + _validate_model_class() { + if (!this.can_generate) { + const generate_compatible_mappings = [ + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + // MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, // TODO + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, + ]; + + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); + + const generate_compatible_classes = new Set(); + const modelType = this.config.model_type; + for (const model_mapping of generate_compatible_mappings) { + const supported_models = model_mapping.get(modelType); + if (supported_models) { + generate_compatible_classes.add(supported_models[0]); + } + } + + let errorMessage = `The current model class (${modelName}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`; + if (generate_compatible_classes.size > 0) { + errorMessage += ` Please use the following class instead: ${[...generate_compatible_classes].join(', ')}`; + } + throw Error(errorMessage); + } + } + + prepare_inputs_for_generation(...args) { + return this._prepare_inputs_for_generation(this, ...args); + } + + /** + * + * @param {Object} inputs + * @param {bigint[][]} inputs.generated_input_ids + * @param {Object} inputs.outputs + * @param {Object} inputs.model_inputs + * @param {boolean} inputs.is_encoder_decoder + * @returns {Object} The updated model inputs for the next generation iteration. 
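+ * + * In practice this (1) re-wires the `present*` outputs into `past_key_values` for the next forward pass, (2) feeds the newly sampled token ids back in as a `[batch_size, 1]` `input_ids` tensor, (3) appends one position to the attention mask for decoder-only models, and (4) clears `position_ids` so they are recreated on the next step.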
+ */ + _update_model_kwargs_for_generation({ generated_input_ids, outputs, model_inputs, is_encoder_decoder }) { + // update past_key_values + model_inputs['past_key_values'] = this.getPastKeyValues(outputs, model_inputs.past_key_values); + + // update inputs for next run + model_inputs['input_ids'] = new Tensor('int64', generated_input_ids.flat(), [generated_input_ids.length, 1]); + + if (!is_encoder_decoder) { + // update attention mask + model_inputs.attention_mask = cat( + [model_inputs.attention_mask, ones([model_inputs.attention_mask.dims[0], 1])], + 1, + ); + } else if ('decoder_attention_mask' in model_inputs) { + // TODO: update decoder attention mask if the model requires it + } + + // force recreate position_ids in next iteration + model_inputs['position_ids'] = null; + + return model_inputs; + } + + /** + * This function extracts the model-specific `inputs` for generation. + * @param {Object} params + * @param {Tensor} [params.inputs=null] + * @param {number} [params.bos_token_id=null] + * @param {Record} [params.model_kwargs] + * @returns {{inputs_tensor: Tensor, model_inputs: Record, model_input_name: string}} The model-specific inputs for generation. + */ + _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) { + const model_inputs = pick(model_kwargs, this.forward_params); + const input_name = this.main_input_name; + if (input_name in model_inputs) { + if (inputs) { + throw new Error( + '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' + + 'Make sure to either pass {inputs} or {input_name}=...', + ); + } + } else { + model_inputs[input_name] = inputs; + } + + const inputs_tensor = model_inputs[input_name]; + + return { inputs_tensor, model_inputs, model_input_name: input_name }; + } + + async _prepare_encoder_decoder_kwargs_for_generation({ + inputs_tensor, + model_inputs, + model_input_name, + generation_config, + }) { + if ( + this.sessions['model'].inputNames.includes('inputs_embeds') && + !model_inputs.inputs_embeds && + '_prepare_inputs_embeds' in this + ) { + // Encoder expects `inputs_embeds` instead of `input_ids` + const { input_ids, pixel_values, attention_mask, ...kwargs } = model_inputs; + // @ts-ignore + const prepared_inputs = await this._prepare_inputs_embeds(model_inputs); + model_inputs = { + ...kwargs, + ...pick(prepared_inputs, ['inputs_embeds', 'attention_mask']), + }; + } + let { last_hidden_state } = await encoderForward(this, model_inputs); + + // for classifier free guidance we need to add a 'null' input to our encoder hidden states + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + last_hidden_state = cat([last_hidden_state, full_like(last_hidden_state, 0.0)], 0); + + if ('attention_mask' in model_inputs) { + model_inputs['attention_mask'] = cat( + [model_inputs['attention_mask'], zeros_like(model_inputs['attention_mask'])], + 0, + ); + } + } else if (model_inputs.decoder_input_ids) { + // Ensure that the encoder outputs have the same batch size as the decoder inputs, + // allowing for more efficient batched generation for single inputs + const decoder_input_ids_batch_size = toI64Tensor(model_inputs.decoder_input_ids).dims[0]; + if (decoder_input_ids_batch_size !== last_hidden_state.dims[0]) { + if (last_hidden_state.dims[0] !== 1) { + throw new Error( + `The encoder outputs have a different batch size (${last_hidden_state.dims[0]}) than the decoder inputs (${decoder_input_ids_batch_size}).`, + ); + } + last_hidden_state = cat( + Array.from({ length: 
decoder_input_ids_batch_size }, () => last_hidden_state), + 0, + ); + } + } + model_inputs['encoder_outputs'] = last_hidden_state; + + return model_inputs; + } + + /** + * Prepares `decoder_input_ids` for generation with encoder-decoder models + * @param {*} param0 + */ + _prepare_decoder_input_ids_for_generation({ + batch_size, + model_input_name, + model_kwargs, + decoder_start_token_id, + bos_token_id, + generation_config, + }) { + let { decoder_input_ids, ...model_inputs } = model_kwargs; + + // Prepare input ids if the user has not defined `decoder_input_ids` manually. + if (!(decoder_input_ids instanceof Tensor)) { + if (!decoder_input_ids) { + decoder_start_token_id ??= bos_token_id; + + if (this.config.model_type === 'musicgen') { + // Custom logic (TODO: move to Musicgen class) + decoder_input_ids = Array.from( + { + // @ts-expect-error TS2339 + length: batch_size * this.config.decoder.num_codebooks, + }, + () => [decoder_start_token_id], + ); + } else if (Array.isArray(decoder_start_token_id)) { + if (decoder_start_token_id.length !== batch_size) { + throw new Error( + `\`decoder_start_token_id\` expcted to have length ${batch_size} but got ${decoder_start_token_id.length}`, + ); + } + decoder_input_ids = decoder_start_token_id; + } else { + decoder_input_ids = Array.from( + { + length: batch_size, + }, + () => [decoder_start_token_id], + ); + } + } else if (!Array.isArray(decoder_input_ids[0])) { + // Correct batch size + decoder_input_ids = Array.from( + { + length: batch_size, + }, + () => decoder_input_ids, + ); + } + decoder_input_ids = toI64Tensor(decoder_input_ids); + } + + model_kwargs['decoder_attention_mask'] = ones_like(decoder_input_ids); + + return { input_ids: decoder_input_ids, model_inputs }; + } + + /** + * Generates sequences of token ids for models with a language modeling head. + * @param {import('../generation/parameters.js').GenerationFunctionParameters} options + * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. + */ + async generate({ + inputs = null, + generation_config = null, + logits_processor = null, + stopping_criteria = null, + streamer = null, + + // inputs_attention_mask = null, + ...kwargs + }) { + this._validate_model_class(); + + // Update generation config with defaults and kwargs + generation_config = this._prepare_generation_config(generation_config, kwargs); + + // 3. Define model inputs + let { inputs_tensor, model_inputs, model_input_name } = this._prepare_model_inputs({ + inputs, + model_kwargs: kwargs, + }); + + const is_encoder_decoder = this.config.is_encoder_decoder; + + // 4. Define other model kwargs + if (!is_encoder_decoder) { + // decoder-only models should use left-padding for generation + } else if (!('encoder_outputs' in model_inputs)) { + // if model is encoder decoder encoder_outputs are created + // and added to `model_kwargs` + model_inputs = await this._prepare_encoder_decoder_kwargs_for_generation({ + inputs_tensor, + model_inputs, + model_input_name, + generation_config, + }); + } + + // 5. 
Prepare `input_ids` which will be used for auto-regressive generation + // TODO: Update to align with HF transformers' implementation + let input_ids; + if (is_encoder_decoder) { + // Generating from the encoder outputs + ({ input_ids, model_inputs } = this._prepare_decoder_input_ids_for_generation({ + batch_size: model_inputs[model_input_name].dims.at(0), + model_input_name, + model_kwargs: model_inputs, + decoder_start_token_id: generation_config.decoder_start_token_id, + bos_token_id: generation_config.bos_token_id, + generation_config, + })); + } else { + input_ids = model_inputs[model_input_name]; + } + + // 6. Prepare `max_length` depending on other stopping criteria. + let input_ids_length = input_ids.dims.at(-1); + + if (generation_config.max_new_tokens !== null) { + generation_config.max_length = input_ids_length + generation_config.max_new_tokens; + } + + // input_ids_length = model_inputs[model_input_name].dims.at(1); + // // inputs instanceof Tensor ? : inputs.length; + + // // decoder-only + // if (input_ids_length === 0) { + // throw Error("Must supply a non-empty array of input token ids.") + // } + + // let decoder_input_ids = + // generation_config.decoder_input_ids + // ?? generation_config.decoder_start_token_id + // ?? generation_config.bos_token_id + // ?? generation_config.eos_token_id; + + // Update logits processor + // 8. prepare distribution pre_processing samplers + const prepared_logits_processor = this._get_logits_processor( + generation_config, + input_ids_length, + logits_processor, + ); + + // 9. prepare stopping criteria + const prepared_stopping_criteria = this._get_stopping_criteria(generation_config, stopping_criteria); + + // /** @type {number[]} */ + // let eos_token_ids = generation_config.eos_token_id; + // if (eos_token_ids !== null && !Array.isArray(eos_token_ids)) { + // eos_token_ids = [eos_token_ids]; + // } + + const numInputs = model_inputs[model_input_name].dims.at(0); + + // TODO: + // done is a list of booleans to keep track of which inputs are done + // const done = new Array(numInputs).fill(false); + // For efficiency purposes, we remove completed rows from model_inputs + // when the beam is complete, and we keep track of the row index + // const rowIndexToBatchIndex = new Map(); + + const sampler = LogitsSampler.getSampler(generation_config); + + // TODO make > numInputs + const scores = new Array(numInputs).fill(0); + /** @type {bigint[][]} */ + const all_input_ids = input_ids.tolist(); + if (streamer) { + streamer.put(all_input_ids); + } + // const all_generated_input_ids = Array.from({ length: numInputs }, () => []); + + // NOTE: For now, we don't support spawning new beams + // TODO: when we do, we simply copy past key values and accumulate into single large tensor + + //////////////////////////////////////////////////// + // Generic search which handles 4 generation modes: + // - GenerationMode.GREEDY_SEARCH + // - GenerationMode.SAMPLE + // - GenerationMode.BEAM_SEARCH + // - GenerationMode.BEAM_SAMPLE + //////////////////////////////////////////////////// + let outputs; + let attentions = {}; + let return_dict_items = {}; + while (true) { + // prepare model inputs + model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config); + outputs = await this.forward(model_inputs); + + if (generation_config.return_dict_in_generate) { + if (generation_config.output_attentions) { + // Get attentions if they are present + const token_attentions = this.getAttentions(outputs); + for (const key in 
token_attentions) { + if (!(key in attentions)) { + attentions[key] = []; + } + attentions[key].push(token_attentions[key]); + } + } else if (this._return_dict_in_generate_keys) { + Object.assign(return_dict_items, pick(outputs, this._return_dict_in_generate_keys)); + } + } + + // Logits are of the form [batch_size, out_seq_length, vocab_size] + // In most cases, this will be [batch_size, 1, vocab_size] + // So, we select the last token's logits: + // (equivalent to `logits = outputs.logits[:, -1, :]`) + // The `.to('float32')` is necessary for models with float16 logits, + // and is a no-op for float32 logits. + // TODO: Support float16 sampling in the sampler directly + const logits = outputs.logits.slice(null, -1, null).to('float32'); + + const next_tokens_scores = prepared_logits_processor(all_input_ids, logits); + + /** @type {[bigint][]} */ + const generated_input_ids = []; + // const new_kv_cache = [];// NOTE: Only used for beam search when concatenating new kv + // Loop over each batch + for (let batch_idx = 0; batch_idx < next_tokens_scores.dims.at(0); ++batch_idx) { + const logs = next_tokens_scores[batch_idx]; + + const sampledTokens = await sampler(logs); + for (const [newTokenId, logProb] of sampledTokens) { + const bigint = BigInt(newTokenId); + // TODO: If branching, use previous beam as a starting point + // update generated ids, model inputs, and length for next step + scores[batch_idx] += logProb; + all_input_ids[batch_idx].push(bigint); + generated_input_ids.push([bigint]); + + // TODO: Support beam search + break; + } + } + if (streamer) { + streamer.put(generated_input_ids); + } + + const stop = prepared_stopping_criteria(all_input_ids); + if (stop.every((x) => x)) { + break; + } + + model_inputs = this._update_model_kwargs_for_generation({ + generated_input_ids, + outputs, + model_inputs, + is_encoder_decoder, + }); + } + + if (streamer) { + streamer.end(); + } + + // Retrieve and dispose all final past key values (including encoder attentions) + const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true); + + // TODO: ensure all_input_ids is padded correctly... + const sequences = new Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]); + + if (generation_config.return_dict_in_generate) { + return { + sequences, + past_key_values, + ...attentions, + ...return_dict_items, + // TODO: + // scores, + // logits, + }; + } else { + // Dispose all remaining tensors + for (const tensor of Object.values(outputs)) { + if (tensor.location === 'gpu-buffer') { + tensor.dispose(); + } + } + return sequences; + } + } + + /** + * Returns an object containing past key values from the given decoder results object. + * + * @param {Object} decoderResults The decoder results object. + * @param {Object} pastKeyValues The previous past key values. + * @returns {Object} An object containing past key values. + */ + getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) { + const pkvs = Object.create(null); + + for (const name in decoderResults) { + if (name.startsWith('present')) { + const newName = name + // Hybrid cache architecture + .replace('present_ssm', 'past_ssm') // Mamba + .replace('present_conv', 'past_conv') // LFM2 + + // Standard cache architecture + .replace('present', 'past_key_values'); + const is_encoder_pkv = name.includes('encoder'); + if (is_encoder_pkv && pastKeyValues) { + // Optimization introduced by optimum to reuse past key values. 
+ // So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values. + // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704 + pkvs[newName] = pastKeyValues[newName]; + } else { + // decoder or using first encoder PKVs + pkvs[newName] = decoderResults[name]; + } + + if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) { + // - Always dispose decoder PKVs + // - Only dispose encoder past key values when requested (after generation) + const t = pastKeyValues[newName]; + if (t.location === 'gpu-buffer') { + t.dispose(); + } + } + } + } + return pkvs; + } + + /** + * Returns an object containing attentions from the given model output object. + * + * @param {Object} model_output The output of the model. + * @returns {{cross_attentions?: Tensor[]}} An object containing attentions. + */ + getAttentions(model_output) { + const attentions = {}; + + for (const attnName of ['cross_attentions', 'encoder_attentions', 'decoder_attentions']) { + for (const name in model_output) { + if (name.startsWith(attnName)) { + if (!(attnName in attentions)) { + attentions[attnName] = []; + } + attentions[attnName].push(model_output[name]); + } + } + } + return attentions; + } + + /** + * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. + * + * @param {Object} decoderFeeds The decoder feeds object to add past key values to. + * @param {Object} pastKeyValues An object containing past key values. + */ + addPastKeyValues(decoderFeeds, pastKeyValues) { + if (pastKeyValues) { + Object.assign(decoderFeeds, pastKeyValues); + } else { + const session = this.sessions['decoder_model_merged'] ?? this.sessions['model']; + const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1; + + const dtype = session?.config?.kv_cache_dtype ?? 'float32'; + const cls = dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32; + const shapes = getCacheShapes(this.config, { batch_size }); + for (const name in shapes) { + const size = shapes[name].reduce((a, b) => a * b, 1); + decoderFeeds[name] = new Tensor(dtype, new cls(size), shapes[name]); + } + } + } + + async encode_image({ pixel_values }) { + // image_inputs === { pixel_values } + return (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features; + } + + async encode_text({ input_ids }) { + // text_inputs === { input_ids, attention_mask } + return (await sessionRun(this.sessions['embed_tokens'], { input_ids })).inputs_embeds; + } + + async encode_audio({ audio_values }) { + // audio_inputs === { audio_values } + return (await sessionRun(this.sessions['audio_encoder'], { audio_values })).audio_features; + } +} diff --git a/src/models/pre-trained-models/albert-pre-trained-model.js b/src/models/pre-trained-models/albert-pre-trained-model.js new file mode 100644 index 000000000..88949955d --- /dev/null +++ b/src/models/pre-trained-models/albert-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput } from '../output.js'; + +export class AlbertPreTrainedModel extends PreTrainedModel {} +export class AlbertModel extends AlbertPreTrainedModel {} +export class AlbertForSequenceClassification extends AlbertPreTrainedModel { + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class AlbertForQuestionAnswering extends AlbertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} +export class AlbertForMaskedLM extends AlbertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/apertus-pre-trained-model.js b/src/models/pre-trained-models/apertus-pre-trained-model.js new file mode 100644 index 000000000..ba34da1ce --- /dev/null +++ b/src/models/pre-trained-models/apertus-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ApertusPreTrainedModel extends PreTrainedModel {} +export class ApertusModel extends ApertusPreTrainedModel {} +export class ApertusForCausalLM extends ApertusPreTrainedModel {} diff --git a/src/models/pre-trained-models/arcee-pre-trained-model.js b/src/models/pre-trained-models/arcee-pre-trained-model.js new file mode 100644 index 000000000..77aebca34 --- /dev/null +++ b/src/models/pre-trained-models/arcee-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ArceePreTrainedModel extends PreTrainedModel {} +export class ArceeModel extends ArceePreTrainedModel {} +export class ArceeForCausalLM extends ArceePreTrainedModel {} diff --git a/src/models/pre-trained-models/ast-pre-trained-model.js b/src/models/pre-trained-models/ast-pre-trained-model.js new file mode 100644 index 000000000..5b3344da0 --- /dev/null +++ b/src/models/pre-trained-models/ast-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ASTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare AST Model transformer outputting raw hidden-states without any specific head on top. + */ +export class ASTModel extends ASTPreTrainedModel {} + +/** + * Audio Spectrogram Transformer model with an audio classification head on top + * (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2. + */ +export class ASTForAudioClassification extends ASTPreTrainedModel {} diff --git a/src/models/pre-trained-models/bart-pretrained-model.js b/src/models/pre-trained-models/bart-pretrained-model.js new file mode 100644 index 000000000..99c624f9d --- /dev/null +++ b/src/models/pre-trained-models/bart-pretrained-model.js @@ -0,0 +1,29 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class BartPretrainedModel extends PreTrainedModel {} + +/** + * The bare BART Model outputting raw hidden-states without any specific head on top. + */ +export class BartModel extends BartPretrainedModel {} + +/** + * The BART Model with a language modeling head. Can be used for summarization. 
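+ * + * **Example:** Summarization with `BartForConditionalGeneration` (a sketch; the model id is illustrative, and any compatible Seq2Seq checkpoint converted for Transformers.js should work the same way). + * + * ```javascript + * import { AutoTokenizer, BartForConditionalGeneration } from '@huggingface/transformers'; + * + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/distilbart-cnn-6-6'); + * const model = await BartForConditionalGeneration.from_pretrained('Xenova/distilbart-cnn-6-6'); + * + * const inputs = tokenizer('The tower is 324 metres tall, about the same height as an 81-storey building.'); + * const outputs = await model.generate({ ...inputs, max_new_tokens: 40 }); + * const [summary] = tokenizer.batch_decode(outputs, { skip_special_tokens: true }); + * ```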
+ */ +export class BartForConditionalGeneration extends BartPretrainedModel {} + +/** + * Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) + */ +export class BartForSequenceClassification extends BartPretrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/beit-pre-trained-model.js b/src/models/pre-trained-models/beit-pre-trained-model.js new file mode 100644 index 000000000..792683aa5 --- /dev/null +++ b/src/models/pre-trained-models/beit-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class BeitPreTrainedModel extends PreTrainedModel {} +export class BeitModel extends BeitPreTrainedModel {} +export class BeitForImageClassification extends BeitPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/bert-pre-trained-model.js b/src/models/pre-trained-models/bert-pre-trained-model.js new file mode 100644 index 000000000..ce49f5988 --- /dev/null +++ b/src/models/pre-trained-models/bert-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + SequenceClassifierOutput, + MaskedLMOutput, + TokenClassifierOutput, + QuestionAnsweringModelOutput, +} from '../output.js'; + +export class BertPreTrainedModel extends PreTrainedModel {} +export class BertModel extends BertPreTrainedModel {} + +/** + * BertForMaskedLM is a class representing a BERT model for masked language modeling. + */ +export class BertForMaskedLM extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * BertForSequenceClassification is a class representing a BERT model for sequence classification. + */ +export class BertForSequenceClassification extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * BertForTokenClassification is a class representing a BERT model for token classification. + */ +export class BertForTokenClassification extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * BertForQuestionAnswering is a class representing a BERT model for question answering. 
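+ * + * **Example:** Extractive question answering (a sketch; the model id below is hypothetical, so substitute a BERT checkpoint fine-tuned for QA and converted for Transformers.js). + * + * ```javascript + * import { AutoTokenizer, BertForQuestionAnswering } from '@huggingface/transformers'; + * + * const model_id = 'your-org/bert-finetuned-squad'; // hypothetical checkpoint + * const tokenizer = await AutoTokenizer.from_pretrained(model_id); + * const model = await BertForQuestionAnswering.from_pretrained(model_id); + * + * const inputs = tokenizer('Who wrote Hamlet?', { text_pair: 'Hamlet was written by William Shakespeare.' }); + * const { start_logits, end_logits } = await model(inputs); + * ```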
+ */ +export class BertForQuestionAnswering extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/blenderbot-pre-trained-model.js b/src/models/pre-trained-models/blenderbot-pre-trained-model.js new file mode 100644 index 000000000..49f6cb03a --- /dev/null +++ b/src/models/pre-trained-models/blenderbot-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class BlenderbotPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Blenderbot Model outputting raw hidden-states without any specific head on top. + */ +export class BlenderbotModel extends BlenderbotPreTrainedModel {} + +/** + * The Blenderbot Model with a language modeling head. Can be used for summarization. + */ +export class BlenderbotForConditionalGeneration extends BlenderbotPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/blenderbot-small-pre-trained-model.js b/src/models/pre-trained-models/blenderbot-small-pre-trained-model.js new file mode 100644 index 000000000..5c7e940d3 --- /dev/null +++ b/src/models/pre-trained-models/blenderbot-small-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class BlenderbotSmallPreTrainedModel extends PreTrainedModel {} + +/** + * The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top. + */ +export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel {} + +/** + * The BlenderbotSmall Model with a language modeling head. Can be used for summarization. + */ +export class BlenderbotSmallForConditionalGeneration extends BlenderbotSmallPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/bloom-pre-trained-model.js b/src/models/pre-trained-models/bloom-pre-trained-model.js new file mode 100644 index 000000000..e313a507c --- /dev/null +++ b/src/models/pre-trained-models/bloom-pre-trained-model.js @@ -0,0 +1,16 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). + */ +export class BloomPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Bloom Model transformer outputting raw hidden-states without any specific head on top. + */ +export class BloomModel extends BloomPreTrainedModel {} + +/** + * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). 
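+ * + * **Example:** Text generation with `BloomForCausalLM` (a sketch; the model id is illustrative). + * + * ```javascript + * import { AutoTokenizer, BloomForCausalLM } from '@huggingface/transformers'; + * + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bloom-560m'); + * const model = await BloomForCausalLM.from_pretrained('Xenova/bloom-560m', { dtype: 'q8' }); + * + * const inputs = tokenizer('Once upon a time,'); + * const outputs = await model.generate({ ...inputs, max_new_tokens: 30, do_sample: true, temperature: 0.8 }); + * const [text] = tokenizer.batch_decode(outputs, { skip_special_tokens: true }); + * ```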
+ */ +export class BloomForCausalLM extends BloomPreTrainedModel {} diff --git a/src/models/pre-trained-models/camembert-pre-trained-model.js b/src/models/pre-trained-models/camembert-pre-trained-model.js new file mode 100644 index 000000000..2a944d700 --- /dev/null +++ b/src/models/pre-trained-models/camembert-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class CamembertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top. + */ +export class CamembertModel extends CamembertPreTrainedModel {} + +/** + * CamemBERT Model with a `language modeling` head on top. + */ +export class CamembertForMaskedLM extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. + */ +export class CamembertForSequenceClassification extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class CamembertForTokenClassification extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * CamemBERT Model with a span classification head on top for extractive question-answering tasks + */ +export class CamembertForQuestionAnswering extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. 
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/chatterbox-pre-trained-model.js b/src/models/pre-trained-models/chatterbox-pre-trained-model.js new file mode 100644 index 000000000..48af278ac --- /dev/null +++ b/src/models/pre-trained-models/chatterbox-pre-trained-model.js @@ -0,0 +1,153 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { decoderForward } from '../utils.js'; +import { cat, ones, full, Tensor } from '../../utils/tensor.js'; + +export class ChatterboxPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'position_ids', + 'audio_values', + 'exaggeration', + 'audio_features', + 'audio_tokens', + 'speaker_embeddings', + 'speaker_features', + 'past_key_values', + ]; + main_input_name = 'input_ids'; + + _return_dict_in_generate_keys = ['audio_tokens', 'speaker_embeddings', 'speaker_features']; +} +export class ChatterboxModel extends ChatterboxPreTrainedModel { + /** + * @param {Tensor} audio_values + * @returns {Promise<{audio_features: Tensor, audio_tokens: Tensor, speaker_embeddings: Tensor, speaker_features: Tensor}>} + */ + async encode_speech(audio_values) { + return sessionRun(this.sessions['speech_encoder'], { + audio_values, + }); + } + + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + audio_values = null, + exaggeration = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // Speaker embeddings/features (useful for re-using pre-computed speaker data) + audio_features = null, // float32[batch_size,sequence_length,1024] + audio_tokens = null, // int64[batch_size,audio_sequence_length] + speaker_embeddings = null, // float32[batch_size,192] + speaker_features = null, // float32[batch_size,feature_dim,80] + + // TODO: needed? + ...kwargs + }) { + let speech_encoder_outputs; + if (!inputs_embeds) { + const expected_inputs = this.sessions['embed_tokens'].inputNames; + const embed_model_inputs = { input_ids }; + if (expected_inputs.includes('exaggeration')) { + // Support the following types for exaggeration: + // 1. null/undefined (no exaggeration): use the default of 0.5 + // 2. number: broadcast to (batch_size,) + // 3. number[]: convert to Tensor of shape (batch_size,) + // 4. 
Tensor of shape (batch_size, 1) + if (!(exaggeration instanceof Tensor)) { + const batch_size = input_ids.dims[0]; + if (exaggeration == null) { + exaggeration = full([batch_size], 0.5); + } else if (typeof exaggeration === 'number') { + exaggeration = full([batch_size], exaggeration); + } else if (Array.isArray(exaggeration)) { + exaggeration = new Tensor('float32', exaggeration, [batch_size]); + } else { + throw new Error('Unsupported type for `exaggeration` input'); + } + } + embed_model_inputs.exaggeration = exaggeration; + } + if (expected_inputs.includes('position_ids')) { + embed_model_inputs.position_ids = position_ids; + } + + ({ inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], embed_model_inputs)); + + if (audio_features && audio_tokens && speaker_embeddings && speaker_features) { + // Use pre-computed speech encoder outputs + speech_encoder_outputs = { audio_features, audio_tokens, speaker_embeddings, speaker_features }; + } + + if (speech_encoder_outputs || audio_values) { + speech_encoder_outputs ??= await this.encode_speech(audio_values); + + // Update LLM inputs + inputs_embeds = cat([speech_encoder_outputs.audio_features, inputs_embeds], 1); + attention_mask = ones([inputs_embeds.dims[0], inputs_embeds.dims[1]]); + } else { + const target_length = inputs_embeds.dims[1]; + if (!past_key_values || target_length !== 1) { + throw new Error('Incorrect state encountered during generation.'); + } + const past_length = Object.values(past_key_values)[0].dims.at(-2); + attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]); + } + } + + const outputs = await decoderForward( + this, + { + inputs_embeds, + past_key_values, + attention_mask, + generation_config, + logits_processor, + }, + false, + ); + return { + ...outputs, + ...speech_encoder_outputs, + }; + } + + /** @type {PreTrainedModel['generate']} */ + async generate(params) { + const { sequences, audio_tokens, speaker_embeddings, speaker_features } = /** @type {any} */ ( + await super.generate({ + ...params, + return_dict_in_generate: true, + }) + ); + + const new_tokens = sequences.slice(null, [ + params.input_ids.dims[1], // Exclude start of speech token + -1, // Exclude end of speech token + ]); + + const SILENCE_TOKEN = 4299n; + const silence_tokens = full([new_tokens.dims[0], 3], SILENCE_TOKEN); // Add 3 silence tokens + const speech_tokens = cat([audio_tokens, new_tokens, silence_tokens], 1); + + const { waveform } = await sessionRun(this.sessions['conditional_decoder'], { + speech_tokens, + speaker_features, + speaker_embeddings, + }); + return waveform; + } +} diff --git a/src/models/pre-trained-models/chinese-clip-pre-trained-model.js b/src/models/pre-trained-models/chinese-clip-pre-trained-model.js new file mode 100644 index 000000000..bbb2047eb --- /dev/null +++ b/src/models/pre-trained-models/chinese-clip-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ChineseCLIPPreTrainedModel extends PreTrainedModel {} + +export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/clap-pre-trained-model.js b/src/models/pre-trained-models/clap-pre-trained-model.js new file mode 100644 index 000000000..debadd605 --- /dev/null +++ b/src/models/pre-trained-models/clap-pre-trained-model.js @@ -0,0 +1,79 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ClapPreTrainedModel extends PreTrainedModel {} + +export class ClapModel 
extends ClapPreTrainedModel {} + +/** + * CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). + * + * **Example:** Compute text embeddings with `ClapTextModelWithProjection`. + * + * ```javascript + * import { AutoTokenizer, ClapTextModelWithProjection } from '@huggingface/transformers'; + * + * // Load tokenizer and text model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused'); + * const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); + * + * // Run tokenization + * const texts = ['a sound of a cat', 'a sound of a dog']; + * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Compute embeddings + * const { text_embeds } = await text_model(text_inputs); + * // Tensor { + * // dims: [ 2, 512 ], + * // type: 'float32', + * // data: Float32Array(1024) [ ... ], + * // size: 1024 + * // } + * ``` + */ +export class ClapTextModelWithProjection extends ClapPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). + * + * **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`. + * + * ```javascript + * import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@huggingface/transformers'; + * + * // Load processor and audio model + * const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused'); + * const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); + * + * // Read audio and run processor + * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav'); + * const audio_inputs = await processor(audio); + * + * // Compute embeddings + * const { audio_embeds } = await audio_model(audio_inputs); + * // Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [ ... ], + * // size: 512 + * // } + * ``` + */ +export class ClapAudioModelWithProjection extends ClapPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'audio_model', + }); + } +} diff --git a/src/models/pre-trained-models/clip-pre-trained-model.js b/src/models/pre-trained-models/clip-pre-trained-model.js new file mode 100644 index 000000000..34376da40 --- /dev/null +++ b/src/models/pre-trained-models/clip-pre-trained-model.js @@ -0,0 +1,150 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class CLIPPreTrainedModel extends PreTrainedModel {} + +/** + * CLIP Text and Vision Model with a projection layers on top + * + * **Example:** Perform zero-shot image classification with a `CLIPModel`. 
+ * + * ```javascript + * import { AutoTokenizer, AutoProcessor, CLIPModel, RawImage } from '@huggingface/transformers'; + * + * // Load tokenizer, processor, and model + * let tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); + * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); + * let model = await CLIPModel.from_pretrained('Xenova/clip-vit-base-patch16'); + * + * // Run tokenization + * let texts = ['a photo of a car', 'a photo of a football match'] + * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Read image and run processor + * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * let image_inputs = await processor(image); + * + * // Run model with both text and pixel inputs + * let output = await model({ ...text_inputs, ...image_inputs }); + * // { + * // logits_per_image: Tensor { + * // dims: [ 1, 2 ], + * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], + * // }, + * // logits_per_text: Tensor { + * // dims: [ 2, 1 ], + * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], + * // }, + * // text_embeds: Tensor { + * // dims: [ 2, 512 ], + * // data: Float32Array(1024) [ ... ], + * // }, + * // image_embeds: Tensor { + * // dims: [ 1, 512 ], + * // data: Float32Array(512) [ ... ], + * // } + * // } + * ``` + */ +export class CLIPModel extends CLIPPreTrainedModel {} + +/** + * The text model from CLIP without any head or projection on top. + */ +export class CLIPTextModel extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output) + * + * **Example:** Compute text embeddings with `CLIPTextModelWithProjection`. + * + * ```javascript + * import { AutoTokenizer, CLIPTextModelWithProjection } from '@huggingface/transformers'; + * + * // Load tokenizer and text model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); + * const text_model = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); + * + * // Run tokenization + * let texts = ['a photo of a car', 'a photo of a football match']; + * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Compute embeddings + * const { text_embeds } = await text_model(text_inputs); + * // Tensor { + * // dims: [ 2, 512 ], + * // type: 'float32', + * // data: Float32Array(1024) [ ... ], + * // size: 1024 + * // } + * ``` + */ +export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * The vision model from CLIP without any head or projection on top. 
+ */ +export class CLIPVisionModel extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'vision_model', + }); + } +} + +/** + * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output) + * + * **Example:** Compute vision embeddings with `CLIPVisionModelWithProjection`. + * + * ```javascript + * import { AutoProcessor, CLIPVisionModelWithProjection, RawImage} from '@huggingface/transformers'; + * + * // Load processor and vision model + * const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); + * const vision_model = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); + * + * // Read image and run processor + * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * let image_inputs = await processor(image); + * + * // Compute embeddings + * const { image_embeds } = await vision_model(image_inputs); + * // Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [ ... ], + * // size: 512 + * // } + * ``` + */ +export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'vision_model', + }); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/clip-seg-pre-trained-model.js b/src/models/pre-trained-models/clip-seg-pre-trained-model.js new file mode 100644 index 000000000..386a7a1fe --- /dev/null +++ b/src/models/pre-trained-models/clip-seg-pre-trained-model.js @@ -0,0 +1,53 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class CLIPSegPreTrainedModel extends PreTrainedModel {} + +export class CLIPSegModel extends CLIPSegPreTrainedModel {} + +/** + * CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation. + * + * **Example:** Perform zero-shot image segmentation with a `CLIPSegForImageSegmentation` model. 
+ * + * ```javascript + * import { AutoTokenizer, AutoProcessor, CLIPSegForImageSegmentation, RawImage } from '@huggingface/transformers'; + * + * // Load tokenizer, processor, and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clipseg-rd64-refined'); + * const processor = await AutoProcessor.from_pretrained('Xenova/clipseg-rd64-refined'); + * const model = await CLIPSegForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined'); + * + * // Run tokenization + * const texts = ['a glass', 'something to fill', 'wood', 'a jar']; + * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Read image and run processor + * const image = await RawImage.read('https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true'); + * const image_inputs = await processor(image); + * + * // Run model with both text and pixel inputs + * const { logits } = await model({ ...text_inputs, ...image_inputs }); + * // logits: Tensor { + * // dims: [4, 352, 352], + * // type: 'float32', + * // data: Float32Array(495616) [ ... ], + * // size: 495616 + * // } + * ``` + * + * You can visualize the predictions as follows: + * ```javascript + * const preds = logits + * .unsqueeze_(1) + * .sigmoid_() + * .mul_(255) + * .round_() + * .to('uint8'); + * + * for (let i = 0; i < preds.dims[0]; ++i) { + * const img = RawImage.fromTensor(preds[i]); + * img.save(`prediction_${i}.png`); + * } + * ``` + */ +export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel {} diff --git a/src/models/pre-trained-models/code-gen-pre-trained-model.js b/src/models/pre-trained-models/code-gen-pre-trained-model.js new file mode 100644 index 000000000..3ea4a6f5b --- /dev/null +++ b/src/models/pre-trained-models/code-gen-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class CodeGenPreTrainedModel extends PreTrainedModel {} + +/** + * CodeGenModel is a class representing a code generation model without a language model head. + */ +export class CodeGenModel extends CodeGenPreTrainedModel {} + +/** + * CodeGenForCausalLM is a class that represents a code generation model based on the GPT-2 architecture. It extends the `CodeGenPreTrainedModel` class. + */ +export class CodeGenForCausalLM extends CodeGenPreTrainedModel {} diff --git a/src/models/pre-trained-models/cohere-pre-trained-model.js b/src/models/pre-trained-models/cohere-pre-trained-model.js new file mode 100644 index 000000000..b3b1ccb89 --- /dev/null +++ b/src/models/pre-trained-models/cohere-pre-trained-model.js @@ -0,0 +1,9 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Cohere Model outputting raw hidden-states without any specific head on top. 
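+ *
+ * **Example:** Text generation with `CohereForCausalLM` via the `pipeline` API (a minimal sketch; the model id below is a placeholder and assumes an ONNX-converted Cohere checkpoint is available).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Placeholder model id (assumption): substitute any ONNX-converted Cohere checkpoint
+ * const generator = await pipeline('text-generation', 'your-org/cohere-checkpoint-ONNX');
+ *
+ * const output = await generator('The capital of France is', { max_new_tokens: 20 });
+ * // [{ generated_text: 'The capital of France is ...' }]
+ * ```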
+ */ +export class CoherePreTrainedModel extends PreTrainedModel {} +export class CohereModel extends CoherePreTrainedModel {} + +export class CohereForCausalLM extends CoherePreTrainedModel {} diff --git a/src/models/pre-trained-models/conv-bert-pre-trained-model.js b/src/models/pre-trained-models/conv-bert-pre-trained-model.js new file mode 100644 index 000000000..9d049d790 --- /dev/null +++ b/src/models/pre-trained-models/conv-bert-pre-trained-model.js @@ -0,0 +1,76 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class ConvBertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top. + */ +export class ConvBertModel extends ConvBertPreTrainedModel {} + +/** + * ConvBERT Model with a language modeling head on top. + */ +export class ConvBertForMaskedLM extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class ConvBertForSequenceClassification extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) + * e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class ConvBertForTokenClassification extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD + * (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`) + */ +export class ConvBertForQuestionAnswering extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering.
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/conv-next-pre-trained-model.js b/src/models/pre-trained-models/conv-next-pre-trained-model.js new file mode 100644 index 000000000..d8e24875a --- /dev/null +++ b/src/models/pre-trained-models/conv-next-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ConvNextPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ConvNext model outputting raw features without any specific head on top. + */ +export class ConvNextModel extends ConvNextPreTrainedModel {} + +/** + * ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. + */ +export class ConvNextForImageClassification extends ConvNextPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/conv-next-v2-pre-trained-model.js b/src/models/pre-trained-models/conv-next-v2-pre-trained-model.js new file mode 100644 index 000000000..1dbb28859 --- /dev/null +++ b/src/models/pre-trained-models/conv-next-v2-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ConvNextV2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare ConvNextV2 model outputting raw features without any specific head on top. + */ +export class ConvNextV2Model extends ConvNextV2PreTrainedModel {} + +/** + * ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. 
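+ *
+ * **Example:** Image classification with `ConvNextV2ForImageClassification` (a sketch following the processor/model pattern used elsewhere in this file set; `Xenova/convnextv2-tiny-1k-224` is assumed to be an available ONNX conversion).
+ * ```javascript
+ * import { AutoProcessor, ConvNextV2ForImageClassification, RawImage } from '@huggingface/transformers';
+ *
+ * // Load processor and model (assumed checkpoint)
+ * const processor = await AutoProcessor.from_pretrained('Xenova/convnextv2-tiny-1k-224');
+ * const model = await ConvNextV2ForImageClassification.from_pretrained('Xenova/convnextv2-tiny-1k-224');
+ *
+ * // Read image, preprocess, and classify
+ * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
+ * const inputs = await processor(image);
+ * const { logits } = await model(inputs);
+ * // Tensor of shape [ 1, num_labels ]; take the argmax for the predicted class id
+ * ```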
+ */ +export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/d-fine-pre-trained-model.js b/src/models/pre-trained-models/d-fine-pre-trained-model.js new file mode 100644 index 000000000..73c601435 --- /dev/null +++ b/src/models/pre-trained-models/d-fine-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { RTDetrObjectDetectionOutput } from './rt-detr-pre-trained-model.js'; + +export class DFinePreTrainedModel extends PreTrainedModel {} +export class DFineModel extends DFinePreTrainedModel {} +export class DFineForObjectDetection extends DFinePreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/dac-pre-trained-model.js b/src/models/pre-trained-models/dac-pre-trained-model.js new file mode 100644 index 000000000..3b940752c --- /dev/null +++ b/src/models/pre-trained-models/dac-pre-trained-model.js @@ -0,0 +1,54 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { DacEncoderOutput, DacDecoderOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class DacPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_values'; + forward_params = ['input_values']; +} + +/** + * The DAC (Descript Audio Codec) model. + */ +export class DacModel extends DacPreTrainedModel { + /** + * Encodes the input audio waveform into discrete codes. + * @param {Object} inputs Model inputs + * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). + * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. + */ + async encode(inputs) { + return new DacEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); + } + + /** + * Decodes the given frames into an output audio waveform. + * @param {DacEncoderOutput} inputs The encoded audio codes. + * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. + */ + async decode(inputs) { + return new DacDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); + } +} + +export class DacEncoderModel extends DacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'encoder_model', + }); + } +} +export class DacDecoderModel extends DacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 
'decoder_model', + }); + } +} diff --git a/src/models/pre-trained-models/deberta-pre-trained-model.js b/src/models/pre-trained-models/deberta-pre-trained-model.js new file mode 100644 index 000000000..5542efd27 --- /dev/null +++ b/src/models/pre-trained-models/deberta-pre-trained-model.js @@ -0,0 +1,75 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class DebertaPreTrainedModel extends PreTrainedModel {} + +/** + * The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top. + */ +export class DebertaModel extends DebertaPreTrainedModel {} + +/** + * DeBERTa Model with a `language modeling` head on top. + */ +export class DebertaForMaskedLM extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class DebertaForSequenceClassification extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class DebertaForTokenClassification extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + * layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class DebertaForQuestionAnswering extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering.
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/deberta-v2-pre-trained-model.js b/src/models/pre-trained-models/deberta-v2-pre-trained-model.js new file mode 100644 index 000000000..ec4ef080e --- /dev/null +++ b/src/models/pre-trained-models/deberta-v2-pre-trained-model.js @@ -0,0 +1,75 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class DebertaV2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare DeBERTa-V2 Model transformer outputting raw hidden-states without any specific head on top. + */ +export class DebertaV2Model extends DebertaV2PreTrainedModel {} + +/** + * DeBERTa-V2 Model with a `language modeling` head on top. + */ +export class DebertaV2ForMaskedLM extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa-V2 Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class DebertaV2ForSequenceClassification extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa-V2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class DebertaV2ForTokenClassification extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa-V2 Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + * layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class DebertaV2ForQuestionAnswering extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering.
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/decision-transformer-pre-trained-model.js b/src/models/pre-trained-models/decision-transformer-pre-trained-model.js new file mode 100644 index 000000000..fb1dbc6be --- /dev/null +++ b/src/models/pre-trained-models/decision-transformer-pre-trained-model.js @@ -0,0 +1,9 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DecisionTransformerPreTrainedModel extends PreTrainedModel {} + +/** + * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting. + * Refer to the paper for more details: https://huggingface.co/papers/2106.01345 + */ +export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel {} diff --git a/src/models/pre-trained-models/dei-t-pre-trained-model.js b/src/models/pre-trained-models/dei-t-pre-trained-model.js new file mode 100644 index 000000000..46e0f295f --- /dev/null +++ b/src/models/pre-trained-models/dei-t-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class DeiTPreTrainedModel extends PreTrainedModel {} +export class DeiTModel extends DeiTPreTrainedModel {} +export class DeiTForImageClassification extends DeiTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/depth-anything-pre-trained-model.js b/src/models/pre-trained-models/depth-anything-pre-trained-model.js new file mode 100644 index 000000000..0b0124f72 --- /dev/null +++ b/src/models/pre-trained-models/depth-anything-pre-trained-model.js @@ -0,0 +1,8 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DepthAnythingPreTrainedModel extends PreTrainedModel {} + +/** + * Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. 
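+ *
+ * **Example:** Depth estimation with the `depth-estimation` pipeline (a sketch; `Xenova/depth-anything-small-hf` is assumed to be an available ONNX conversion, and the exact output fields may differ).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Create a depth-estimation pipeline (assumed checkpoint)
+ * const depth_estimator = await pipeline('depth-estimation', 'Xenova/depth-anything-small-hf');
+ *
+ * // Predict the depth map for an image
+ * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
+ * const { predicted_depth, depth } = await depth_estimator(url);
+ * // `predicted_depth` is the raw Tensor; `depth` is a RawImage visualization
+ * ```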
+ */ +export class DepthAnythingForDepthEstimation extends DepthAnythingPreTrainedModel {} diff --git a/src/models/pre-trained-models/depth-pro-pre-trained-model.js b/src/models/pre-trained-models/depth-pro-pre-trained-model.js new file mode 100644 index 000000000..1f19c48bf --- /dev/null +++ b/src/models/pre-trained-models/depth-pro-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DepthProPreTrainedModel extends PreTrainedModel {} +export class DepthProForDepthEstimation extends DepthProPreTrainedModel {} diff --git a/src/models/pre-trained-models/detr-pre-trained-model.js b/src/models/pre-trained-models/detr-pre-trained-model.js new file mode 100644 index 000000000..e4797f769 --- /dev/null +++ b/src/models/pre-trained-models/detr-pre-trained-model.js @@ -0,0 +1,54 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ModelOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class DetrPreTrainedModel extends PreTrainedModel {} +export class DetrModel extends DetrPreTrainedModel {} +export class DetrForObjectDetection extends DetrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new DetrObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class DetrForSegmentation extends DetrPreTrainedModel { + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + async _call(model_inputs) { + return new DetrSegmentationOutput(await super._call(model_inputs)); + } +} + +export class DetrObjectDetectionOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). + */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } +} + +export class DetrSegmentationOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.pred_boxes Predicted boxes. + * @param {Tensor} output.pred_masks Predicted masks. 
+ */ + constructor({ logits, pred_boxes, pred_masks }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + this.pred_masks = pred_masks; + } +} diff --git a/src/models/pre-trained-models/din-ov3-conv-next-pre-trained-model.js b/src/models/pre-trained-models/din-ov3-conv-next-pre-trained-model.js new file mode 100644 index 000000000..793b84ab8 --- /dev/null +++ b/src/models/pre-trained-models/din-ov3-conv-next-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DINOv3ConvNextPreTrainedModel extends PreTrainedModel {} +export class DINOv3ConvNextModel extends DINOv3ConvNextPreTrainedModel {} diff --git a/src/models/pre-trained-models/din-ov3-vi-t-pre-trained-model.js b/src/models/pre-trained-models/din-ov3-vi-t-pre-trained-model.js new file mode 100644 index 000000000..c5839fe56 --- /dev/null +++ b/src/models/pre-trained-models/din-ov3-vi-t-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DINOv3ViTPreTrainedModel extends PreTrainedModel {} +export class DINOv3ViTModel extends DINOv3ViTPreTrainedModel {} diff --git a/src/models/pre-trained-models/dinov2-pre-trained-model.js b/src/models/pre-trained-models/dinov2-pre-trained-model.js new file mode 100644 index 000000000..f85f191af --- /dev/null +++ b/src/models/pre-trained-models/dinov2-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class Dinov2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Dinov2Model extends Dinov2PreTrainedModel {} + +/** + * Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. + */ +export class Dinov2ForImageClassification extends Dinov2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/dinov2-with-registers-pre-trained-model.js b/src/models/pre-trained-models/dinov2-with-registers-pre-trained-model.js new file mode 100644 index 000000000..07dc51777 --- /dev/null +++ b/src/models/pre-trained-models/dinov2-with-registers-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class Dinov2WithRegistersPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Dinov2WithRegistersModel extends Dinov2WithRegistersPreTrainedModel {} + +/** + * Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. 
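+ *
+ * **Example:** Image classification with `Dinov2WithRegistersForImageClassification` (a sketch; the model id is a placeholder for an ONNX-converted checkpoint fine-tuned with a classification head).
+ * ```javascript
+ * import { AutoProcessor, Dinov2WithRegistersForImageClassification, RawImage } from '@huggingface/transformers';
+ *
+ * // Placeholder model id (assumption)
+ * const model_id = 'your-org/dinov2-with-registers-imagenet-ONNX';
+ * const processor = await AutoProcessor.from_pretrained(model_id);
+ * const model = await Dinov2WithRegistersForImageClassification.from_pretrained(model_id);
+ *
+ * // Read image, preprocess, and classify
+ * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
+ * const inputs = await processor(image);
+ * const { logits } = await model(inputs);
+ * // Tensor of shape [ 1, num_labels ]
+ * ```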
+ */ +export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/distil-bert-pre-trained-model.js b/src/models/pre-trained-models/distil-bert-pre-trained-model.js new file mode 100644 index 000000000..42ee40608 --- /dev/null +++ b/src/models/pre-trained-models/distil-bert-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class DistilBertPreTrainedModel extends PreTrainedModel {} +export class DistilBertModel extends DistilBertPreTrainedModel {} + +/** + * DistilBertForSequenceClassification is a class representing a DistilBERT model for sequence classification. + */ +export class DistilBertForSequenceClassification extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DistilBertForTokenClassification is a class representing a DistilBERT model for token classification. + */ +export class DistilBertForTokenClassification extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DistilBertForQuestionAnswering is a class representing a DistilBERT model for question answering. + */ +export class DistilBertForQuestionAnswering extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} + +/** + * DistilBertForMaskedLM is a class representing a DistilBERT model for masking task. + */ +export class DistilBertForMaskedLM extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/donut-swin-pre-trained-model.js b/src/models/pre-trained-models/donut-swin-pre-trained-model.js new file mode 100644 index 000000000..b1bacbd1c --- /dev/null +++ b/src/models/pre-trained-models/donut-swin-pre-trained-model.js @@ -0,0 +1,79 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DonutSwinPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Step-by-step Document Parsing. 
+ * + * ```javascript + * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; + * + * // Choose model to use + * const model_id = 'Xenova/donut-base-finetuned-cord-v2'; + * + * // Prepare image inputs + * const processor = await AutoProcessor.from_pretrained(model_id); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/receipt.png'; + * const image = await RawImage.read(url); + * const image_inputs = await processor(image); + * + * // Prepare decoder inputs + * const tokenizer = await AutoTokenizer.from_pretrained(model_id); + * const task_prompt = '<s_cord-v2>'; + * const decoder_input_ids = tokenizer(task_prompt, { + * add_special_tokens: false, + * }).input_ids; + * + * // Create the model + * const model = await AutoModelForVision2Seq.from_pretrained(model_id); + * + * // Run inference + * const output = await model.generate(image_inputs.pixel_values, { + * decoder_input_ids, + * max_length: model.config.decoder.max_position_embeddings, + * }); + * + * // Decode output + * const decoded = tokenizer.batch_decode(output)[0]; + * // CINNAMON SUGAR 17,000 1 x 17,000 17,000 17,000 20,000 3,000 + * ``` + * + * **Example:** Step-by-step Document Visual Question Answering (DocVQA) + * + * ```javascript + * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; + * + * // Choose model to use + * const model_id = 'Xenova/donut-base-finetuned-docvqa'; + * + * // Prepare image inputs + * const processor = await AutoProcessor.from_pretrained(model_id); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/invoice.png'; + * const image = await RawImage.read(url); + * const image_inputs = await processor(image); + * + * // Prepare decoder inputs + * const tokenizer = await AutoTokenizer.from_pretrained(model_id); + * const question = 'What is the invoice number?'; + * const task_prompt = `<s_docvqa><s_question>${question}</s_question><s_answer>`; + * const decoder_input_ids = tokenizer(task_prompt, { + * add_special_tokens: false, + * }).input_ids; + * + * // Create the model + * const model = await AutoModelForVision2Seq.from_pretrained(model_id); + * + * // Run inference + * const output = await model.generate(image_inputs.pixel_values, { + * decoder_input_ids, + * max_length: model.config.decoder.max_position_embeddings, + * }); + * + * // Decode output + * const decoded = tokenizer.batch_decode(output)[0]; + * // What is the invoice number? us-001 + * ``` + */ +export class DonutSwinModel extends DonutSwinPreTrainedModel {} diff --git a/src/models/pre-trained-models/dpt-pre-trained-model.js b/src/models/pre-trained-models/dpt-pre-trained-model.js new file mode 100644 index 000000000..72ee11167 --- /dev/null +++ b/src/models/pre-trained-models/dpt-pre-trained-model.js @@ -0,0 +1,51 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DPTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare DPT Model transformer outputting raw hidden-states without any specific head on top. + */ +export class DPTModel extends DPTPreTrainedModel {} + +/** + * DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. + * + * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
+ * ```javascript + * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; + * + * // Load model and processor + * const model_id = 'Xenova/dpt-hybrid-midas'; + * const model = await DPTForDepthEstimation.from_pretrained(model_id); + * const processor = await AutoProcessor.from_pretrained(model_id); + * + * // Load image from URL + * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; + * const image = await RawImage.read(url); + * + * // Prepare image for the model + * const inputs = await processor(image); + * + * // Run model + * const { predicted_depth } = await model(inputs); + * + * // Interpolate to original size + * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { + * size: image.size.reverse(), + * mode: 'bilinear', + * })).squeeze(1); + * + * // Visualize the prediction + * const min = prediction.min().item(); + * const max = prediction.max().item(); + * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); + * const depth = RawImage.fromTensor(formatted); + * // RawImage { + * // data: Uint8Array(307200) [ 85, 85, 84, ... ], + * // width: 640, + * // height: 480, + * // channels: 1 + * // } + * ``` + */ +export class DPTForDepthEstimation extends DPTPreTrainedModel {} diff --git a/src/models/pre-trained-models/efficient-net-pre-trained-model.js b/src/models/pre-trained-models/efficient-net-pre-trained-model.js new file mode 100644 index 000000000..b5141e704 --- /dev/null +++ b/src/models/pre-trained-models/efficient-net-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class EfficientNetPreTrainedModel extends PreTrainedModel {} + +/** + * The bare EfficientNet model outputting raw features without any specific head on top. + */ +export class EfficientNetModel extends EfficientNetPreTrainedModel {} + +/** + * EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features). + */ +export class EfficientNetForImageClassification extends EfficientNetPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/electra-pre-trained-model.js b/src/models/pre-trained-models/electra-pre-trained-model.js new file mode 100644 index 000000000..104ff2b45 --- /dev/null +++ b/src/models/pre-trained-models/electra-pre-trained-model.js @@ -0,0 +1,77 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class ElectraPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Electra Model transformer outputting raw hidden-states without any specific head on top. + * Identical to the BERT model except that it uses an additional linear layer between the embedding + * layer and the encoder if the hidden size and embedding size are different. + */ +export class ElectraModel extends ElectraPreTrainedModel {} +// TODO add ElectraForPreTraining +/** + * Electra model with a language modeling head on top. + */ +export class ElectraForMaskedLM extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class ElectraForSequenceClassification extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * Electra model with a token classification head on top. + */ +export class ElectraForTokenClassification extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD + * (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class ElectraForQuestionAnswering extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/ernie4_5_pretrained-model.js b/src/models/pre-trained-models/ernie4_5_pretrained-model.js new file mode 100644 index 000000000..d3e5dec1c --- /dev/null +++ b/src/models/pre-trained-models/ernie4_5_pretrained-model.js @@ -0,0 +1,7 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Ernie4_5_PretrainedModel extends PreTrainedModel {} + +export class Ernie4_5_Model extends Ernie4_5_PretrainedModel {} + +export class Ernie4_5_ForCausalLM extends Ernie4_5_PretrainedModel {} diff --git a/src/models/pre-trained-models/esm-pre-trained-model.js b/src/models/pre-trained-models/esm-pre-trained-model.js new file mode 100644 index 000000000..2e1c85718 --- /dev/null +++ b/src/models/pre-trained-models/esm-pre-trained-model.js @@ -0,0 +1,59 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class EsmPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ESM Model transformer outputting raw hidden-states without any specific head on top. + */ +export class EsmModel extends EsmPreTrainedModel {} + +/** + * ESM Model with a `language modeling` head on top. + */ +export class EsmForMaskedLM extends EsmPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling.
+ */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class EsmForSequenceClassification extends EsmPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ESM Model with a token classification head on top (a linear layer on top of the hidden-states output) + * e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class EsmForTokenClassification extends EsmPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/exaone-pre-trained-model.js b/src/models/pre-trained-models/exaone-pre-trained-model.js new file mode 100644 index 000000000..0f8500668 --- /dev/null +++ b/src/models/pre-trained-models/exaone-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ExaonePreTrainedModel extends PreTrainedModel {} +export class ExaoneModel extends ExaonePreTrainedModel {} +export class ExaoneForCausalLM extends ExaonePreTrainedModel {} diff --git a/src/models/pre-trained-models/falcon-pre-trained-model.js b/src/models/pre-trained-models/falcon-pre-trained-model.js new file mode 100644 index 000000000..81aaf890c --- /dev/null +++ b/src/models/pre-trained-models/falcon-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Falcon Model outputting raw hidden-states without any specific head on top. 
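+ *
+ * **Example:** Text generation with `FalconForCausalLM` via the `pipeline` API (a sketch; the model id is a placeholder for an ONNX-converted Falcon checkpoint).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Placeholder model id (assumption): substitute an available ONNX-converted Falcon checkpoint
+ * const generator = await pipeline('text-generation', 'your-org/falcon-checkpoint-ONNX');
+ *
+ * const output = await generator('The tallest mountain in the world is', { max_new_tokens: 20 });
+ * // [{ generated_text: 'The tallest mountain in the world is ...' }]
+ * ```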
+ */ +export class FalconPreTrainedModel extends PreTrainedModel {} + +export class FalconModel extends FalconPreTrainedModel {} + +export class FalconForCausalLM extends FalconPreTrainedModel {} diff --git a/src/models/pre-trained-models/fast-vi-t-pre-trained-model.js b/src/models/pre-trained-models/fast-vi-t-pre-trained-model.js new file mode 100644 index 000000000..822def48f --- /dev/null +++ b/src/models/pre-trained-models/fast-vi-t-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class FastViTPreTrainedModel extends PreTrainedModel {} +export class FastViTModel extends FastViTPreTrainedModel {} +export class FastViTForImageClassification extends FastViTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/florence2-pre-trained-model.js b/src/models/pre-trained-models/florence2-pre-trained-model.js new file mode 100644 index 000000000..a9d3965ec --- /dev/null +++ b/src/models/pre-trained-models/florence2-pre-trained-model.js @@ -0,0 +1,115 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { cat, ones } from '../../utils/tensor.js'; +import { encoderForward, decoderForward } from '../utils.js'; + +export class Florence2PreTrainedModel extends PreTrainedModel { + forward_params = [ + // Encoder inputs + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'pixel_values', + + // Decoder inputs + 'encoder_outputs', + 'decoder_input_ids', + 'decoder_inputs_embeds', + 'decoder_attention_mask', + 'past_key_values', + ]; + main_input_name = 'inputs_embeds'; +} + +export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel { + _merge_input_ids_with_image_features({ inputs_embeds, image_features, input_ids, attention_mask }) { + return { + inputs_embeds: cat( + [ + image_features, // image embeds + inputs_embeds, // task prefix embeds + ], + 1, + ), + attention_mask: cat( + [ + ones(image_features.dims.slice(0, 2)), // image attention mask + attention_mask, // task prefix attention mask + ], + 1, + ), + }; + } + + async _prepare_inputs_embeds({ input_ids, pixel_values, inputs_embeds, attention_mask }) { + if (!input_ids && !pixel_values) { + throw new Error('Either `input_ids` or `pixel_values` should be provided.'); + } + + // 1. Possibly, extract the input embeddings + let text_features, image_features; + if (input_ids) { + text_features = await this.encode_text({ input_ids }); + } + if (pixel_values) { + image_features = await this.encode_image({ pixel_values }); + } + + // 2. 
Possibly, merge text and images + if (text_features && image_features) { + ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ + inputs_embeds: text_features, + image_features, + input_ids, + attention_mask, + })); + } else { + inputs_embeds = text_features || image_features; + } + + return { inputs_embeds, attention_mask }; + } + + async forward({ + input_ids, + pixel_values, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_outputs, + past_key_values, + + inputs_embeds, + decoder_inputs_embeds, + }) { + if (!inputs_embeds) { + ({ inputs_embeds, attention_mask } = await this._prepare_inputs_embeds({ + input_ids, + pixel_values, + inputs_embeds, + attention_mask, + })); + } + + if (!encoder_outputs) { + // Must compute encoder outputs + let { last_hidden_state } = await encoderForward(this, { inputs_embeds, attention_mask }); + encoder_outputs = last_hidden_state; + } + + if (!decoder_inputs_embeds) { + if (!decoder_input_ids) { + throw new Error('Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.'); + } + decoder_inputs_embeds = await this.encode_text({ input_ids: decoder_input_ids }); + } + + const decoderFeeds = { + inputs_embeds: decoder_inputs_embeds, + attention_mask: decoder_attention_mask, + encoder_attention_mask: attention_mask, + encoder_hidden_states: encoder_outputs, + past_key_values, + }; + return await decoderForward(this, decoderFeeds, true); + } +} diff --git a/src/models/pre-trained-models/gemma-pre-trained-model.js b/src/models/pre-trained-models/gemma-pre-trained-model.js new file mode 100644 index 000000000..1621b9718 --- /dev/null +++ b/src/models/pre-trained-models/gemma-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Gemma Model outputting raw hidden-states without any specific head on top. + */ +export class GemmaPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Gemma Model outputting raw hidden-states without any specific head on top. + */ +export class GemmaModel extends GemmaPreTrainedModel {} + +export class GemmaForCausalLM extends GemmaPreTrainedModel {} diff --git a/src/models/pre-trained-models/gemma2-pre-trained-model.js b/src/models/pre-trained-models/gemma2-pre-trained-model.js new file mode 100644 index 000000000..ac0d231b3 --- /dev/null +++ b/src/models/pre-trained-models/gemma2-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. + */ +export class Gemma2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. + */ +export class Gemma2Model extends Gemma2PreTrainedModel {} + +export class Gemma2ForCausalLM extends Gemma2PreTrainedModel {} diff --git a/src/models/pre-trained-models/gemma3-pre-trained-model.js b/src/models/pre-trained-models/gemma3-pre-trained-model.js new file mode 100644 index 000000000..8bb9fc3b5 --- /dev/null +++ b/src/models/pre-trained-models/gemma3-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Gemma3 Model outputting raw hidden-states without any specific head on top. + */ +export class Gemma3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Gemma3 Model outputting raw hidden-states without any specific head on top. 
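+ *
+ * **Example:** Chat-style text generation with `Gemma3ForCausalLM` (a sketch; `onnx-community/gemma-3-1b-it-ONNX` is assumed to be an available conversion, and the output structure shown is indicative).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Create a text-generation pipeline (assumed checkpoint)
+ * const generator = await pipeline('text-generation', 'onnx-community/gemma-3-1b-it-ONNX');
+ *
+ * // The pipeline applies the chat template to message inputs
+ * const messages = [
+ *   { role: 'user', content: 'What is the capital of France?' },
+ * ];
+ * const output = await generator(messages, { max_new_tokens: 64 });
+ * // [{ generated_text: [ ..., { role: 'assistant', content: '...' } ] }]
+ * ```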
+ */ +export class Gemma3Model extends Gemma3PreTrainedModel {} + +export class Gemma3ForCausalLM extends Gemma3PreTrainedModel {} diff --git a/src/models/pre-trained-models/gemma3n-pre-trained-model.js b/src/models/pre-trained-models/gemma3n-pre-trained-model.js new file mode 100644 index 000000000..c90c5900e --- /dev/null +++ b/src/models/pre-trained-models/gemma3n-pre-trained-model.js @@ -0,0 +1,118 @@ +import { Tensor } from '../../utils/tensor.js'; +import { + decoderForward, + default_merge_input_ids_with_image_features, + default_merge_input_ids_with_audio_features, +} from '../utils.js'; +import { sessionRun } from '../session.js'; +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Gemma3nPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'attention_mask', + 'inputs_embeds', + 'per_layer_inputs', + + 'position_ids', + 'pixel_values', + 'input_features', + 'input_features_mask', + 'past_key_values', + ]; +} +export class Gemma3nForConditionalGeneration extends Gemma3nPreTrainedModel { + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + pixel_values = null, + input_features = null, + input_features_mask = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + per_layer_inputs = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // TODO: needed? + ...kwargs + }) { + if (!inputs_embeds || !per_layer_inputs) { + // 1. Extract the text embeddings. + ({ inputs_embeds, per_layer_inputs } = await sessionRun(this.sessions['embed_tokens'], { + input_ids, + })); + if (input_ids.dims[1] !== 1) { + if (pixel_values) { + // Encode the image + const { image_features } = await sessionRun(this.sessions['vision_encoder'], { + pixel_values, + }); + ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ + image_features, + inputs_embeds, + input_ids, + attention_mask, + })); + } + + if (input_features) { + // Encode the audio + const { audio_features } = await sessionRun(this.sessions['audio_encoder'], { + input_features, + input_features_mask, + }); + ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_audio_features({ + audio_features, + inputs_embeds, + input_ids, + attention_mask, + })); + } + } + } + + const outputs = await decoderForward( + this, + { + inputs_embeds, + per_layer_inputs, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, + true, + ); + return outputs; + } + + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_id, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } + _merge_input_ids_with_audio_features(kwargs) { + const audio_hidden_size = kwargs.audio_features.dims.at(-1); + const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); + + return default_merge_input_ids_with_audio_features({ + // @ts-ignore + audio_token_id: this.config.audio_token_id, + ...kwargs, + audio_features: reshaped_audio_features, + }); + } +} diff --git a/src/models/pre-trained-models/glm-pre-trained-model.js b/src/models/pre-trained-models/glm-pre-trained-model.js new file mode 100644 index 
000000000..dd49c5269 --- /dev/null +++ b/src/models/pre-trained-models/glm-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GlmPreTrainedModel extends PreTrainedModel {} +export class GlmModel extends GlmPreTrainedModel {} +export class GlmForCausalLM extends GlmPreTrainedModel {} diff --git a/src/models/pre-trained-models/glpn-pre-trained-model.js b/src/models/pre-trained-models/glpn-pre-trained-model.js new file mode 100644 index 000000000..975d2e4f6 --- /dev/null +++ b/src/models/pre-trained-models/glpn-pre-trained-model.js @@ -0,0 +1,47 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GLPNPreTrainedModel extends PreTrainedModel {} + +/** + * The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. + */ +export class GLPNModel extends GLPNPreTrainedModel {} + +/** + * GLPN Model with a lightweight depth estimation head on top e.g. for KITTI, NYUv2. + * + * **Example:** Depth estimation w/ `Xenova/glpn-kitti`. + * ```javascript + * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; + * + * // Load model and processor + * const model_id = 'Xenova/glpn-kitti'; + * const model = await GLPNForDepthEstimation.from_pretrained(model_id); + * const processor = await AutoProcessor.from_pretrained(model_id); + * + * // Load image from URL + * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; + * const image = await RawImage.read(url); + * + * // Prepare image for the model + * const inputs = await processor(image); + * + * // Run model + * const { predicted_depth } = await model(inputs); + * + * // Interpolate to original size + * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { + * size: image.size.reverse(), + * mode: 'bilinear', + * })).squeeze(1); + * + * // Visualize the prediction + * const min = prediction.min().item(); + * const max = prediction.max().item(); + * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); + * const depth = RawImage.fromTensor(formatted); + * // RawImage { + * // data: Uint8Array(307200) [ 85, 85, 84, ...
], + * // width: 640, + * // height: 480, + * // channels: 1 + * // } + * ``` + */ +export class GLPNForDepthEstimation extends GLPNPreTrainedModel {} diff --git a/src/models/pre-trained-models/gpt-big-code-pre-trained-model.js b/src/models/pre-trained-models/gpt-big-code-pre-trained-model.js new file mode 100644 index 000000000..d10288b5b --- /dev/null +++ b/src/models/pre-trained-models/gpt-big-code-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTBigCodePreTrainedModel extends PreTrainedModel {} +export class GPTBigCodeModel extends GPTBigCodePreTrainedModel {} + +export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel {} diff --git a/src/models/pre-trained-models/gpt-neo-pre-trained-model.js b/src/models/pre-trained-models/gpt-neo-pre-trained-model.js new file mode 100644 index 000000000..5934d5e80 --- /dev/null +++ b/src/models/pre-trained-models/gpt-neo-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTNeoPreTrainedModel extends PreTrainedModel {} +export class GPTNeoModel extends GPTNeoPreTrainedModel {} + +export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/gpt-neo-x-pre-trained-model.js b/src/models/pre-trained-models/gpt-neo-x-pre-trained-model.js new file mode 100644 index 000000000..d848106fc --- /dev/null +++ b/src/models/pre-trained-models/gpt-neo-x-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTNeoXPreTrainedModel extends PreTrainedModel {} +export class GPTNeoXModel extends GPTNeoXPreTrainedModel {} + +export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel {} diff --git a/src/models/pre-trained-models/gpt-oss-pre-trained-model.js b/src/models/pre-trained-models/gpt-oss-pre-trained-model.js new file mode 100644 index 000000000..769e45c8f --- /dev/null +++ b/src/models/pre-trained-models/gpt-oss-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GptOssPreTrainedModel extends PreTrainedModel {} +export class GptOssModel extends GptOssPreTrainedModel {} +export class GptOssForCausalLM extends GptOssPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/gpt2-pre-trained-model.js b/src/models/pre-trained-models/gpt2-pre-trained-model.js new file mode 100644 index 000000000..0b5e09ce3 --- /dev/null +++ b/src/models/pre-trained-models/gpt2-pre-trained-model.js @@ -0,0 +1,12 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPT2PreTrainedModel extends PreTrainedModel {} + +export class GPT2Model extends GPT2PreTrainedModel {} + +/** + * GPT-2 language model head on top of the GPT-2 base model. This model is suitable for text generation tasks. 
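+ *
+ * **Example:** Text generation with `GPT2LMHeadModel` via the `text-generation` pipeline (a sketch; assumes the `Xenova/gpt2` ONNX conversion is available).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Create a text-generation pipeline (assumed checkpoint)
+ * const generator = await pipeline('text-generation', 'Xenova/gpt2');
+ *
+ * const output = await generator('Once upon a time,', { max_new_tokens: 20 });
+ * // [{ generated_text: 'Once upon a time, ...' }]
+ * ```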
+ */ +export class GPT2LMHeadModel extends GPT2PreTrainedModel {} +// export class GPT2ForSequenceClassification extends GPT2PreTrainedModel { +// TODO \ No newline at end of file diff --git a/src/models/pre-trained-models/gptj-pre-trained-model.js b/src/models/pre-trained-models/gptj-pre-trained-model.js new file mode 100644 index 000000000..a229c265a --- /dev/null +++ b/src/models/pre-trained-models/gptj-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTJPreTrainedModel extends PreTrainedModel {} +export class GPTJModel extends GPTJPreTrainedModel {} + +export class GPTJForCausalLM extends GPTJPreTrainedModel {} diff --git a/src/models/pre-trained-models/granite-moe-hybrid-pre-trained-model.js b/src/models/pre-trained-models/granite-moe-hybrid-pre-trained-model.js new file mode 100644 index 000000000..ce1262413 --- /dev/null +++ b/src/models/pre-trained-models/granite-moe-hybrid-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GraniteMoeHybridPreTrainedModel extends PreTrainedModel {} +export class GraniteMoeHybridModel extends GraniteMoeHybridPreTrainedModel {} +export class GraniteMoeHybridForCausalLM extends GraniteMoeHybridPreTrainedModel {} diff --git a/src/models/pre-trained-models/granite-pre-trained-model.js b/src/models/pre-trained-models/granite-pre-trained-model.js new file mode 100644 index 000000000..3eaee85e4 --- /dev/null +++ b/src/models/pre-trained-models/granite-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GranitePreTrainedModel extends PreTrainedModel {} +export class GraniteModel extends GranitePreTrainedModel {} +export class GraniteForCausalLM extends GranitePreTrainedModel {} diff --git a/src/models/pre-trained-models/grounding-dino-pre-trained-model.js b/src/models/pre-trained-models/grounding-dino-pre-trained-model.js new file mode 100644 index 000000000..61f92805a --- /dev/null +++ b/src/models/pre-trained-models/grounding-dino-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GroundingDinoPreTrainedModel extends PreTrainedModel {} +export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel {} diff --git a/src/models/pre-trained-models/group-vi-t-pre-trained-model.js b/src/models/pre-trained-models/group-vi-t-pre-trained-model.js new file mode 100644 index 000000000..c6796f902 --- /dev/null +++ b/src/models/pre-trained-models/group-vi-t-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GroupViTPreTrainedModel extends PreTrainedModel {} +export class GroupViTModel extends GroupViTPreTrainedModel {} diff --git a/src/models/pre-trained-models/helium-pre-trained-model.js b/src/models/pre-trained-models/helium-pre-trained-model.js new file mode 100644 index 000000000..51208e560 --- /dev/null +++ b/src/models/pre-trained-models/helium-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class HeliumPreTrainedModel extends PreTrainedModel {} +export class HeliumModel extends HeliumPreTrainedModel {} +export class HeliumForCausalLM extends HeliumPreTrainedModel {} diff --git a/src/models/pre-trained-models/hiera-pre-trained-model.js b/src/models/pre-trained-models/hiera-pre-trained-model.js new file mode 100644 index 000000000..98623ec56 --- /dev/null +++ 
b/src/models/pre-trained-models/hiera-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class HieraPreTrainedModel extends PreTrainedModel {} +export class HieraModel extends HieraPreTrainedModel {} +export class HieraForImageClassification extends HieraPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/hubert-pre-trained-model.js b/src/models/pre-trained-models/hubert-pre-trained-model.js new file mode 100644 index 000000000..e53dca111 --- /dev/null +++ b/src/models/pre-trained-models/hubert-pre-trained-model.js @@ -0,0 +1,62 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput } from '../output.js'; +import { Wav2Vec2PreTrainedModel } from './wav2-vec2-pre-trained-model.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class HubertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Hubert Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Load and run a `HubertModel` for feature extraction. + * + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/hubert-base-ls960'); + * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); + * const inputs = await processor(audio); + * + * // Load and run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/hubert-base-ls960'); + * const output = await model(inputs); + * // { + * // last_hidden_state: Tensor { + * // dims: [ 1, 549, 768 ], + * // type: 'float32', + * // data: Float32Array(421632) [0.0682469978928566, 0.08104046434164047, -0.4975186586380005, ...], + * // size: 421632 + * // } + * // } + * ``` + */ +export class HubertModel extends Wav2Vec2PreTrainedModel {} + +/** + * Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class HubertForCTC extends Wav2Vec2PreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB Keyword Spotting. + */ +export class HubertForSequenceClassification extends Wav2Vec2PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
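The Hubert heads above follow the same usage pattern as Wav2Vec2: `HubertForCTC` is typically driven through the `automatic-speech-recognition` pipeline, while `HubertForSequenceClassification` backs audio classification. A minimal ASR sketch; the checkpoint id is an assumption (any ONNX-converted Hubert CTC model would work the same way):

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX-converted Hubert CTC checkpoint; substitute any compatible model id
const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/hubert-large-ls960-ft');

// Transcribe a 16kHz audio file from a URL
const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
const output = await transcriber(url);
console.log(output); // e.g. { text: '...' }
```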
+ */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/i-jepa-pre-trained-model.js b/src/models/pre-trained-models/i-jepa-pre-trained-model.js new file mode 100644 index 000000000..534988c81 --- /dev/null +++ b/src/models/pre-trained-models/i-jepa-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class IJepaPreTrainedModel extends PreTrainedModel {} +export class IJepaModel extends IJepaPreTrainedModel {} +export class IJepaForImageClassification extends IJepaPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/idefics3-pre-trained-model.js b/src/models/pre-trained-models/idefics3-pre-trained-model.js new file mode 100644 index 000000000..913c0785c --- /dev/null +++ b/src/models/pre-trained-models/idefics3-pre-trained-model.js @@ -0,0 +1,43 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { default_merge_input_ids_with_image_features } from '../utils.js'; + +export class Idefics3PreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'attention_mask', + 'pixel_values', + 'pixel_attention_mask', + 'position_ids', + 'past_key_values', + ]; +} + +/** + * The Idefics3 model which consists of a vision backbone and a language model. + */ +export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel { + async encode_image({ pixel_values, pixel_attention_mask }) { + const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask })) + .image_features; + return features; + } + + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_id, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} + +/** + * The SmolVLM Model with a language modeling head. + * It is made up a SigLIP vision encoder, with a language modeling head on top. 
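`Idefics3ForConditionalGeneration` / `SmolVLMForConditionalGeneration` combine the vision-encoder features with the text tokens via `_merge_input_ids_with_image_features` and then generate autoregressively. A hedged usage sketch following the published SmolVLM example pattern; the checkpoint id, chat-template call, and preprocessing arguments are assumptions and may differ slightly from the final API:

```javascript
import { AutoProcessor, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers';

// Assumed checkpoint with ONNX weights
const model_id = 'HuggingFaceTB/SmolVLM-Instruct';
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModelForVision2Seq.from_pretrained(model_id);

// Build a chat-style prompt containing one image placeholder
const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
const messages = [{
  role: 'user',
  content: [{ type: 'image' }, { type: 'text', text: 'Describe this image.' }],
}];
const text = processor.apply_chat_template(messages, { add_generation_prompt: true });

// Preprocess text and image together, then generate
const inputs = await processor(text, [image]);
const generated_ids = await model.generate({ ...inputs, max_new_tokens: 128 });

// Keep only the newly generated tokens and decode them
const new_tokens = generated_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
console.log(processor.batch_decode(new_tokens, { skip_special_tokens: true }));
```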
+ */ +export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {} \ No newline at end of file diff --git a/src/models/pre-trained-models/index.js b/src/models/pre-trained-models/index.js new file mode 100644 index 000000000..f180a3b5c --- /dev/null +++ b/src/models/pre-trained-models/index.js @@ -0,0 +1,163 @@ +export * from './albert-pre-trained-model.js'; +export * from './apertus-pre-trained-model.js'; +export * from './arcee-pre-trained-model.js'; +export * from './ast-pre-trained-model.js'; +export * from './bart-pretrained-model.js'; +export * from './beit-pre-trained-model.js'; +export * from './bert-pre-trained-model.js'; +export * from './blenderbot-pre-trained-model.js'; +export * from './blenderbot-small-pre-trained-model.js'; +export * from './bloom-pre-trained-model.js'; +export * from './camembert-pre-trained-model.js'; +export * from './chatterbox-pre-trained-model.js'; +export * from './chinese-clip-pre-trained-model.js'; +export * from './clap-pre-trained-model.js'; +export * from './clip-pre-trained-model.js'; +export * from './clip-seg-pre-trained-model.js'; +export * from './code-gen-pre-trained-model.js'; +export * from './cohere-pre-trained-model.js'; +export * from './conv-bert-pre-trained-model.js'; +export * from './conv-next-pre-trained-model.js'; +export * from './conv-next-v2-pre-trained-model.js'; +export * from './d-fine-pre-trained-model.js'; +export * from './dac-pre-trained-model.js'; +export * from './deberta-pre-trained-model.js'; +export * from './deberta-v2-pre-trained-model.js'; +export * from './decision-transformer-pre-trained-model.js'; +export * from './dei-t-pre-trained-model.js'; +export * from './depth-anything-pre-trained-model.js'; +export * from './depth-pro-pre-trained-model.js'; +export * from './detr-pre-trained-model.js'; +export * from './din-ov3-conv-next-pre-trained-model.js'; +export * from './din-ov3-vi-t-pre-trained-model.js'; +export * from './dinov2-pre-trained-model.js'; +export * from './dinov2-with-registers-pre-trained-model.js'; +export * from './distil-bert-pre-trained-model.js'; +export * from './donut-swin-pre-trained-model.js'; +export * from './dpt-pre-trained-model.js'; +export * from './efficient-net-pre-trained-model.js'; +export * from './electra-pre-trained-model.js'; +export * from './ernie4_5_pretrained-model.js'; +export * from './esm-pre-trained-model.js'; +export * from './exaone-pre-trained-model.js'; +export * from './falcon-pre-trained-model.js'; +export * from './fast-vi-t-pre-trained-model.js'; +export * from './florence2-pre-trained-model.js'; +export * from './gemma-pre-trained-model.js'; +export * from './gemma2-pre-trained-model.js'; +export * from './gemma3-pre-trained-model.js'; +export * from './gemma3n-pre-trained-model.js'; +export * from './glm-pre-trained-model.js'; +export * from './glpn-pre-trained-model.js'; +export * from './gpt-big-code-pre-trained-model.js'; +export * from './gpt-neo-pre-trained-model.js'; +export * from './gpt-neo-x-pre-trained-model.js'; +export * from './gpt-oss-pre-trained-model.js'; +export * from './gpt2-pre-trained-model.js'; +export * from './gptj-pre-trained-model.js'; +export * from './granite-moe-hybrid-pre-trained-model.js'; +export * from './granite-pre-trained-model.js'; +export * from './grounding-dino-pre-trained-model.js'; +export * from './group-vi-t-pre-trained-model.js'; +export * from './helium-pre-trained-model.js'; +export * from './hiera-pre-trained-model.js'; +export * from './hubert-pre-trained-model.js'; +export * 
from './i-jepa-pre-trained-model.js'; +export * from './idefics3-pre-trained-model.js'; +export * from './jais-pre-trained-model.js'; +export * from './jina-clip-pre-trained-model.js'; +export * from './lfm2-pre-trained-model.js'; +export * from './llama-pre-trained-model.js'; +export * from './llama4-pre-trained-model.js'; +export * from './llava-pre-trained-model.js'; +export * from './long-t5-pre-trained-model.js'; +export * from './m-bart-pre-trained-model.js'; +export * from './m2-m100-pre-trained-model.js'; +export * from './marian-pre-trained-model.js'; +export * from './mask-former-pre-trained-model.js'; +export * from './metric3-d-pre-trained-model.js'; +export * from './metric3-dv2-pre-trained-model.js'; +export * from './mgpstr-pre-trained-model.js'; +export * from './mimi-pre-trained-model.js'; +export * from './mistral-pre-trained-model.js'; +export * from './mobile-bert-pre-trained-model.js'; +export * from './mobile-llm-pre-trained-model.js'; +export * from './mobile-net-v1-pre-trained-model.js'; +export * from './mobile-net-v2-pre-trained-model.js'; +export * from './mobile-net-v3-pre-trained-model.js'; +export * from './mobile-net-v4-pre-trained-model.js'; +export * from './mobile-vi-t-pre-trained-model.js'; +export * from './mobile-vi-tv2-pre-trained-model.js'; +export * from './modern-bert-decoder-pre-trained-model.js'; +export * from './modern-bert-pre-trained-model.js'; +export * from './moonshine-pre-trained-model.js'; +export * from './mp-net-pre-trained-model.js'; +export * from './mpt-pre-trained-model.js'; +export * from './mt5-pre-trained-model.js'; +export * from './multi-modality-pre-trained-model.js'; +export * from './musicgen-pre-trained-model.js'; +export * from './nano-chat-pre-trained-model.js'; +export * from './neo-bert-pre-trained-model.js'; +export * from './nomic-bert-pre-trained-model.js'; +export * from './olmo-pre-trained-model.js'; +export * from './olmo2-pre-trained-model.js'; +export * from './olmo3-pre-trained-model.js'; +export * from './open-elm-pre-trained-model.js'; +export * from './opt-pre-trained-model.js'; +export * from './owl-vi-t-pre-trained-model.js'; +export * from './owlv2-pre-trained-model.js'; +export * from './pali-gemma-pre-trained-model.js'; +export * from './parakeet-pre-trained-model.js'; +export * from './patch-ts-mixer-pre-trained-model.js'; +export * from './patch-tst-pre-trained-model.js'; +export * from './phi-pre-trained-model.js'; +export * from './phi3-pre-trained-model.js'; +export * from './phi3-v-pre-trained-model.js'; +export * from './pvt-pre-trained-model.js'; +export * from './py-annote-pre-trained-model.js'; +export * from './qwen2-pre-trained-model.js'; +export * from './qwen2-vl-pre-trained-model.js'; +export * from './qwen3-pre-trained-model.js'; +export * from './res-net-pre-trained-model.js'; +export * from './rf-detr-pre-trained-model.js'; +export * from './ro-former-pre-trained-model.js'; +export * from './roberta-pre-trained-model.js'; +export * from './rt-detr-pre-trained-model.js'; +export * from './rt-detr-v2-pre-trained-model.js'; +export * from './sam-pre-trained-model.js'; +export * from './sam2-pre-trained-model.js'; +export * from './sapiens-pre-trained-model.js'; +export * from './segformer-pre-trained-model.js'; +export * from './siglip-pre-trained-model.js'; +export * from './smol-lm3-pre-trained-model.js'; +export * from './snac-pre-trained-model.js'; +export * from './speech-t5-pre-trained-model.js'; +export * from './squeeze-bert-pre-trained-model.js'; +export * from 
'./stable-lm-pre-trained-model.js'; +export * from './starcoder2-pre-trained-model.js'; +export * from './style-text-to-speech2-pre-trained-model.js'; +export * from './supertonic-pre-trained-model.js'; +export * from './swin-pre-trained-model.js'; +export * from './swin2-sr-pre-trained-model.js'; +export * from './t5-pre-trained-model.js'; +export * from './table-transformer-pre-trained-model.js'; +export * from './tr-ocr-pre-trained-model.js'; +export * from './ultravox-pre-trained-model.js'; +export * from './uni-speech-pre-trained-model.js'; +export * from './uni-speech-sat-pre-trained-model.js'; +export * from './vault-gemma-pre-trained-model.js'; +export * from './vi-t-pre-trained-model.js'; +export * from './vi-tmae-pre-trained-model.js'; +export * from './vi-tmsn-pre-trained-model.js'; +export * from './vision-encoder-decoder-model.js'; +export * from './vit-matte-pre-trained-model.js'; +export * from './vit-pose-pre-trained-model.js'; +export * from './vits-pre-trained-model.js'; +export * from './wav-lm-pre-trained-model.js'; +export * from './wav2-vec2-bert-pre-trained-model.js'; +export * from './wav2-vec2-pre-trained-model.js'; +export * from './we-speaker-res-net-pre-trained-model.js'; +export * from './whisper-pre-trained-model.js'; +export * from './xlm-pre-trained-model.js'; +export * from './xlm-roberta-pre-trained-model.js'; +export * from './yolos-pre-trained-model.js'; \ No newline at end of file diff --git a/src/models/pre-trained-models/jais-pre-trained-model.js b/src/models/pre-trained-models/jais-pre-trained-model.js new file mode 100644 index 000000000..cdef7e0b9 --- /dev/null +++ b/src/models/pre-trained-models/jais-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class JAISPreTrainedModel extends PreTrainedModel {} + +/** + * The bare JAIS Model transformer outputting raw hidden-states without any specific head on top. + */ +export class JAISModel extends JAISPreTrainedModel {} + +/** + * The JAIS Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). + */ +export class JAISLMHeadModel extends JAISPreTrainedModel {} diff --git a/src/models/pre-trained-models/jina-clip-pre-trained-model.js b/src/models/pre-trained-models/jina-clip-pre-trained-model.js new file mode 100644 index 000000000..6bdbfee37 --- /dev/null +++ b/src/models/pre-trained-models/jina-clip-pre-trained-model.js @@ -0,0 +1,65 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ones, full } from '../../utils/tensor.js'; + +export class JinaCLIPPreTrainedModel extends PreTrainedModel {} + +export class JinaCLIPModel extends JinaCLIPPreTrainedModel { + async forward(model_inputs) { + const missing_text_inputs = !model_inputs.input_ids; + const missing_image_inputs = !model_inputs.pixel_values; + + if (missing_text_inputs && missing_image_inputs) { + throw new Error('Either `input_ids` or `pixel_values` should be provided.'); + } + + // If either `input_ids` or `pixel_values` aren't passed, we need to create dummy input since the model requires a value to be specified. + if (missing_text_inputs) { + // NOTE: We cannot pass zero-dimension tensor as input for input_ids. + // Fortunately, the majority of time is spent in the vision encoder, so this shouldn't significantly impact performance. 
+ model_inputs.input_ids = ones([model_inputs.pixel_values.dims[0], 1]); + } + + if (missing_image_inputs) { + // NOTE: Since we create a zero-sized tensor, this does not increase computation time. + // @ts-ignore + const { image_size } = this.config.vision_config; + model_inputs.pixel_values = full([0, 3, image_size, image_size], 0.0); // (pass zero-dimension tensor) + } + + const { text_embeddings, image_embeddings, l2norm_text_embeddings, l2norm_image_embeddings } = + await super.forward(model_inputs); + + const result = {}; + if (!missing_text_inputs) { + result.text_embeddings = text_embeddings; + result.l2norm_text_embeddings = l2norm_text_embeddings; + } + if (!missing_image_inputs) { + result.image_embeddings = image_embeddings; + result.l2norm_image_embeddings = l2norm_image_embeddings; + } + return result; + } +} + +export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'vision_model', + }); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/lfm2-pre-trained-model.js b/src/models/pre-trained-models/lfm2-pre-trained-model.js new file mode 100644 index 000000000..6793cf220 --- /dev/null +++ b/src/models/pre-trained-models/lfm2-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Lfm2PreTrainedModel extends PreTrainedModel {} +export class Lfm2Model extends Lfm2PreTrainedModel {} +export class Lfm2ForCausalLM extends Lfm2PreTrainedModel {} diff --git a/src/models/pre-trained-models/llama-pre-trained-model.js b/src/models/pre-trained-models/llama-pre-trained-model.js new file mode 100644 index 000000000..c727221a8 --- /dev/null +++ b/src/models/pre-trained-models/llama-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare LLama Model outputting raw hidden-states without any specific head on top. + */ +export class LlamaPreTrainedModel extends PreTrainedModel {} + +/** + * The bare LLaMA Model outputting raw hidden-states without any specific head on top. 
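`LlamaForCausalLM` (declared just below) is most easily exercised through the `text-generation` pipeline, which accepts chat messages directly and applies the model's chat template. A sketch; the instruction-tuned checkpoint id and `dtype` setting are assumptions:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed instruction-tuned checkpoint with ONNX weights; any Llama-family model id works the same way
const generator = await pipeline('text-generation', 'onnx-community/Llama-3.2-1B-Instruct', { dtype: 'q4' });

const messages = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'user', content: 'Explain what a causal language model is in one sentence.' },
];

// Generate a reply and print the assistant's final message
const output = await generator(messages, { max_new_tokens: 64 });
console.log(output[0].generated_text.at(-1).content);
```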
+ */ +export class LlamaModel extends LlamaPreTrainedModel {} + +export class LlamaForCausalLM extends LlamaPreTrainedModel {} diff --git a/src/models/pre-trained-models/llama4-pre-trained-model.js b/src/models/pre-trained-models/llama4-pre-trained-model.js new file mode 100644 index 000000000..33c934569 --- /dev/null +++ b/src/models/pre-trained-models/llama4-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Llama4PreTrainedModel extends PreTrainedModel {} +export class Llama4ForCausalLM extends Llama4PreTrainedModel {} diff --git a/src/models/pre-trained-models/llava-pre-trained-model.js b/src/models/pre-trained-models/llava-pre-trained-model.js new file mode 100644 index 000000000..c535b4ccf --- /dev/null +++ b/src/models/pre-trained-models/llava-pre-trained-model.js @@ -0,0 +1,41 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { default_merge_input_ids_with_features, default_merge_input_ids_with_image_features } from '../utils.js'; + +export class LlavaPreTrainedModel extends PreTrainedModel { + forward_params = ['input_ids', 'attention_mask', 'pixel_values', 'position_ids', 'past_key_values']; +} + +/** + * The LLAVA model which consists of a vision backbone and a language model. + */ +export class LlavaForConditionalGeneration extends LlavaPreTrainedModel { + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_features({ + // @ts-ignore + image_token_id: this.config.image_token_index, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} + +export class LlavaOnevisionForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration + +export class Moondream1ForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration + +export class LlavaQwen2ForCausalLM extends LlavaPreTrainedModel { + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_index, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} diff --git a/src/models/pre-trained-models/long-t5-pre-trained-model.js b/src/models/pre-trained-models/long-t5-pre-trained-model.js new file mode 100644 index 000000000..3a70ef0b9 --- /dev/null +++ b/src/models/pre-trained-models/long-t5-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + */ +export class LongT5PreTrainedModel extends PreTrainedModel {} + +/** + * The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top. + */ +export class LongT5Model extends LongT5PreTrainedModel {} + +/** + * LONGT5 Model with a `language modeling` head on top. 
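`LongT5ForConditionalGeneration` is a standard encoder-decoder and can be driven through the `summarization` pipeline for long-document summarization. A sketch; the checkpoint id is an assumption:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX-converted LongT5 checkpoint fine-tuned for summarization
const summarizer = await pipeline('summarization', 'Xenova/long-t5-tglobal-base-16384-book-summary');

// LongT5's sparse attention makes very long inputs practical
const text = 'Very long document text goes here ...';
const output = await summarizer(text, { max_new_tokens: 100 });
console.log(output); // e.g. [{ summary_text: '...' }]
```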
+ */ +export class LongT5ForConditionalGeneration extends LongT5PreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/m-bart-pre-trained-model.js b/src/models/pre-trained-models/m-bart-pre-trained-model.js new file mode 100644 index 000000000..3ed4ad46d --- /dev/null +++ b/src/models/pre-trained-models/m-bart-pre-trained-model.js @@ -0,0 +1,31 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MBartPreTrainedModel extends PreTrainedModel {} + +/** + * The bare MBART Model outputting raw hidden-states without any specific head on top. + */ +export class MBartModel extends MBartPreTrainedModel {} + +/** + * The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models. + */ +export class MBartForConditionalGeneration extends MBartPreTrainedModel {} + +/** + * MBart model with a sequence classification/head on top (a linear layer on top of the pooled output). + */ +export class MBartForSequenceClassification extends MBartPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class MBartForCausalLM extends MBartPreTrainedModel {} diff --git a/src/models/pre-trained-models/m2-m100-pre-trained-model.js b/src/models/pre-trained-models/m2-m100-pre-trained-model.js new file mode 100644 index 000000000..378814433 --- /dev/null +++ b/src/models/pre-trained-models/m2-m100-pre-trained-model.js @@ -0,0 +1,7 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class M2M100PreTrainedModel extends PreTrainedModel {} + +export class M2M100Model extends M2M100PreTrainedModel {} + +export class M2M100ForConditionalGeneration extends M2M100PreTrainedModel {} diff --git a/src/models/pre-trained-models/marian-pre-trained-model.js b/src/models/pre-trained-models/marian-pre-trained-model.js new file mode 100644 index 000000000..5786cfc86 --- /dev/null +++ b/src/models/pre-trained-models/marian-pre-trained-model.js @@ -0,0 +1,7 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MarianPreTrainedModel extends PreTrainedModel {} + +export class MarianModel extends MarianPreTrainedModel {} + +export class MarianMTModel extends MarianPreTrainedModel {} diff --git a/src/models/pre-trained-models/mask-former-pre-trained-model.js b/src/models/pre-trained-models/mask-former-pre-trained-model.js new file mode 100644 index 000000000..190ea05b6 --- /dev/null +++ b/src/models/pre-trained-models/mask-former-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MaskFormerPreTrainedModel extends PreTrainedModel {} +export class MaskFormerModel extends MaskFormerPreTrainedModel {} +export class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel {} diff --git a/src/models/pre-trained-models/metric3-d-pre-trained-model.js b/src/models/pre-trained-models/metric3-d-pre-trained-model.js new file mode 100644 index 000000000..4062435f9 --- /dev/null +++ b/src/models/pre-trained-models/metric3-d-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Metric3DPreTrainedModel extends PreTrainedModel {} 
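The seq2seq translation classes above (`M2M100ForConditionalGeneration`, `MarianMTModel`) are usually driven through the `translation` pipeline. A sketch for M2M100, where the source and target languages are passed explicitly (Marian checkpoints instead encode the language pair in the model id); the checkpoint name is an assumption:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX conversion of the multilingual M2M100 (418M) checkpoint
const translator = await pipeline('translation', 'Xenova/m2m100_418M');

// M2M100 needs explicit source and target languages
const output = await translator('Hello, how are you?', {
  src_lang: 'en',
  tgt_lang: 'fr',
});
console.log(output); // e.g. [{ translation_text: 'Bonjour, comment allez-vous ?' }]
```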
+export class Metric3DForDepthEstimation extends Metric3DPreTrainedModel {} diff --git a/src/models/pre-trained-models/metric3-dv2-pre-trained-model.js b/src/models/pre-trained-models/metric3-dv2-pre-trained-model.js new file mode 100644 index 000000000..6ea0ce8a5 --- /dev/null +++ b/src/models/pre-trained-models/metric3-dv2-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Metric3Dv2PreTrainedModel extends PreTrainedModel {} +export class Metric3Dv2ForDepthEstimation extends Metric3Dv2PreTrainedModel {} diff --git a/src/models/pre-trained-models/mgpstr-pre-trained-model.js b/src/models/pre-trained-models/mgpstr-pre-trained-model.js new file mode 100644 index 000000000..d35d2b204 --- /dev/null +++ b/src/models/pre-trained-models/mgpstr-pre-trained-model.js @@ -0,0 +1,17 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MgpstrModelOutput } from '../output.js'; + +export class MgpstrPreTrainedModel extends PreTrainedModel {} + +/** + * MGP-STR Model transformer with three classification heads on top + * (three A^3 modules and three linear layer on top of the transformer encoder output) for scene text recognition (STR). + */ +export class MgpstrForSceneTextRecognition extends MgpstrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new MgpstrModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/mimi-pre-trained-model.js b/src/models/pre-trained-models/mimi-pre-trained-model.js new file mode 100644 index 000000000..c33dcd503 --- /dev/null +++ b/src/models/pre-trained-models/mimi-pre-trained-model.js @@ -0,0 +1,54 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { MimiEncoderOutput, MimiDecoderOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class MimiPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_values'; + forward_params = ['input_values']; +} + +/** + * The Mimi neural audio codec model. + */ +export class MimiModel extends MimiPreTrainedModel { + /** + * Encodes the input audio waveform into discrete codes. + * @param {Object} inputs Model inputs + * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). + * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. + */ + async encode(inputs) { + return new MimiEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); + } + + /** + * Decodes the given frames into an output audio waveform. + * @param {MimiEncoderOutput} inputs The encoded audio codes. + * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. + */ + async decode(inputs) { + return new MimiDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); + } +} + +export class MimiEncoderModel extends MimiPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 
'encoder_model', + }); + } +} +export class MimiDecoderModel extends MimiPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'decoder_model', + }); + } +} diff --git a/src/models/pre-trained-models/mistral-pre-trained-model.js b/src/models/pre-trained-models/mistral-pre-trained-model.js new file mode 100644 index 000000000..157137966 --- /dev/null +++ b/src/models/pre-trained-models/mistral-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Mistral Model outputting raw hidden-states without any specific head on top. + */ +export class MistralPreTrainedModel extends PreTrainedModel {} + +export class MistralModel extends MistralPreTrainedModel {} + +export class MistralForCausalLM extends MistralPreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-bert-pre-trained-model.js b/src/models/pre-trained-models/mobile-bert-pre-trained-model.js new file mode 100644 index 000000000..1a615980a --- /dev/null +++ b/src/models/pre-trained-models/mobile-bert-pre-trained-model.js @@ -0,0 +1,50 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput } from '../output.js'; + +export class MobileBertPreTrainedModel extends PreTrainedModel {} +export class MobileBertModel extends MobileBertPreTrainedModel {} + +/** + * MobileBertForMaskedLM is a class representing a MobileBERT model for masking task. + */ +export class MobileBertForMaskedLM extends MobileBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class MobileBertForSequenceClassification extends MobileBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * MobileBert Model with a span classification head on top for extractive question-answering tasks + */ +export class MobileBertForQuestionAnswering extends MobileBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/mobile-llm-pre-trained-model.js b/src/models/pre-trained-models/mobile-llm-pre-trained-model.js new file mode 100644 index 000000000..e1f20ee19 --- /dev/null +++ b/src/models/pre-trained-models/mobile-llm-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MobileLLMPreTrainedModel extends PreTrainedModel {} +export class MobileLLMModel extends MobileLLMPreTrainedModel {} +export class MobileLLMForCausalLM extends MobileLLMPreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v1-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v1-pre-trained-model.js new file mode 100644 index 000000000..c3f467dc4 --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v1-pre-trained-model.js @@ -0,0 +1,24 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV1PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV1 model outputting raw hidden-states without any specific head on top. + */ +export class MobileNetV1Model extends MobileNetV1PreTrainedModel {} + +/** + * MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v2-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v2-pre-trained-model.js new file mode 100644 index 000000000..9801a6c83 --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v2-pre-trained-model.js @@ -0,0 +1,23 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV2 model outputting raw hidden-states without any specific head on top. + */ +export class MobileNetV2Model extends MobileNetV2PreTrainedModel {} + +/** + * MobileNetV2 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v3-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v3-pre-trained-model.js new file mode 100644 index 000000000..c5564a4ac --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v3-pre-trained-model.js @@ -0,0 +1,23 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV3 model outputting raw hidden-states without any specific head on top. 
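The MobileNet (and MobileViT) classification heads in this group all return `SequenceClassifierOutput` logits and are typically used through the `image-classification` pipeline. An illustrative sketch; the checkpoint id and the printed labels are assumptions:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX-converted MobileNet image-classification checkpoint
const classifier = await pipeline('image-classification', 'Xenova/mobilenet_v2_1.0_224');

// Classify an image by URL
const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
const output = await classifier(url);
console.log(output); // e.g. [{ label: 'tabby, tabby cat', score: 0.3 }, ...]
```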
+ */ +export class MobileNetV3Model extends MobileNetV3PreTrainedModel {} + +/** + * MobileNetV3 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v4-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v4-pre-trained-model.js new file mode 100644 index 000000000..fbb6584d4 --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v4-pre-trained-model.js @@ -0,0 +1,23 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV4PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV4 model outputting raw hidden-states without any specific head on top. + */ +export class MobileNetV4Model extends MobileNetV4PreTrainedModel {} + +/** + * MobileNetV4 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-vi-t-pre-trained-model.js b/src/models/pre-trained-models/mobile-vi-t-pre-trained-model.js new file mode 100644 index 000000000..20fc9465b --- /dev/null +++ b/src/models/pre-trained-models/mobile-vi-t-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileViTPreTrainedModel extends PreTrainedModel {} +export class MobileViTModel extends MobileViTPreTrainedModel {} +export class MobileViTForImageClassification extends MobileViTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +// TODO: MobileViTForSemanticSegmentation diff --git a/src/models/pre-trained-models/mobile-vi-tv2-pre-trained-model.js b/src/models/pre-trained-models/mobile-vi-tv2-pre-trained-model.js new file mode 100644 index 000000000..a13e3334c --- /dev/null +++ b/src/models/pre-trained-models/mobile-vi-tv2-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileViTV2PreTrainedModel extends PreTrainedModel {} +export class MobileViTV2Model extends MobileViTV2PreTrainedModel {} +export class MobileViTV2ForImageClassification extends MobileViTV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +// TODO: MobileViTV2ForSemanticSegmentation diff --git a/src/models/pre-trained-models/modern-bert-decoder-pre-trained-model.js b/src/models/pre-trained-models/modern-bert-decoder-pre-trained-model.js new file mode 100644 index 000000000..117025583 --- /dev/null +++ 
b/src/models/pre-trained-models/modern-bert-decoder-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ModernBertDecoderPreTrainedModel extends PreTrainedModel {} +export class ModernBertDecoderModel extends ModernBertDecoderPreTrainedModel {} +export class ModernBertDecoderForCausalLM extends ModernBertDecoderPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/modern-bert-pre-trained-model.js b/src/models/pre-trained-models/modern-bert-pre-trained-model.js new file mode 100644 index 000000000..3387eeffb --- /dev/null +++ b/src/models/pre-trained-models/modern-bert-pre-trained-model.js @@ -0,0 +1,41 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, SequenceClassifierOutput, TokenClassifierOutput } from '../output.js'; + +export class ModernBertPreTrainedModel extends PreTrainedModel {} +export class ModernBertModel extends ModernBertPreTrainedModel {} + +export class ModernBertForMaskedLM extends ModernBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +export class ModernBertForSequenceClassification extends ModernBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class ModernBertForTokenClassification extends ModernBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/moonshine-pre-trained-model.js b/src/models/pre-trained-models/moonshine-pre-trained-model.js new file mode 100644 index 000000000..bad6b4b6d --- /dev/null +++ b/src/models/pre-trained-models/moonshine-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MoonshinePreTrainedModel extends PreTrainedModel { + requires_attention_mask = false; + main_input_name = 'input_values'; + forward_params = ['input_values', 'decoder_input_ids', 'past_key_values']; +} + +/** + * MoonshineModel class for training Moonshine models without a language model head. 
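`MoonshineForConditionalGeneration` is an encoder-decoder speech-to-text model and plugs into the same `automatic-speech-recognition` pipeline as Whisper. A sketch, assuming an ONNX Moonshine checkpoint (the model id is illustrative):

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX Moonshine checkpoint; Moonshine is a lightweight encoder-decoder ASR model
const transcriber = await pipeline('automatic-speech-recognition', 'onnx-community/moonshine-tiny-ONNX');

// Transcribe an audio file from a URL
const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
const { text } = await transcriber(url);
console.log(text);
```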
+ */ +export class MoonshineModel extends MoonshinePreTrainedModel {} + +export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel {} diff --git a/src/models/pre-trained-models/mp-net-pre-trained-model.js b/src/models/pre-trained-models/mp-net-pre-trained-model.js new file mode 100644 index 000000000..278f3b922 --- /dev/null +++ b/src/models/pre-trained-models/mp-net-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class MPNetPreTrainedModel extends PreTrainedModel {} + +/** + * The bare MPNet Model transformer outputting raw hidden-states without any specific head on top. + */ +export class MPNetModel extends MPNetPreTrainedModel {} + +/** + * MPNetForMaskedLM is a class representing a MPNet model for masked language modeling. + */ +export class MPNetForMaskedLM extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * MPNetForSequenceClassification is a class representing a MPNet model for sequence classification. + */ +export class MPNetForSequenceClassification extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * MPNetForTokenClassification is a class representing a MPNet model for token classification. + */ +export class MPNetForTokenClassification extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * MPNetForQuestionAnswering is a class representing a MPNet model for question answering. + */ +export class MPNetForQuestionAnswering extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/mpt-pre-trained-model.js b/src/models/pre-trained-models/mpt-pre-trained-model.js new file mode 100644 index 000000000..0e1d5504c --- /dev/null +++ b/src/models/pre-trained-models/mpt-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MptPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Mpt Model transformer outputting raw hidden-states without any specific head on top. + */ +export class MptModel extends MptPreTrainedModel {} + +/** + * The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). 
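The `MPNetModel` defined earlier in this diff is most commonly used for sentence embeddings via the `feature-extraction` pipeline with mean pooling and normalization. A sketch, assuming an ONNX conversion of a sentence-transformers MPNet checkpoint:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX conversion of a sentence-transformers MPNet model
const extractor = await pipeline('feature-extraction', 'Xenova/all-mpnet-base-v2');

// Mean-pool and L2-normalize to get one embedding per sentence
const sentences = ['That is a happy person', 'Today is a sunny day'];
const embeddings = await extractor(sentences, { pooling: 'mean', normalize: true });
console.log(embeddings.dims); // e.g. [2, 768]
```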
+ */ +export class MptForCausalLM extends MptPreTrainedModel {} diff --git a/src/models/pre-trained-models/mt5-pre-trained-model.js b/src/models/pre-trained-models/mt5-pre-trained-model.js new file mode 100644 index 000000000..ea3b9cea8 --- /dev/null +++ b/src/models/pre-trained-models/mt5-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MT5PreTrainedModel extends PreTrainedModel {} + +export class MT5Model extends MT5PreTrainedModel {} + +/** + * A class representing a conditional sequence-to-sequence model based on the MT5 architecture. + */ +export class MT5ForConditionalGeneration extends MT5PreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/multi-modality-pre-trained-model.js b/src/models/pre-trained-models/multi-modality-pre-trained-model.js new file mode 100644 index 000000000..c5685b8d1 --- /dev/null +++ b/src/models/pre-trained-models/multi-modality-pre-trained-model.js @@ -0,0 +1,113 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { pick } from '../../utils/core.js'; +import { decoderForward } from '../utils.js'; +import { RawImage } from '../../utils/image.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class MultiModalityPreTrainedModel extends PreTrainedModel {} +export class MultiModalityCausalLM extends MultiModalityPreTrainedModel { + forward_params = [ + // prepare_inputs_embeds + 'input_ids', + 'pixel_values', + 'images_seq_mask', + 'images_emb_mask', + + // language_model + 'attention_mask', + 'position_ids', + 'past_key_values', + ]; + + /** + * @param {ConstructorParameters} args + */ + constructor(...args) { + super(...args); + + // State-based approach to switch out which heads to use during generation + this._generation_mode = 'text'; + } + + async forward(model_inputs) { + const mode = this._generation_mode ?? 'text'; + + // TODO support re-using PKVs for input_ids.dims[1] !== 1 + // if (model_inputs.past_key_values) { + // // && model_inputs.input_ids.dims[1] === 1 + // } + + let output_1; + if (mode === 'text' || !model_inputs.past_key_values) { + const session = this.sessions['prepare_inputs_embeds']; + const prep_inputs = pick(model_inputs, session.inputNames); + output_1 = await sessionRun(session, prep_inputs); + } else { + const session = this.sessions['gen_img_embeds']; + const prep_inputs = pick( + { + image_ids: model_inputs.input_ids, + }, + session.inputNames, + ); + output_1 = await sessionRun(session, prep_inputs); + } + + const input_2 = { ...model_inputs, ...output_1 }; + const output_2 = await decoderForward(this, input_2); + + const head = this.sessions[mode === 'text' ? 'lm_head' : 'gen_head']; + if (!head) { + throw new Error(`Unable to find "${head}" generation head`); + } + + const output_3 = await sessionRun(head, pick(output_2, head.inputNames)); + + return { + ...output_1, + ...output_2, + ...output_3, + }; + } + + /** + * @param {import('../../generation/parameters.js').GenerationFunctionParameters} options + */ + async generate(options) { + this._generation_mode = 'text'; + return super.generate(options); + } + + /** + * @param {import('../../generation/parameters.js').GenerationFunctionParameters} options + */ + async generate_images(options) { + this._generation_mode = 'image'; + + const start_num_tokens = (options.inputs ?? 
options[this.main_input_name]).dims[1]; + const all_tokens = await super.generate(options); + + const generated_tokens = /** @type {Tensor} */ (all_tokens).slice(null, [start_num_tokens, null]); + + const image_decode = this.sessions['image_decode']; + const { decoded_image } = await sessionRun(image_decode, { + generated_tokens, + }); + + // Equivalent to `np.clip((dec + 1) / 2 * 255, 0, 255)` + const clamped = decoded_image + .add_(1) + .mul_(255 / 2) + .clamp_(0, 255) + .to('uint8'); + + // Return as a list of images + const images = []; + for (const tensor of clamped) { + const img = RawImage.fromTensor(tensor); + images.push(img); + } + return images; + } +} diff --git a/src/models/pre-trained-models/musicgen-pre-trained-model.js b/src/models/pre-trained-models/musicgen-pre-trained-model.js new file mode 100644 index 000000000..d00a30c00 --- /dev/null +++ b/src/models/pre-trained-models/musicgen-pre-trained-model.js @@ -0,0 +1,139 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { Tensor } from '../../utils/tensor.js'; +import { ModelOutput } from '../output.js'; + +export class MusicgenPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Musicgen decoder model outputting raw hidden-states without any specific head on top. + */ +export class MusicgenModel extends MusicgenPreTrainedModel {} + +/** + * The MusicGen decoder model with a language modelling head on top. + */ +export class MusicgenForCausalLM extends MusicgenPreTrainedModel {} + +/** + * The composite MusicGen model with a text encoder, audio encoder and Musicgen decoder, + * for music generation tasks with one or both of text and audio prompts. + * + * **Example:** Generate music from text with `Xenova/musicgen-small`. + * ```javascript + * import { AutoTokenizer, MusicgenForConditionalGeneration } from '@huggingface/transformers'; + * + * // Load tokenizer and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/musicgen-small'); + * const model = await MusicgenForConditionalGeneration.from_pretrained( + * 'Xenova/musicgen-small', { dtype: 'fp32' } + * ); + * + * // Prepare text input + * const prompt = '80s pop track with bassy drums and synth'; + * const inputs = tokenizer(prompt); + * + * // Generate audio + * const audio_values = await model.generate({ + * ...inputs, + * max_new_tokens: 512, + * do_sample: true, + * guidance_scale: 3, + * }); + * + * // (Optional) Write the output to a WAV file + * import wavefile from 'wavefile'; + * import fs from 'fs'; + * + * const wav = new wavefile.WaveFile(); + * wav.fromScratch(1, model.config.audio_encoder.sampling_rate, '32f', audio_values.data); + * fs.writeFileSync('musicgen_out.wav', wav.toBuffer()); + * ``` + */ +export class MusicgenForConditionalGeneration extends PreTrainedModel { + // NOTE: not MusicgenPreTrainedModel + forward_params = [ + 'input_ids', + 'attention_mask', + 'encoder_outputs', + 'decoder_input_ids', + 'decoder_attention_mask', + 'past_key_values', + ]; + + /** + * Apply the pattern mask to the final ids, + * then revert the pattern delay mask by filtering the pad token id in a single step. + * @param {Tensor} outputs The output tensor from the model. + * @returns {Tensor} The filtered output tensor. 
+ */ + _apply_and_filter_by_delay_pattern_mask(outputs) { + const [bs_x_codebooks, seqLength] = outputs.dims; + // @ts-expect-error TS2339 + const num_codebooks = this.config.decoder.num_codebooks; + const upperBound = seqLength - num_codebooks; + + let newDataSize = 0; + for (let i = 0; i < outputs.size; ++i) { + // @ts-expect-error TS2339 + if (outputs.data[i] === this.config.decoder.pad_token_id) { + continue; + } + + const row = i % seqLength; + const col = Math.floor(i / seqLength) % num_codebooks; + + const diff = row - col; + if (diff > 0 && diff <= upperBound) { + outputs.data[newDataSize++] = outputs.data[i]; + } + } + + const batch_size = Math.floor(bs_x_codebooks / num_codebooks); + const inferred = newDataSize / (batch_size * num_codebooks); + // TODO: assert `inferred` is an integer + return new Tensor(outputs.type, outputs.data.slice(0, newDataSize), [batch_size, num_codebooks, inferred]); + } + + prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { + // apply the delay pattern mask + let clonedInputIds = structuredClone(input_ids); + for (let i = 0; i < clonedInputIds.length; ++i) { + for (let j = 0; j < clonedInputIds[i].length; ++j) { + // @ts-expect-error TS2339 + if (i % this.config.decoder.num_codebooks >= j) { + // @ts-expect-error TS2339 + clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id); + } + } + } + // for classifier free guidance we need to replicate the decoder args across the batch dim + // (we'll split these before sampling) + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + // [batch, seqLength] -> [2 * batch, seqLength] + clonedInputIds = clonedInputIds.concat(clonedInputIds); + } + + const prepped = super.prepare_inputs_for_generation(clonedInputIds, model_inputs, generation_config); + return prepped; + } + + /** + * Generates sequences of token ids for models with a language modeling head. + * @param {import('../../generation/parameters.js').GenerationFunctionParameters} options + * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. 
+ */ + async generate(options) { + const output_ids = await super.generate(options); + + // apply the pattern mask to the final ids + // tensor: int64[1,batch_size,4,chunk_length] + const audio_codes = this._apply_and_filter_by_delay_pattern_mask(/** @type {Tensor} */ (output_ids)).unsqueeze_( + 0, + ); // append the frame dimension back to the audio codes + + const { audio_values } = await sessionRun(this.sessions['encodec_decode'], { audio_codes }); + + return audio_values; + } +} diff --git a/src/models/pre-trained-models/nano-chat-pre-trained-model.js b/src/models/pre-trained-models/nano-chat-pre-trained-model.js new file mode 100644 index 000000000..11935915e --- /dev/null +++ b/src/models/pre-trained-models/nano-chat-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class NanoChatPreTrainedModel extends PreTrainedModel {} +export class NanoChatModel extends NanoChatPreTrainedModel {} +export class NanoChatForCausalLM extends NanoChatPreTrainedModel {} diff --git a/src/models/pre-trained-models/neo-bert-pre-trained-model.js b/src/models/pre-trained-models/neo-bert-pre-trained-model.js new file mode 100644 index 000000000..5c352ff86 --- /dev/null +++ b/src/models/pre-trained-models/neo-bert-pre-trained-model.js @@ -0,0 +1,58 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + QuestionAnsweringModelOutput, +} from '../output.js'; + +export class NeoBertPreTrainedModel extends PreTrainedModel {} +export class NeoBertModel extends NeoBertPreTrainedModel {} + +export class NeoBertForMaskedLM extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +export class NeoBertForSequenceClassification extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class NeoBertForTokenClassification extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +export class NeoBertForQuestionAnswering extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. 
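+ *
+ * **Example:** Extractive question answering (a minimal sketch; the checkpoint id below is a
+ * hypothetical placeholder for a converted NeoBERT question-answering model).
+ * ```javascript
+ * import { AutoTokenizer, NeoBertForQuestionAnswering } from '@huggingface/transformers';
+ *
+ * // NOTE: hypothetical checkpoint id, shown for illustration only
+ * const model_id = 'onnx-community/NeoBERT-squad';
+ * const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+ * const model = await NeoBertForQuestionAnswering.from_pretrained(model_id);
+ *
+ * // Encode the question together with its context
+ * const inputs = tokenizer('Who wrote the play?', { text_pair: 'The play was written by Shakespeare.' });
+ *
+ * // The output contains start/end logits over the input tokens
+ * const { start_logits, end_logits } = await model(inputs);
+ * ```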
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/nomic-bert-pre-trained-model.js b/src/models/pre-trained-models/nomic-bert-pre-trained-model.js new file mode 100644 index 000000000..fc6b8b17c --- /dev/null +++ b/src/models/pre-trained-models/nomic-bert-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class NomicBertPreTrainedModel extends PreTrainedModel {} +export class NomicBertModel extends NomicBertPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/olmo-pre-trained-model.js b/src/models/pre-trained-models/olmo-pre-trained-model.js new file mode 100644 index 000000000..15c656d77 --- /dev/null +++ b/src/models/pre-trained-models/olmo-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OlmoPreTrainedModel extends PreTrainedModel {} +export class OlmoModel extends OlmoPreTrainedModel {} +export class OlmoForCausalLM extends OlmoPreTrainedModel {} diff --git a/src/models/pre-trained-models/olmo2-pre-trained-model.js b/src/models/pre-trained-models/olmo2-pre-trained-model.js new file mode 100644 index 000000000..53a88c7b9 --- /dev/null +++ b/src/models/pre-trained-models/olmo2-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Olmo2PreTrainedModel extends PreTrainedModel {} +export class Olmo2Model extends Olmo2PreTrainedModel {} +export class Olmo2ForCausalLM extends Olmo2PreTrainedModel {} diff --git a/src/models/pre-trained-models/olmo3-pre-trained-model.js b/src/models/pre-trained-models/olmo3-pre-trained-model.js new file mode 100644 index 000000000..77c98cb35 --- /dev/null +++ b/src/models/pre-trained-models/olmo3-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Olmo3PreTrainedModel extends PreTrainedModel {} +export class Olmo3Model extends Olmo3PreTrainedModel {} +export class Olmo3ForCausalLM extends Olmo3PreTrainedModel {} diff --git a/src/models/pre-trained-models/open-elm-pre-trained-model.js b/src/models/pre-trained-models/open-elm-pre-trained-model.js new file mode 100644 index 000000000..9c76a0342 --- /dev/null +++ b/src/models/pre-trained-models/open-elm-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OpenELMPreTrainedModel extends PreTrainedModel {} +export class OpenELMModel extends OpenELMPreTrainedModel {} + +export class OpenELMForCausalLM extends OpenELMPreTrainedModel {} diff --git a/src/models/pre-trained-models/opt-pre-trained-model.js b/src/models/pre-trained-models/opt-pre-trained-model.js new file mode 100644 index 000000000..a529a51bd --- /dev/null +++ b/src/models/pre-trained-models/opt-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OPTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare OPT Model outputting raw hidden-states without any specific head on top. + */ +export class OPTModel extends OPTPreTrainedModel {} + +/** + * The OPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). 
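+ *
+ * **Example:** Text generation with `OPTForCausalLM` (a minimal sketch; `Xenova/opt-125m` is
+ * assumed to be an available ONNX export; substitute any converted OPT checkpoint).
+ * ```javascript
+ * import { AutoTokenizer, OPTForCausalLM } from '@huggingface/transformers';
+ *
+ * const model_id = 'Xenova/opt-125m'; // assumed checkpoint id
+ * const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+ * const model = await OPTForCausalLM.from_pretrained(model_id);
+ *
+ * // Tokenize a prompt and generate a continuation
+ * const inputs = tokenizer('A long time ago,');
+ * const output_ids = await model.generate({ ...inputs, max_new_tokens: 20 });
+ * console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true }));
+ * ```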
+ */ +export class OPTForCausalLM extends OPTPreTrainedModel {} diff --git a/src/models/pre-trained-models/owl-vi-t-pre-trained-model.js b/src/models/pre-trained-models/owl-vi-t-pre-trained-model.js new file mode 100644 index 000000000..a74da309d --- /dev/null +++ b/src/models/pre-trained-models/owl-vi-t-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OwlViTPreTrainedModel extends PreTrainedModel {} +export class OwlViTModel extends OwlViTPreTrainedModel {} +export class OwlViTForObjectDetection extends OwlViTPreTrainedModel {} diff --git a/src/models/pre-trained-models/owlv2-pre-trained-model.js b/src/models/pre-trained-models/owlv2-pre-trained-model.js new file mode 100644 index 000000000..6e720ef74 --- /dev/null +++ b/src/models/pre-trained-models/owlv2-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Owlv2PreTrainedModel extends PreTrainedModel {} +export class Owlv2Model extends Owlv2PreTrainedModel {} +export class Owlv2ForObjectDetection extends Owlv2PreTrainedModel {} diff --git a/src/models/pre-trained-models/pali-gemma-pre-trained-model.js b/src/models/pre-trained-models/pali-gemma-pre-trained-model.js new file mode 100644 index 000000000..7745b08a0 --- /dev/null +++ b/src/models/pre-trained-models/pali-gemma-pre-trained-model.js @@ -0,0 +1,27 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { default_merge_input_ids_with_image_features } from '../utils.js'; + +export class PaliGemmaPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + // 'inputs_embeds', + 'attention_mask', + 'pixel_values', + 'position_ids', + 'past_key_values', + ]; +} + +export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel { + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_index, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/parakeet-pre-trained-model.js b/src/models/pre-trained-models/parakeet-pre-trained-model.js new file mode 100644 index 000000000..99245700a --- /dev/null +++ b/src/models/pre-trained-models/parakeet-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class ParakeetPreTrainedModel extends PreTrainedModel {} +export class ParakeetForCTC extends ParakeetPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. 
Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/patch-ts-mixer-pre-trained-model.js b/src/models/pre-trained-models/patch-ts-mixer-pre-trained-model.js new file mode 100644 index 000000000..c04dfd3ca --- /dev/null +++ b/src/models/pre-trained-models/patch-ts-mixer-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class PatchTSMixerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare PatchTSMixer Model outputting raw hidden-states without any specific head. + */ +export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel {} + +/** + * The PatchTSMixer for prediction model. + */ +export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel {} diff --git a/src/models/pre-trained-models/patch-tst-pre-trained-model.js b/src/models/pre-trained-models/patch-tst-pre-trained-model.js new file mode 100644 index 000000000..ba24a1b9f --- /dev/null +++ b/src/models/pre-trained-models/patch-tst-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class PatchTSTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare PatchTST Model outputting raw hidden-states without any specific head. + */ +export class PatchTSTModel extends PatchTSTPreTrainedModel {} + +/** + * The PatchTST for prediction model. + */ +export class PatchTSTForPrediction extends PatchTSTPreTrainedModel {} diff --git a/src/models/pre-trained-models/phi-pre-trained-model.js b/src/models/pre-trained-models/phi-pre-trained-model.js new file mode 100644 index 000000000..979e4c8ef --- /dev/null +++ b/src/models/pre-trained-models/phi-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class PhiPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Phi Model outputting raw hidden-states without any specific head on top. + */ +export class PhiModel extends PhiPreTrainedModel {} + +export class PhiForCausalLM extends PhiPreTrainedModel {} diff --git a/src/models/pre-trained-models/phi3-pre-trained-model.js b/src/models/pre-trained-models/phi3-pre-trained-model.js new file mode 100644 index 000000000..5bc47915c --- /dev/null +++ b/src/models/pre-trained-models/phi3-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Phi3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Phi3 Model outputting raw hidden-states without any specific head on top. 
+ */ +export class Phi3Model extends Phi3PreTrainedModel {} + +export class Phi3ForCausalLM extends Phi3PreTrainedModel {} diff --git a/src/models/pre-trained-models/phi3-v-pre-trained-model.js b/src/models/pre-trained-models/phi3-v-pre-trained-model.js new file mode 100644 index 000000000..9c5331a42 --- /dev/null +++ b/src/models/pre-trained-models/phi3-v-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { decoderForward } from '../utils.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Phi3VPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'position_ids', + 'pixel_values', + 'image_sizes', + 'past_key_values', + ]; +} +export class Phi3VForCausalLM extends Phi3VPreTrainedModel { + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + pixel_values = null, + image_sizes = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // TODO: needed? + ...kwargs + }) { + if (!inputs_embeds) { + let image_features; + if (pixel_values && input_ids.dims[1] !== 1) { + if (!image_sizes) { + throw new Error('`image_sizes` must be provided when `pixel_values` is provided.'); + } + + // Encode the image + ({ image_features } = await sessionRun(this.sessions['vision_encoder'], { + pixel_values, + image_sizes, + })); + } else { + const hidden_size = this.config.normalized_config.hidden_size; + image_features = new Tensor('float32', [], [0, hidden_size]); + } + + ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], { + input_ids, + image_features, + })); + } + + const outputs = await decoderForward( + this, + { + inputs_embeds, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, + false, + ); + return outputs; + } +} diff --git a/src/models/pre-trained-models/pvt-pre-trained-model.js b/src/models/pre-trained-models/pvt-pre-trained-model.js new file mode 100644 index 000000000..70ea887fb --- /dev/null +++ b/src/models/pre-trained-models/pvt-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class PvtPreTrainedModel extends PreTrainedModel {} +export class PvtModel extends PvtPreTrainedModel {} +export class PvtForImageClassification extends PvtPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/py-annote-pre-trained-model.js b/src/models/pre-trained-models/py-annote-pre-trained-model.js new file mode 100644 index 000000000..f7e02c169 --- /dev/null +++ b/src/models/pre-trained-models/py-annote-pre-trained-model.js @@ -0,0 +1,77 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { TokenClassifierOutput } from '../output.js'; + +export class PyAnnotePreTrainedModel extends PreTrainedModel {} + +/** + * The bare PyAnnote Model transformer outputting raw hidden-states without any specific head on top. + */ +export class PyAnnoteModel extends PyAnnotePreTrainedModel {} + +/** + * PyAnnote Model with a frame classification head on top for tasks like Speaker Diarization. 
+ * + * **Example:** Load and run a `PyAnnoteForAudioFrameClassification` for speaker diarization. + * + * ```javascript + * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; + * + * // Load model and processor + * const model_id = 'onnx-community/pyannote-segmentation-3.0'; + * const model = await AutoModelForAudioFrameClassification.from_pretrained(model_id); + * const processor = await AutoProcessor.from_pretrained(model_id); + * + * // Read and preprocess audio + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.wav'; + * const audio = await read_audio(url, processor.feature_extractor.config.sampling_rate); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const { logits } = await model(inputs); + * // { + * // logits: Tensor { + * // dims: [ 1, 767, 7 ], // [batch_size, num_frames, num_classes] + * // type: 'float32', + * // data: Float32Array(5369) [ ... ], + * // size: 5369 + * // } + * // } + * + * const result = processor.post_process_speaker_diarization(logits, audio.length); + * // [ + * // [ + * // { id: 0, start: 0, end: 1.0512535626298245, confidence: 0.8220156481664611 }, + * // { id: 2, start: 1.0512535626298245, end: 2.3398869619825127, confidence: 0.9008811707860472 }, + * // ... + * // ] + * // ] + * + * // Display result + * console.table(result[0], ['start', 'end', 'id', 'confidence']); + * // ┌─────────┬────────────────────┬────────────────────┬────┬─────────────────────┐ + * // │ (index) │ start │ end │ id │ confidence │ + * // ├─────────┼────────────────────┼────────────────────┼────┼─────────────────────┤ + * // │ 0 │ 0 │ 1.0512535626298245 │ 0 │ 0.8220156481664611 │ + * // │ 1 │ 1.0512535626298245 │ 2.3398869619825127 │ 2 │ 0.9008811707860472 │ + * // │ 2 │ 2.3398869619825127 │ 3.5946089560890773 │ 0 │ 0.7521651315796233 │ + * // │ 3 │ 3.5946089560890773 │ 4.578039708226655 │ 2 │ 0.8491978128022479 │ + * // │ 4 │ 4.578039708226655 │ 4.594995410849717 │ 0 │ 0.2935352600416393 │ + * // │ 5 │ 4.594995410849717 │ 6.121008646925269 │ 3 │ 0.6788051309866024 │ + * // │ 6 │ 6.121008646925269 │ 6.256654267909762 │ 0 │ 0.37125512393851134 │ + * // │ 7 │ 6.256654267909762 │ 8.630452635138397 │ 2 │ 0.7467035186353542 │ + * // │ 8 │ 8.630452635138397 │ 10.088643060721703 │ 0 │ 0.7689364814666032 │ + * // │ 9 │ 10.088643060721703 │ 12.58113134631177 │ 2 │ 0.9123324509131324 │ + * // │ 10 │ 12.58113134631177 │ 13.005023911888312 │ 0 │ 0.4828358177572041 │ + * // └─────────┴────────────────────┴────────────────────┴────┴─────────────────────┘ + * ``` + */ +export class PyAnnoteForAudioFrameClassification extends PyAnnotePreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/qwen2-pre-trained-model.js b/src/models/pre-trained-models/qwen2-pre-trained-model.js new file mode 100644 index 000000000..23cd85995 --- /dev/null +++ b/src/models/pre-trained-models/qwen2-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Qwen2 Model outputting raw hidden-states without any specific head on top. 
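+ *
+ * **Example:** Text generation with the Qwen2 family via the `pipeline` helper (a minimal
+ * sketch; the checkpoint id below is assumed to be an available ONNX export and may need to be
+ * swapped for one you have converted).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // NOTE: assumed checkpoint id, shown for illustration only
+ * const generator = await pipeline('text-generation', 'onnx-community/Qwen2.5-0.5B-Instruct');
+ * const output = await generator('Write a haiku about the sea.', { max_new_tokens: 64 });
+ * console.log(output[0].generated_text);
+ * ```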
+ */ +export class Qwen2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Qwen2 Model outputting raw hidden-states without any specific head on top. + */ +export class Qwen2Model extends Qwen2PreTrainedModel {} + +export class Qwen2ForCausalLM extends Qwen2PreTrainedModel {} diff --git a/src/models/pre-trained-models/qwen2-vl-pre-trained-model.js b/src/models/pre-trained-models/qwen2-vl-pre-trained-model.js new file mode 100644 index 000000000..92b5d256a --- /dev/null +++ b/src/models/pre-trained-models/qwen2-vl-pre-trained-model.js @@ -0,0 +1,251 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { stack, Tensor, ones_like, zeros } from '../../utils/tensor.js'; +import { max } from '../../utils/maths.js'; +import { cumsum_masked_fill, default_merge_input_ids_with_image_features } from '../utils.js'; + +export class Qwen2VLPreTrainedModel extends PreTrainedModel { + forward_params = [ + // Text inputs + 'input_ids', + 'attention_mask', + 'position_ids', + 'past_key_values', + + // Vision inputs + 'pixel_values', + 'image_grid_thw', + ]; +} +export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel { + /** + * Calculate the 3D rope index based on image and video's temporal, height and width in LLM. + * + * Explanation: + * Each embedding sequence contains vision embedding and text embedding or just contains text embedding. + * + * For a pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. + * Examples: + * input_ids: [T T T T T], here T is for text. + * temporal position_ids: [0, 1, 2, 3, 4] + * height position_ids: [0, 1, 2, 3, 4] + * width position_ids: [0, 1, 2, 3, 4] + * + * For a vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part + * and 1D rotary position embedding for the text part. + * Examples: + * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches. + * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision. + * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + * text temporal position_ids: [3, 4, 5, 6, 7] + * text height position_ids: [3, 4, 5, 6, 7] + * text width position_ids: [3, 4, 5, 6, 7] + * Here we calculate the text start position_ids as the max vision position_ids plus 1. + * + * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`. + * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`. + * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`. + * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`: + * - 1 for tokens that are **not masked**, + * - 0 for tokens that are **masked**. + * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with: + * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`. + * - mrope_position_deltas: Tensor of shape `(batch_size)`.
+ */ + get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) { + // @ts-ignore + const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config; + const spatial_merge_size = vision_config.spatial_merge_size ?? 2; + + const mrope_position_deltas = []; + if (image_grid_thw || video_grid_thw) { + let total_input_ids = input_ids.tolist(); + if (!attention_mask) { + attention_mask = ones_like(input_ids); + } + + const attention_mask_list = attention_mask.tolist(); + const position_ids_list = Array.from({ length: 3 }, (_) => + Array.from({ length: input_ids.dims[0] }, (_) => Array.from({ length: input_ids.dims[1] }, (_) => 1)), + ); + + const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : []; + const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : []; + + let image_index = 0; + let video_index = 0; + for (let i = 0; i < total_input_ids.length; ++i) { + const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1); + + const vision_start_indices = ids.reduce((acc, x, idx) => { + if (x == vision_start_token_id) acc.push(idx); + return acc; + }, []); + + const vision_tokens = vision_start_indices.map((x) => ids[x + 1]); + const image_nums = vision_tokens.filter((x) => x == image_token_id).length; + const video_nums = vision_tokens.filter((x) => x == video_token_id).length; + + /** @type {number[][]} */ + let llm_pos_ids_list = []; + let st = 0; + let remain_images = image_nums; + let remain_videos = video_nums; + for (let j = 0; j < vision_tokens.length; ++j) { + const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id); + const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id); + + const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1; + + const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1; + + let ed; + let t, h, w; + if (ed_image < ed_video) { + [t, h, w] = image_grid_thw_list[image_index]; + ++image_index; + --remain_images; + ed = ed_image; + } else { + [t, h, w] = video_grid_thw_list[video_index]; + ++video_index; + --remain_videos; + ed = ed_video; + } + + const [llm_grid_t, llm_grid_h, llm_grid_w] = [ + Number(t), + Math.floor(Number(h) / spatial_merge_size), + Math.floor(Number(w) / spatial_merge_size), + ]; + const text_len = ed - st; + const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0; + + llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); + + const offset = text_len + st_idx; + const grid_size = llm_grid_t * llm_grid_h * llm_grid_w; + const t_index = Array.from( + { length: grid_size }, + (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)), + ); + const h_index = Array.from( + { length: grid_size }, + (_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h), + ); + const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w)); + + llm_pos_ids_list.push([t_index, h_index, w_index].flat()); + + st = ed + grid_size; + } + + if (st < ids.length) { + const st_idx = llm_pos_ids_list.length > 0 ? 
max(llm_pos_ids_list.at(-1))[0] + 1 : 0; + const text_len = ids.length - st; + + llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); + } + + // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len), + // meaning to perform concatenation along dim=1, we can do the following: + const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0); + /** @type {number[]} */ + const llm_positions = new Array(num_items); + let index = 0; + for (let x = 0; x < 3; ++x) { + for (let y = 0; y < llm_pos_ids_list.length; ++y) { + const val = llm_pos_ids_list[y]; + const text_len = val.length / 3; + for (let z = x * text_len; z < (x + 1) * text_len; ++z) { + llm_positions[index++] = val[z]; + } + } + } + + let count = 0; + const attn_mask = attention_mask_list[i]; + for (let y = 0; y < attn_mask.length; ++y) { + if (attn_mask[y] == 1) { + for (let x = 0; x < 3; ++x) { + position_ids_list[x][i][y] = llm_positions[(x * num_items) / 3 + count]; + } + ++count; + } + } + + const max_llm_positions = max(llm_positions)[0]; + mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length); + } + + return [ + new Tensor('int64', position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]), + new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), + ]; + } else { + // Text-only + if (attention_mask) { + const { data, dims } = cumsum_masked_fill(attention_mask); + + const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]); + /** @type {bigint[]} */ + const mrope_position_deltas = Array.from( + { length: dims[0] }, + (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]), + ); + + return [ + new Tensor('int64', position_ids, [3, ...dims]), + new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), + ]; + } else { + const [batch_size, seq_length] = input_ids.dims; + const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) => + BigInt(Math.floor((i % seq_length) / batch_size)), + ); + + return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])]; + } + } + } + + async encode_image({ pixel_values, image_grid_thw }) { + const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, grid_thw: image_grid_thw })) + .image_features; + return features; + } + + _merge_input_ids_with_image_features(kwargs) { + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_id, + ...kwargs, + }); + } + + prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { + // Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if (model_inputs.attention_mask && !model_inputs.position_ids) { + // Calculate position_ids and rope_deltas + if (!model_inputs.past_key_values) { + [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index( + model_inputs.input_ids, + model_inputs.image_grid_thw, + model_inputs.video_grid_thw, + model_inputs.attention_mask, + ); + } else { + model_inputs.pixel_values = null; + // model_inputs.pixel_values_videos = null; + + const delta = BigInt(Object.values(model_inputs.past_key_values)[0].dims.at(-2)); + const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x); + model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0); + } + } + + return 
model_inputs; + } +} diff --git a/src/models/pre-trained-models/qwen3-pre-trained-model.js b/src/models/pre-trained-models/qwen3-pre-trained-model.js new file mode 100644 index 000000000..bcef352b5 --- /dev/null +++ b/src/models/pre-trained-models/qwen3-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Qwen3 Model outputting raw hidden-states without any specific head on top. + */ +export class Qwen3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Qwen3 Model outputting raw hidden-states without any specific head on top. + */ +export class Qwen3Model extends Qwen3PreTrainedModel {} + +export class Qwen3ForCausalLM extends Qwen3PreTrainedModel {} diff --git a/src/models/pre-trained-models/res-net-pre-trained-model.js b/src/models/pre-trained-models/res-net-pre-trained-model.js new file mode 100644 index 000000000..9bb37a428 --- /dev/null +++ b/src/models/pre-trained-models/res-net-pre-trained-model.js @@ -0,0 +1,24 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + */ +export class ResNetPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ResNet model outputting raw features without any specific head on top. + */ +export class ResNetModel extends ResNetPreTrainedModel {} + +/** + * ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. + */ +export class ResNetForImageClassification extends ResNetPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/rf-detr-pre-trained-model.js b/src/models/pre-trained-models/rf-detr-pre-trained-model.js new file mode 100644 index 000000000..9a7d79f4b --- /dev/null +++ b/src/models/pre-trained-models/rf-detr-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { RTDetrObjectDetectionOutput } from './rt-detr-pre-trained-model.js'; + +export class RFDetrPreTrainedModel extends PreTrainedModel {} +export class RFDetrModel extends RFDetrPreTrainedModel {} +export class RFDetrForObjectDetection extends RFDetrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RFDetrObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class RFDetrObjectDetectionOutput extends RTDetrObjectDetectionOutput {} diff --git a/src/models/pre-trained-models/ro-former-pre-trained-model.js b/src/models/pre-trained-models/ro-former-pre-trained-model.js new file mode 100644 index 000000000..a9f41d6e1 --- /dev/null +++ b/src/models/pre-trained-models/ro-former-pre-trained-model.js @@ -0,0 +1,77 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + MaskedLMOutput, +} from '../output.js'; + +export class RoFormerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. + */ +export class RoFormerModel extends RoFormerPreTrainedModel {} + +/** + * RoFormer Model with a `language modeling` head on top. 
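+ *
+ * **Example:** Masked token prediction via the `fill-mask` pipeline (a minimal sketch; the model
+ * id is a hypothetical placeholder, and the mask token depends on the checkpoint's tokenizer).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // NOTE: hypothetical checkpoint id, shown for illustration only
+ * const unmasker = await pipeline('fill-mask', 'onnx-community/roformer-base-ONNX');
+ * const output = await unmasker('The capital of France is [MASK].');
+ * // [{ sequence: '...', token_str: '...', score: ... }, ...]
+ * ```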
+ */ +export class RoFormerForMaskedLM extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class RoFormerForSequenceClassification extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) + * e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class RoFormerForTokenClassification extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD + * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class RoFormerForQuestionAnswering extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} +// TODO: Add RoFormerForCausalLM and RoFormerForMultipleChoice \ No newline at end of file diff --git a/src/models/pre-trained-models/roberta-pre-trained-model.js b/src/models/pre-trained-models/roberta-pre-trained-model.js new file mode 100644 index 000000000..982bc1b5c --- /dev/null +++ b/src/models/pre-trained-models/roberta-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class RobertaPreTrainedModel extends PreTrainedModel {} +export class RobertaModel extends RobertaPreTrainedModel {} + +/** + * RobertaForMaskedLM class for performing masked language modeling on Roberta models. + */ +export class RobertaForMaskedLM extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * RobertaForSequenceClassification class for performing sequence classification on Roberta models. + */ +export class RobertaForSequenceClassification extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RobertaForTokenClassification class for performing token classification on Roberta models. + */ +export class RobertaForTokenClassification extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RobertaForQuestionAnswering class for performing question answering on Roberta models. + */ +export class RobertaForQuestionAnswering extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/rt-detr-pre-trained-model.js b/src/models/pre-trained-models/rt-detr-pre-trained-model.js new file mode 100644 index 000000000..fa39c4678 --- /dev/null +++ b/src/models/pre-trained-models/rt-detr-pre-trained-model.js @@ -0,0 +1,28 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ModelOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class RTDetrPreTrainedModel extends PreTrainedModel {} +export class RTDetrModel extends RTDetrPreTrainedModel {} +export class RTDetrForObjectDetection extends RTDetrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class RTDetrObjectDetectionOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). 
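+ *
+ * For reference, a normalized `(center_x, center_y, width, height)` box can be converted to
+ * absolute corner coordinates as follows (a small sketch, independent of any post-processing
+ * helpers the library provides):
+ * ```javascript
+ * const toCorners = ([cx, cy, w, h], imgWidth, imgHeight) => [
+ *   (cx - w / 2) * imgWidth,  // x_min
+ *   (cy - h / 2) * imgHeight, // y_min
+ *   (cx + w / 2) * imgWidth,  // x_max
+ *   (cy + h / 2) * imgHeight, // y_max
+ * ];
+ * ```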
+ */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } +} diff --git a/src/models/pre-trained-models/rt-detr-v2-pre-trained-model.js b/src/models/pre-trained-models/rt-detr-v2-pre-trained-model.js new file mode 100644 index 000000000..02a9398e4 --- /dev/null +++ b/src/models/pre-trained-models/rt-detr-v2-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { RTDetrObjectDetectionOutput } from './rt-detr-pre-trained-model.js'; + +export class RTDetrV2PreTrainedModel extends PreTrainedModel {} +export class RTDetrV2Model extends RTDetrV2PreTrainedModel {} +export class RTDetrV2ForObjectDetection extends RTDetrV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RTDetrV2ObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class RTDetrV2ObjectDetectionOutput extends RTDetrObjectDetectionOutput {} diff --git a/src/models/pre-trained-models/sam-pre-trained-model.js b/src/models/pre-trained-models/sam-pre-trained-model.js new file mode 100644 index 000000000..3b4d9d0cf --- /dev/null +++ b/src/models/pre-trained-models/sam-pre-trained-model.js @@ -0,0 +1,130 @@ +import { encoderForward } from '../utils.js'; +import { ones } from '../../utils/tensor.js'; +import { sessionRun } from '../session.js'; +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SamImageSegmentationOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class SamPreTrainedModel extends PreTrainedModel {} + +/** + * Segment Anything Model (SAM) for generating segmentation masks, given an input image + * and optional 2D location and bounding boxes. + * + * **Example:** Perform mask generation w/ `Xenova/sam-vit-base`. + * ```javascript + * import { SamModel, AutoProcessor, RawImage } from '@huggingface/transformers'; + * + * const model = await SamModel.from_pretrained('Xenova/sam-vit-base'); + * const processor = await AutoProcessor.from_pretrained('Xenova/sam-vit-base'); + * + * const img_url = 'https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png'; + * const raw_image = await RawImage.read(img_url); + * const input_points = [[[450, 600]]] // 2D localization of a window + * + * const inputs = await processor(raw_image, { input_points }); + * const outputs = await model(inputs); + * + * const masks = await processor.post_process_masks(outputs.pred_masks, inputs.original_sizes, inputs.reshaped_input_sizes); + * // [ + * // Tensor { + * // dims: [ 1, 3, 1764, 2646 ], + * // type: 'bool', + * // data: Uint8Array(14002632) [ ... ], + * // size: 14002632 + * // } + * // ] + * const scores = outputs.iou_scores; + * // Tensor { + * // dims: [ 1, 1, 3 ], + * // type: 'float32', + * // data: Float32Array(3) [ + * // 0.8892380595207214, + * // 0.9311248064041138, + * // 0.983696699142456 + * // ], + * // size: 3 + * // } + * ``` + */ +export class SamModel extends SamPreTrainedModel { + /** + * Compute image embeddings and positional image embeddings, given the pixel values of an image. + * @param {Object} model_inputs Object containing the model inputs. + * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. + * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. 
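+ *
+ * **Example:** Reuse image embeddings across several prompts (a minimal sketch that builds on
+ * the class-level example above; computing the embeddings once avoids re-running the vision
+ * encoder for every new point).
+ * ```javascript
+ * // `model`, `processor` and `raw_image` as in the example above
+ * const inputs = await processor(raw_image, { input_points: [[[450, 600]]] });
+ * const embeddings = await model.get_image_embeddings(inputs);
+ *
+ * // Subsequent calls can skip the vision encoder by passing the cached embeddings
+ * const outputs = await model({ ...inputs, ...embeddings });
+ * ```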
+ */ + async get_image_embeddings({ pixel_values }) { + // in: + // - pixel_values: tensor.float32[batch_size,3,1024,1024] + // + // out: + // - image_embeddings: tensor.float32[batch_size,256,64,64] + // - image_positional_embeddings: tensor.float32[batch_size,256,64,64] + return await encoderForward(this, { pixel_values }); + } + + /** + * @typedef {Object} SamModelInputs Object containing the model inputs. + * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. + * These can be obtained using a `SamProcessor`. + * @property {Tensor} [input_points] Input 2D spatial points with shape `(batch_size, num_points, 2)`. + * This is used by the prompt encoder to encode the prompt. + * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. + * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: + * - `1`: the point is a point that contains the object of interest + * - `0`: the point is a point that does not contain the object of interest + * - `-1`: the point corresponds to the background + * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder + * @property {Tensor} [input_boxes] Input bounding boxes with shape `(batch_size, num_boxes, 4)`. + * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. + * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. + */ + + /** + * @param {SamModelInputs} model_inputs Object containing the model inputs. + * @returns {Promise} The output of the model. + */ + async forward(model_inputs) { + if (!model_inputs.image_embeddings || !model_inputs.image_positional_embeddings) { + // Compute the image embeddings if they are missing + model_inputs = { + ...model_inputs, + ...(await this.get_image_embeddings(model_inputs)), + }; + } else { + model_inputs = { ...model_inputs }; + } + + // Set default input labels if they are missing + model_inputs.input_labels ??= ones(model_inputs.input_points.dims.slice(0, -1)); + + const decoder_inputs = { + image_embeddings: model_inputs.image_embeddings, + image_positional_embeddings: model_inputs.image_positional_embeddings, + }; + if (model_inputs.input_points) { + decoder_inputs.input_points = model_inputs.input_points; + } + if (model_inputs.input_labels) { + decoder_inputs.input_labels = model_inputs.input_labels; + } + if (model_inputs.input_boxes) { + decoder_inputs.input_boxes = model_inputs.input_boxes; + } + + // Returns: + // - iou_scores: tensor.float32[batch_size,point_batch_size,3] + // - pred_masks: tensor.float32[batch_size,point_batch_size,3,256,256] + return await sessionRun(this.sessions['prompt_encoder_mask_decoder'], decoder_inputs); + } + + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + async _call(model_inputs) { + return new SamImageSegmentationOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/sam2-pre-trained-model.js b/src/models/pre-trained-models/sam2-pre-trained-model.js new file mode 100644 index 000000000..e32cca1d9 --- /dev/null +++ b/src/models/pre-trained-models/sam2-pre-trained-model.js @@ -0,0 +1,81 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { encoderForward } from '../utils.js'; +import { sessionRun } from '../session.js'; +import { 
Sam2ImageSegmentationOutput } from '../output.js'; +import { ones, full } from '../../utils/tensor.js'; +import { pick } from '../../utils/core.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Sam2PreTrainedModel extends PreTrainedModel {} +export class Sam2Model extends Sam2PreTrainedModel { + /** + * Compute image embeddings and positional image embeddings, given the pixel values of an image. + * @param {Object} model_inputs Object containing the model inputs. + * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `Sam2Processor`. + * @returns {Promise>} The image embeddings. + */ + async get_image_embeddings({ pixel_values }) { + // in: + // - pixel_values: tensor.float32[batch_size,3,1024,1024] + // + // out: + // - image_embeddings.0: tensor.float32[batch_size,32,256,256] + // - image_embeddings.1: tensor.float32[batch_size,64,128,128] + // - image_embeddings.2: tensor.float32[batch_size,256,64,64] + return await encoderForward(this, { pixel_values }); + } + + async forward(model_inputs) { + // @ts-expect-error ts(2339) + const { num_feature_levels } = this.config.vision_config; + const image_embeddings_name = Array.from({ length: num_feature_levels }, (_, i) => `image_embeddings.${i}`); + + if (image_embeddings_name.some((name) => !model_inputs[name])) { + // Compute the image embeddings if they are missing + model_inputs = { + ...model_inputs, + ...(await this.get_image_embeddings(model_inputs)), + }; + } else { + model_inputs = { ...model_inputs }; + } + + if (model_inputs.input_points) { + if (model_inputs.input_boxes && model_inputs.input_boxes.dims[1] !== 1) { + throw new Error( + 'When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.', + ); + } + const shape = model_inputs.input_points.dims; + model_inputs.input_labels ??= ones(shape.slice(0, -1)); + model_inputs.input_boxes ??= full([shape[0], 0, 4], 0.0); + } else if (model_inputs.input_boxes) { + // only boxes + const shape = model_inputs.input_boxes.dims; + model_inputs.input_labels = full([shape[0], shape[1], 0], -1n); + model_inputs.input_points = full([shape[0], 1, 0, 2], 0.0); + } else { + throw new Error('At least one of `input_points` or `input_boxes` must be provided.'); + } + + const prompt_encoder_mask_decoder_session = this.sessions['prompt_encoder_mask_decoder']; + const decoder_inputs = pick(model_inputs, prompt_encoder_mask_decoder_session.inputNames); + + // Returns: + // - iou_scores: tensor.float32[batch_size,num_boxes_or_points,3] + // - pred_masks: tensor.float32[batch_size,num_boxes_or_points,3,256,256] + // - object_score_logits: tensor.float32[batch_size,num_boxes_or_points,1] + return await sessionRun(prompt_encoder_mask_decoder_session, decoder_inputs); + } + + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + async _call(model_inputs) { + return new Sam2ImageSegmentationOutput(await super._call(model_inputs)); + } +} +export class EdgeTamModel extends Sam2Model {} // NOTE: extends Sam2Model +export class Sam3TrackerModel extends Sam2Model {} // NOTE: extends Sam2Model \ No newline at end of file diff --git a/src/models/pre-trained-models/sapiens-pre-trained-model.js b/src/models/pre-trained-models/sapiens-pre-trained-model.js new file mode 100644 index 000000000..8b9c24256 --- /dev/null +++ b/src/models/pre-trained-models/sapiens-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from 
'../pre-trained-model.js'; + +export class SapiensPreTrainedModel extends PreTrainedModel {} +export class SapiensForSemanticSegmentation extends SapiensPreTrainedModel {} +export class SapiensForDepthEstimation extends SapiensPreTrainedModel {} +export class SapiensForNormalEstimation extends SapiensPreTrainedModel {} diff --git a/src/models/pre-trained-models/segformer-pre-trained-model.js b/src/models/pre-trained-models/segformer-pre-trained-model.js new file mode 100644 index 000000000..e8f28c211 --- /dev/null +++ b/src/models/pre-trained-models/segformer-pre-trained-model.js @@ -0,0 +1,18 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class SegformerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. + */ +export class SegformerModel extends SegformerPreTrainedModel {} + +/** + * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet. + */ +export class SegformerForImageClassification extends SegformerPreTrainedModel {} + +/** + * SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes. + */ +export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel {} diff --git a/src/models/pre-trained-models/siglip-pre-trained-model.js b/src/models/pre-trained-models/siglip-pre-trained-model.js new file mode 100644 index 000000000..81c26dcef --- /dev/null +++ b/src/models/pre-trained-models/siglip-pre-trained-model.js @@ -0,0 +1,123 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CLIPPreTrainedModel } from './clip-pre-trained-model.js'; + +export class SiglipPreTrainedModel extends PreTrainedModel {} + +/** + * SigLIP Text and Vision Model with a projection layers on top + * + * **Example:** Perform zero-shot image classification with a `SiglipModel`. + * + * ```javascript + * import { AutoTokenizer, AutoProcessor, SiglipModel, RawImage } from '@huggingface/transformers'; + * + * // Load tokenizer, processor, and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); + * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); + * const model = await SiglipModel.from_pretrained('Xenova/siglip-base-patch16-224'); + * + * // Run tokenization + * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; + * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); + * + * // Read image and run processor + * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg'); + * const image_inputs = await processor(image); + * + * // Run model with both text and pixel inputs + * const output = await model({ ...text_inputs, ...image_inputs }); + * // { + * // logits_per_image: Tensor { + * // dims: [ 1, 2 ], + * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], + * // }, + * // logits_per_text: Tensor { + * // dims: [ 2, 1 ], + * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], + * // }, + * // text_embeds: Tensor { + * // dims: [ 2, 768 ], + * // data: Float32Array(1536) [ ... ], + * // }, + * // image_embeds: Tensor { + * // dims: [ 1, 768 ], + * // data: Float32Array(768) [ ... 
], + * // } + * // } + * ``` + */ +export class SiglipModel extends SiglipPreTrainedModel {} + +/** + * The text model from SigLIP without any head or projection on top. + * + * **Example:** Compute text embeddings with `SiglipTextModel`. + * + * ```javascript + * import { AutoTokenizer, SiglipTextModel } from '@huggingface/transformers'; + * + * // Load tokenizer and text model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); + * const text_model = await SiglipTextModel.from_pretrained('Xenova/siglip-base-patch16-224'); + * + * // Run tokenization + * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; + * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); + * + * // Compute embeddings + * const { pooler_output } = await text_model(text_inputs); + * // Tensor { + * // dims: [ 2, 768 ], + * // type: 'float32', + * // data: Float32Array(1536) [ ... ], + * // size: 1536 + * // } + * ``` + */ +export class SiglipTextModel extends SiglipPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * The vision model from SigLIP without any head or projection on top. + * + * **Example:** Compute vision embeddings with `SiglipVisionModel`. + * + * ```javascript + * import { AutoProcessor, SiglipVisionModel, RawImage} from '@huggingface/transformers'; + * + * // Load processor and vision model + * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); + * const vision_model = await SiglipVisionModel.from_pretrained('Xenova/siglip-base-patch16-224'); + * + * // Read image and run processor + * const image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * const image_inputs = await processor(image); + * + * // Compute embeddings + * const { pooler_output } = await vision_model(image_inputs); + * // Tensor { + * // dims: [ 1, 768 ], + * // type: 'float32', + * // data: Float32Array(768) [ ... ], + * // size: 768 + * // } + * ``` + */ +export class SiglipVisionModel extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 
'vision_model', + }); + } +} diff --git a/src/models/pre-trained-models/smol-lm3-pre-trained-model.js b/src/models/pre-trained-models/smol-lm3-pre-trained-model.js new file mode 100644 index 000000000..77c02063c --- /dev/null +++ b/src/models/pre-trained-models/smol-lm3-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class SmolLM3PreTrainedModel extends PreTrainedModel {} +export class SmolLM3Model extends SmolLM3PreTrainedModel {} +export class SmolLM3ForCausalLM extends SmolLM3PreTrainedModel {} diff --git a/src/models/pre-trained-models/snac-pre-trained-model.js b/src/models/pre-trained-models/snac-pre-trained-model.js new file mode 100644 index 000000000..fb1c84035 --- /dev/null +++ b/src/models/pre-trained-models/snac-pre-trained-model.js @@ -0,0 +1,53 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class SnacPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_values'; + forward_params = ['input_values']; +} + +/** + * The SNAC (Multi-Scale Neural Audio Codec) model. + */ +export class SnacModel extends SnacPreTrainedModel { + /** + * Encodes the input audio waveform into discrete codes. + * @param {Object} inputs Model inputs + * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). + * @returns {Promise>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`. + */ + async encode(inputs) { + return await sessionRun(this.sessions['encoder_model'], inputs); + } + + /** + * Decodes the given frames into an output audio waveform. + * @param {Record} inputs The encoded audio codes. + * @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`. + */ + async decode(inputs) { + return await sessionRun(this.sessions['decoder_model'], inputs); + } +} + +export class SnacEncoderModel extends SnacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'encoder_model', + }); + } +} +export class SnacDecoderModel extends SnacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'decoder_model', + }); + } +} diff --git a/src/models/pre-trained-models/speech-t5-pre-trained-model.js b/src/models/pre-trained-models/speech-t5-pre-trained-model.js new file mode 100644 index 000000000..2f0a051a7 --- /dev/null +++ b/src/models/pre-trained-models/speech-t5-pre-trained-model.js @@ -0,0 +1,165 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { Tensor, boolTensor, cat } from '../../utils/tensor.js'; +import { encoderForward } from '../utils.js'; + +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
+ */ +export class SpeechT5PreTrainedModel extends PreTrainedModel {} + +/** + * The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets. + */ +export class SpeechT5Model extends SpeechT5PreTrainedModel {} + +/** + * SpeechT5 Model with a speech encoder and a text decoder. + * + * **Example:** Generate speech from text with `SpeechT5ForSpeechToText`. + * ```javascript + * import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@huggingface/transformers'; + * + * // Load the tokenizer and processor + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts'); + * const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts'); + * + * // Load the models + * // NOTE: We use the full-precision versions as they are more accurate + * const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { dtype: 'fp32' }); + * const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { dtype: 'fp32' }); + * + * // Load speaker embeddings from URL + * const speaker_embeddings_data = new Float32Array( + * await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer() + * ); + * const speaker_embeddings = new Tensor( + * 'float32', + * speaker_embeddings_data, + * [1, speaker_embeddings_data.length] + * ) + * + * // Run tokenization + * const { input_ids } = tokenizer('Hello, my dog is cute'); + * + * // Generate waveform + * const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder }); + * console.log(waveform) + * // Tensor { + * // dims: [ 26112 ], + * // type: 'float32', + * // size: 26112, + * // data: Float32Array(26112) [ -0.00043630177970044315, -0.00018082228780258447, ... ], + * // } + * ``` + */ +export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel {} + +/** + * SpeechT5 Model with a text encoder and a speech decoder. + */ +export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { + /** + * @typedef {Object} SpeechOutput + * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape + * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided + * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. + * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape + * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. returned when `output_cross_attentions` is `true`. + */ + + /** + * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. + * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. + * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. + * @param {Object} options Optional parameters for generating speech. + * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. + * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. + * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence. 
+ * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. + * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. + * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. + */ + async generate_speech( + input_values, + speaker_embeddings, + { + threshold = 0.5, + minlenratio = 0.0, + maxlenratio = 20.0, + vocoder = null, + // output_cross_attentions = false, // TODO add + } = {}, + ) { + const model_inputs = { + input_ids: input_values, + }; + + const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs); + + // @ts-expect-error TS2339 + const r = encoder_outputs.dims[1] / this.config.reduction_factor; + const maxlen = Math.floor(r * maxlenratio); + const minlen = Math.floor(r * minlenratio); + + // @ts-expect-error TS2339 + const num_mel_bins = this.config.num_mel_bins; + + let spectrogramParts = []; + let past_key_values = null; + let decoder_outputs = null; + let idx = 0; + + while (true) { + ++idx; + + const use_cache_branch = boolTensor(!!decoder_outputs); + let output_sequence; + if (decoder_outputs) { + output_sequence = decoder_outputs.output_sequence_out; + } else { + output_sequence = new Tensor('float32', new Float32Array(num_mel_bins), [1, 1, num_mel_bins]); + } + let decoderFeeds = { + use_cache_branch, + output_sequence, + encoder_attention_mask: encoder_attention_mask, + speaker_embeddings: speaker_embeddings, + encoder_hidden_states: encoder_outputs, + }; + + this.addPastKeyValues(decoderFeeds, past_key_values); + decoder_outputs = await sessionRun(this.sessions['decoder_model_merged'], decoderFeeds); + past_key_values = this.getPastKeyValues(decoder_outputs, past_key_values); + + const { prob, spectrum } = decoder_outputs; + spectrogramParts.push(spectrum); + + if ( + idx >= minlen && + // Finished when stop token or maximum length is reached. + (Array.from(prob.data).filter((p) => p >= threshold).length > 0 || idx >= maxlen) + ) { + break; + } + } + + const spectrogram = cat(spectrogramParts); + const { waveform } = await sessionRun(vocoder.sessions['model'], { spectrogram }); + + return { + spectrogram, + waveform, + // cross_attentions: null, // TODO add + }; + } +} + +/** + * HiFi-GAN vocoder. + * + * See [SpeechT5ForSpeechToText](./models#module_models.SpeechT5ForSpeechToText) for example usage. + */ +export class SpeechT5HifiGan extends PreTrainedModel { + main_input_name = 'spectrogram'; +} diff --git a/src/models/pre-trained-models/squeeze-bert-pre-trained-model.js b/src/models/pre-trained-models/squeeze-bert-pre-trained-model.js new file mode 100644 index 000000000..9c9b19f72 --- /dev/null +++ b/src/models/pre-trained-models/squeeze-bert-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput } from '../output.js'; + +export class SqueezeBertPreTrainedModel extends PreTrainedModel {} +export class SqueezeBertModel extends SqueezeBertPreTrainedModel {} +export class SqueezeBertForMaskedLM extends SqueezeBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} +export class SqueezeBertForSequenceClassification extends SqueezeBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class SqueezeBertForQuestionAnswering extends SqueezeBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/stable-lm-pre-trained-model.js b/src/models/pre-trained-models/stable-lm-pre-trained-model.js new file mode 100644 index 000000000..9f7876955 --- /dev/null +++ b/src/models/pre-trained-models/stable-lm-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class StableLmPreTrainedModel extends PreTrainedModel {} + +/** + * The bare StableLm Model transformer outputting raw hidden-states without any specific head on top. + */ +export class StableLmModel extends StableLmPreTrainedModel {} + +/** + * StableLm Model with a `language modeling` head on top for Causal Language Modeling (with past). + */ +export class StableLmForCausalLM extends StableLmPreTrainedModel {} diff --git a/src/models/pre-trained-models/starcoder2-pre-trained-model.js b/src/models/pre-trained-models/starcoder2-pre-trained-model.js new file mode 100644 index 000000000..4814e056a --- /dev/null +++ b/src/models/pre-trained-models/starcoder2-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Starcoder2 Model outputting raw hidden-states without any specific head on top. 
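+ *
+ * A minimal text-generation sketch with the `Starcoder2ForCausalLM` class exported below.
+ * The checkpoint id is a placeholder — substitute any compatible ONNX export.
+ *
+ * ```javascript
+ * import { AutoTokenizer, Starcoder2ForCausalLM } from '@huggingface/transformers';
+ *
+ * // Load tokenizer and model (placeholder checkpoint id)
+ * const model_id = 'onnx-community/starcoder2-3b';
+ * const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+ * const model = await Starcoder2ForCausalLM.from_pretrained(model_id);
+ *
+ * // Tokenize a code prompt and generate a completion
+ * const inputs = tokenizer('def fibonacci(n):');
+ * const output_ids = await model.generate({ ...inputs, max_new_tokens: 32 });
+ * console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true })[0]);
+ * ```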
+ */ +export class Starcoder2PreTrainedModel extends PreTrainedModel {} + +export class Starcoder2Model extends Starcoder2PreTrainedModel {} + +export class Starcoder2ForCausalLM extends Starcoder2PreTrainedModel {} diff --git a/src/models/pre-trained-models/style-text-to-speech2-pre-trained-model.js b/src/models/pre-trained-models/style-text-to-speech2-pre-trained-model.js new file mode 100644 index 000000000..21006e035 --- /dev/null +++ b/src/models/pre-trained-models/style-text-to-speech2-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel {} +export class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel {} diff --git a/src/models/pre-trained-models/supertonic-pre-trained-model.js b/src/models/pre-trained-models/supertonic-pre-trained-model.js new file mode 100644 index 000000000..69686ba59 --- /dev/null +++ b/src/models/pre-trained-models/supertonic-pre-trained-model.js @@ -0,0 +1,59 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ones, full, randn } from '../../utils/tensor.js'; +import { sessionRun } from '../session.js'; + +export class SupertonicPreTrainedModel extends PreTrainedModel {} +export class SupertonicForConditionalGeneration extends SupertonicPreTrainedModel { + async generate_speech({ + // Required inputs + input_ids, + attention_mask, + style, + + // Optional inputs + num_inference_steps = 5, + speed = 1.05, + }) { + // @ts-expect-error TS2339 + const { sampling_rate, chunk_compress_factor, base_chunk_size, latent_dim } = this.config; + + // 1. Text Encoder + const { last_hidden_state, durations } = await sessionRun(this.sessions['text_encoder'], { + input_ids, + attention_mask, + style, + }); + durations.div_(speed); // Apply speed factor to duration + + // 2. Latent Denoiser + const wav_len_max = durations.max().item() * sampling_rate; + const chunk_size = base_chunk_size * chunk_compress_factor; + const latent_len = Math.floor((wav_len_max + chunk_size - 1) / chunk_size); + const batch_size = input_ids.dims[0]; + const latent_mask = ones([batch_size, latent_len]); + const num_steps = full([batch_size], num_inference_steps); + + let noisy_latents = randn([batch_size, latent_dim * chunk_compress_factor, latent_len]); + for (let step = 0; step < num_inference_steps; ++step) { + const timestep = full([batch_size], step); + ({ denoised_latents: noisy_latents } = await sessionRun(this.sessions['latent_denoiser'], { + style, + noisy_latents, + latent_mask, + encoder_outputs: last_hidden_state, + attention_mask, + timestep, + num_inference_steps: num_steps, + })); + } + + // 3. 
Voice Decoder + const { waveform } = await sessionRun(this.sessions['voice_decoder'], { + latents: noisy_latents, + }); + return { + waveform, + durations, + }; + } +} diff --git a/src/models/pre-trained-models/swin-pre-trained-model.js b/src/models/pre-trained-models/swin-pre-trained-model.js new file mode 100644 index 000000000..082f81f89 --- /dev/null +++ b/src/models/pre-trained-models/swin-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class SwinPreTrainedModel extends PreTrainedModel {} +export class SwinModel extends SwinPreTrainedModel {} +export class SwinForImageClassification extends SwinPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class SwinForSemanticSegmentation extends SwinPreTrainedModel {} diff --git a/src/models/pre-trained-models/swin2-sr-pre-trained-model.js b/src/models/pre-trained-models/swin2-sr-pre-trained-model.js new file mode 100644 index 000000000..ac1494bdb --- /dev/null +++ b/src/models/pre-trained-models/swin2-sr-pre-trained-model.js @@ -0,0 +1,42 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Swin2SRPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Swin2SRModel extends Swin2SRPreTrainedModel {} + +/** + * Swin2SR Model transformer with an upsampler head on top for image super resolution and restoration. + * + * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64`. + * + * ```javascript + * import { AutoProcessor, Swin2SRForImageSuperResolution, RawImage } from '@huggingface/transformers'; + * + * // Load processor and model + * const model_id = 'Xenova/swin2SR-classical-sr-x2-64'; + * const processor = await AutoProcessor.from_pretrained(model_id); + * const model = await Swin2SRForImageSuperResolution.from_pretrained(model_id); + * + * // Prepare model inputs + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/butterfly.jpg'; + * const image = await RawImage.fromURL(url); + * const inputs = await processor(image); + * + * // Run model + * const outputs = await model(inputs); + * + * // Convert Tensor to RawImage + * const output = outputs.reconstruction.squeeze().clamp_(0, 1).mul_(255).round_().to('uint8'); + * const outputImage = RawImage.fromTensor(output); + * // RawImage { + * // data: Uint8Array(786432) [ 41, 31, 24, ... ], + * // width: 512, + * // height: 512, + * // channels: 3 + * // } + * ``` + */ +export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel {} diff --git a/src/models/pre-trained-models/t5-pre-trained-model.js b/src/models/pre-trained-models/t5-pre-trained-model.js new file mode 100644 index 000000000..96118b833 --- /dev/null +++ b/src/models/pre-trained-models/t5-pre-trained-model.js @@ -0,0 +1,19 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class T5PreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'attention_mask', + 'encoder_outputs', + 'decoder_input_ids', + 'decoder_attention_mask', + 'past_key_values', + ]; +} + +export class T5Model extends T5PreTrainedModel {} + +/** + * T5Model is a class representing a T5 model for conditional generation. 
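+ *
+ * A minimal text-to-text generation sketch with `T5ForConditionalGeneration`, assuming the
+ * `Xenova/flan-t5-small` checkpoint is available on the Hub.
+ *
+ * ```javascript
+ * import { AutoTokenizer, T5ForConditionalGeneration } from '@huggingface/transformers';
+ *
+ * // Load tokenizer and model (assumed checkpoint)
+ * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/flan-t5-small');
+ * const model = await T5ForConditionalGeneration.from_pretrained('Xenova/flan-t5-small');
+ *
+ * // Run tokenization and generate
+ * const inputs = tokenizer('translate English to German: I love transformers.');
+ * const output_ids = await model.generate({ ...inputs, max_new_tokens: 40 });
+ * console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true })[0]);
+ * ```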
+ */ +export class T5ForConditionalGeneration extends T5PreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/table-transformer-pre-trained-model.js b/src/models/pre-trained-models/table-transformer-pre-trained-model.js new file mode 100644 index 000000000..dfcb9ad9b --- /dev/null +++ b/src/models/pre-trained-models/table-transformer-pre-trained-model.js @@ -0,0 +1,24 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { DetrObjectDetectionOutput } from './detr-pre-trained-model.js'; + +export class TableTransformerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) + * outputting raw hidden-states without any specific head on top. + */ +export class TableTransformerModel extends TableTransformerPreTrainedModel {} + +/** + * Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) + * with object detection heads on top, for tasks such as COCO detection. + */ +export class TableTransformerForObjectDetection extends TableTransformerPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new TableTransformerObjectDetectionOutput(await super._call(model_inputs)); + } +} +export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput {} diff --git a/src/models/pre-trained-models/tr-ocr-pre-trained-model.js b/src/models/pre-trained-models/tr-ocr-pre-trained-model.js new file mode 100644 index 000000000..9ecaa7c6f --- /dev/null +++ b/src/models/pre-trained-models/tr-ocr-pre-trained-model.js @@ -0,0 +1,8 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class TrOCRPreTrainedModel extends PreTrainedModel {} + +/** + * The TrOCR Decoder with a language modeling head. + */ +export class TrOCRForCausalLM extends TrOCRPreTrainedModel {} diff --git a/src/models/pre-trained-models/ultravox-pre-trained-model.js b/src/models/pre-trained-models/ultravox-pre-trained-model.js new file mode 100644 index 000000000..1c3f6022d --- /dev/null +++ b/src/models/pre-trained-models/ultravox-pre-trained-model.js @@ -0,0 +1,22 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { default_merge_input_ids_with_audio_features } from '../utils.js'; + +export class UltravoxPreTrainedModel extends PreTrainedModel { + forward_params = ['input_ids', 'attention_mask', 'position_ids', 'audio_values', 'past_key_values']; +} + +export class UltravoxModel extends UltravoxPreTrainedModel { + _merge_input_ids_with_audio_features(kwargs) { + const audio_hidden_size = kwargs.audio_features.dims.at(-1); + const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); + + return default_merge_input_ids_with_audio_features({ + // @ts-ignore + audio_token_id: this.config.ignore_index ?? 
this.config.audio_token_id, + ...kwargs, + audio_features: reshaped_audio_features, + }); + } +} + +export class VoxtralForConditionalGeneration extends UltravoxModel {} diff --git a/src/models/pre-trained-models/uni-speech-pre-trained-model.js b/src/models/pre-trained-models/uni-speech-pre-trained-model.js new file mode 100644 index 000000000..2d26b9c90 --- /dev/null +++ b/src/models/pre-trained-models/uni-speech-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class UniSpeechPreTrainedModel extends PreTrainedModel {} + +/** + * The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top. + */ +export class UniSpeechModel extends UniSpeechPreTrainedModel {} + +/** + * UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class UniSpeechForCTC extends UniSpeechPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class UniSpeechForSequenceClassification extends UniSpeechPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/uni-speech-sat-pre-trained-model.js b/src/models/pre-trained-models/uni-speech-sat-pre-trained-model.js new file mode 100644 index 000000000..a66b6f2a4 --- /dev/null +++ b/src/models/pre-trained-models/uni-speech-sat-pre-trained-model.js @@ -0,0 +1,52 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class UniSpeechSatPreTrainedModel extends PreTrainedModel {} + +/** + * The bare UniSpeechSat Model transformer outputting raw hidden-states without any specific head on top. + */ +export class UniSpeechSatModel extends UniSpeechSatPreTrainedModel {} + +/** + * UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class UniSpeechSatForCTC extends UniSpeechSatPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeechSat Model with a sequence classification head on top (a linear layer over the pooled output). 
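+ *
+ * A minimal audio-classification sketch with `UniSpeechSatForSequenceClassification`.
+ * The checkpoint id is a placeholder — substitute any compatible ONNX export.
+ *
+ * ```javascript
+ * import { AutoProcessor, UniSpeechSatForSequenceClassification, read_audio } from '@huggingface/transformers';
+ *
+ * // Read and preprocess audio (placeholder checkpoint id)
+ * const model_id = 'onnx-community/unispeech-sat-base-plus-sid';
+ * const processor = await AutoProcessor.from_pretrained(model_id);
+ * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000);
+ * const inputs = await processor(audio);
+ *
+ * // Run classification
+ * const model = await UniSpeechSatForSequenceClassification.from_pretrained(model_id);
+ * const { logits } = await model(inputs);
+ * ```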
+ */ +export class UniSpeechSatForSequenceClassification extends UniSpeechSatPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeechSat Model with a frame classification head on top for tasks like Speaker Diarization. + */ +export class UniSpeechSatForAudioFrameClassification extends UniSpeechSatPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vault-gemma-pre-trained-model.js b/src/models/pre-trained-models/vault-gemma-pre-trained-model.js new file mode 100644 index 000000000..fb2223a2b --- /dev/null +++ b/src/models/pre-trained-models/vault-gemma-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class VaultGemmaPreTrainedModel extends PreTrainedModel {} +export class VaultGemmaModel extends VaultGemmaPreTrainedModel {} +export class VaultGemmaForCausalLM extends VaultGemmaPreTrainedModel {} diff --git a/src/models/pre-trained-models/vi-t-pre-trained-model.js b/src/models/pre-trained-models/vi-t-pre-trained-model.js new file mode 100644 index 000000000..2f9589c59 --- /dev/null +++ b/src/models/pre-trained-models/vi-t-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ViTPreTrainedModel extends PreTrainedModel {} +export class ViTModel extends ViTPreTrainedModel {} +export class ViTForImageClassification extends ViTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vi-tmae-pre-trained-model.js b/src/models/pre-trained-models/vi-tmae-pre-trained-model.js new file mode 100644 index 000000000..c25e7b910 --- /dev/null +++ b/src/models/pre-trained-models/vi-tmae-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ViTMAEPreTrainedModel extends PreTrainedModel {} +export class ViTMAEModel extends ViTMAEPreTrainedModel {} diff --git a/src/models/pre-trained-models/vi-tmsn-pre-trained-model.js b/src/models/pre-trained-models/vi-tmsn-pre-trained-model.js new file mode 100644 index 000000000..47f577064 --- /dev/null +++ b/src/models/pre-trained-models/vi-tmsn-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ViTMSNPreTrainedModel extends PreTrainedModel {} +export class ViTMSNModel extends ViTMSNPreTrainedModel {} +export class ViTMSNForImageClassification extends ViTMSNPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vision-encoder-decoder-model.js b/src/models/pre-trained-models/vision-encoder-decoder-model.js new file mode 
100644 index 000000000..6066dd24a --- /dev/null +++ b/src/models/pre-trained-models/vision-encoder-decoder-model.js @@ -0,0 +1,17 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks + */ +export class VisionEncoderDecoderModel extends PreTrainedModel { + main_input_name = 'pixel_values'; + forward_params = [ + // Encoder inputs + 'pixel_values', + + // Decoder inpputs + 'decoder_input_ids', + 'encoder_hidden_states', + 'past_key_values', + ]; +} diff --git a/src/models/pre-trained-models/vit-matte-pre-trained-model.js b/src/models/pre-trained-models/vit-matte-pre-trained-model.js new file mode 100644 index 000000000..b8b351c41 --- /dev/null +++ b/src/models/pre-trained-models/vit-matte-pre-trained-model.js @@ -0,0 +1,64 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ImageMattingOutput } from '../output.js'; + +export class VitMattePreTrainedModel extends PreTrainedModel {} + +/** + * ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes. + * + * **Example:** Perform image matting with a `VitMatteForImageMatting` model. + * ```javascript + * import { AutoProcessor, VitMatteForImageMatting, RawImage } from '@huggingface/transformers'; + * + * // Load processor and model + * const processor = await AutoProcessor.from_pretrained('Xenova/vitmatte-small-distinctions-646'); + * const model = await VitMatteForImageMatting.from_pretrained('Xenova/vitmatte-small-distinctions-646'); + * + * // Load image and trimap + * const image = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_image.png'); + * const trimap = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_trimap.png'); + * + * // Prepare image + trimap for the model + * const inputs = await processor(image, trimap); + * + * // Predict alpha matte + * const { alphas } = await model(inputs); + * // Tensor { + * // dims: [ 1, 1, 640, 960 ], + * // type: 'float32', + * // size: 614400, + * // data: Float32Array(614400) [ 0.9894027709960938, 0.9970508813858032, ... 
] + * // } + * ``` + * + * You can visualize the alpha matte as follows: + * ```javascript + * import { Tensor, cat } from '@huggingface/transformers'; + * + * // Visualize predicted alpha matte + * const imageTensor = image.toTensor(); + * + * // Convert float (0-1) alpha matte to uint8 (0-255) + * const alphaChannel = alphas + * .squeeze(0) + * .mul_(255) + * .clamp_(0, 255) + * .round_() + * .to('uint8'); + * + * // Concatenate original image with predicted alpha + * const imageData = cat([imageTensor, alphaChannel], 0); + * + * // Save output image + * const outputImage = RawImage.fromTensor(imageData); + * outputImage.save('output.png'); + * ``` + */ +export class VitMatteForImageMatting extends VitMattePreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new ImageMattingOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vit-pose-pre-trained-model.js b/src/models/pre-trained-models/vit-pose-pre-trained-model.js new file mode 100644 index 000000000..1c9d2faf9 --- /dev/null +++ b/src/models/pre-trained-models/vit-pose-pre-trained-model.js @@ -0,0 +1,8 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class VitPosePreTrainedModel extends PreTrainedModel {} + +/** + * The VitPose model with a pose estimation head on top. + */ +export class VitPoseForPoseEstimation extends VitPosePreTrainedModel {} diff --git a/src/models/pre-trained-models/vits-pre-trained-model.js b/src/models/pre-trained-models/vits-pre-trained-model.js new file mode 100644 index 000000000..e7a558d0e --- /dev/null +++ b/src/models/pre-trained-models/vits-pre-trained-model.js @@ -0,0 +1,39 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { VitsModelOutput } from '../output.js'; + +export class VitsPreTrainedModel extends PreTrainedModel {} + +/** + * The complete VITS model, for text-to-speech synthesis. + * + * **Example:** Generate speech from text with `VitsModel`. + * ```javascript + * import { AutoTokenizer, VitsModel } from '@huggingface/transformers'; + * + * // Load the tokenizer and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/mms-tts-eng'); + * const model = await VitsModel.from_pretrained('Xenova/mms-tts-eng'); + * + * // Run tokenization + * const inputs = tokenizer('I love transformers'); + * + * // Generate waveform + * const { waveform } = await model(inputs); + * // Tensor { + * // dims: [ 1, 35328 ], + * // type: 'float32', + * // data: Float32Array(35328) [ ... ], + * // size: 35328, + * // } + * ``` + */ +export class VitsModel extends VitsPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} The outputs for the VITS model. + */ + async _call(model_inputs) { + return new VitsModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/wav-lm-pre-trained-model.js b/src/models/pre-trained-models/wav-lm-pre-trained-model.js new file mode 100644 index 000000000..46b2d881c --- /dev/null +++ b/src/models/pre-trained-models/wav-lm-pre-trained-model.js @@ -0,0 +1,155 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput, XVectorOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
+ */ +export class WavLMPreTrainedModel extends PreTrainedModel {} + +/** + * The bare WavLM Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Load and run a `WavLMModel` for feature extraction. + * + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base'); + * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/wavlm-base'); + * const output = await model(inputs); + * // { + * // last_hidden_state: Tensor { + * // dims: [ 1, 549, 768 ], + * // type: 'float32', + * // data: Float32Array(421632) [-0.349443256855011, -0.39341306686401367, 0.022836603224277496, ...], + * // size: 421632 + * // } + * // } + * ``` + */ +export class WavLMModel extends WavLMPreTrainedModel {} + +/** + * WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class WavLMForCTC extends WavLMPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * WavLM Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class WavLMForSequenceClassification extends WavLMPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification. + * + * **Example:** Extract speaker embeddings with `WavLMForXVector`. + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sv'); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; + * const audio = await read_audio(url, 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/wavlm-base-plus-sv'); + * const outputs = await model(inputs); + * // { + * // logits: Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [0.5847219228744507, ...], + * // size: 512 + * // }, + * // embeddings: Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [-0.09079201519489288, ...], + * // size: 512 + * // } + * // } + * ``` + */ +export class WavLMForXVector extends WavLMPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits and speaker embeddings. 
+ */ + async _call(model_inputs) { + return new XVectorOutput(await super._call(model_inputs)); + } +} + +/** + * WavLM Model with a frame classification head on top for tasks like Speaker Diarization. + * + * **Example:** Perform speaker diarization with `WavLMForAudioFrameClassification`. + * ```javascript + * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sd'); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; + * const audio = await read_audio(url, 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModelForAudioFrameClassification.from_pretrained('Xenova/wavlm-base-plus-sd'); + * const { logits } = await model(inputs); + * // { + * // logits: Tensor { + * // dims: [ 1, 549, 2 ], // [batch_size, num_frames, num_speakers] + * // type: 'float32', + * // data: Float32Array(1098) [-3.5301010608673096, ...], + * // size: 1098 + * // } + * // } + * + * const labels = logits[0].sigmoid().tolist().map( + * frames => frames.map(speaker => speaker > 0.5 ? 1 : 0) + * ); + * console.log(labels); // labels is a one-hot array of shape (num_frames, num_speakers) + * // [ + * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], + * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], + * // [0, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], + * // ... + * // ] + * ``` + */ +export class WavLMForAudioFrameClassification extends WavLMPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/wav2-vec2-bert-pre-trained-model.js b/src/models/pre-trained-models/wav2-vec2-bert-pre-trained-model.js new file mode 100644 index 000000000..4719694c3 --- /dev/null +++ b/src/models/pre-trained-models/wav2-vec2-bert-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Wav2Vec2BertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Wav2Vec2Bert Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Wav2Vec2BertModel extends Wav2Vec2BertPreTrainedModel {} + +/** + * Wav2Vec2Bert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class Wav2Vec2BertForCTC extends Wav2Vec2BertPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_features Float values of input mel-spectrogram. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * Wav2Vec2Bert Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class Wav2Vec2BertForSequenceClassification extends Wav2Vec2BertPreTrainedModel { + /** + * Calls the model on new inputs. 
+ * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/wav2-vec2-pre-trained-model.js b/src/models/pre-trained-models/wav2-vec2-pre-trained-model.js new file mode 100644 index 000000000..d175a579b --- /dev/null +++ b/src/models/pre-trained-models/wav2-vec2-pre-trained-model.js @@ -0,0 +1,69 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Wav2Vec2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Load and run a `Wav2Vec2Model` for feature extraction. + * + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/mms-300m'); + * const audio = await read_audio('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac', 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/mms-300m'); + * const output = await model(inputs); + * // { + * // last_hidden_state: Tensor { + * // dims: [ 1, 1144, 1024 ], + * // type: 'float32', + * // data: Float32Array(1171456) [ ... ], + * // size: 1171456 + * // } + * // } + * ``` + */ +export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel {} + +export class Wav2Vec2ForCTC extends Wav2Vec2PreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization. + */ +export class Wav2Vec2ForAudioFrameClassification extends Wav2Vec2PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/we-speaker-res-net-pre-trained-model.js b/src/models/pre-trained-models/we-speaker-res-net-pre-trained-model.js new file mode 100644 index 000000000..8290eaf8c --- /dev/null +++ b/src/models/pre-trained-models/we-speaker-res-net-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class WeSpeakerResNetPreTrainedModel extends PreTrainedModel {} +export class WeSpeakerResNetModel extends WeSpeakerResNetPreTrainedModel {} diff --git a/src/models/pre-trained-models/whisper-pre-trained-model.js b/src/models/pre-trained-models/whisper-pre-trained-model.js new file mode 100644 index 000000000..96f280812 --- /dev/null +++ b/src/models/pre-trained-models/whisper-pre-trained-model.js @@ -0,0 +1,293 @@ +import { cat, mean, Tensor, stack, std_mean } from '../../utils/tensor.js'; +import { PreTrainedModel } from '../pre-trained-model.js'; +import { WhisperGenerationConfig } from '../model-processors/whisper/generation_whisper.js'; +import { whisper_language_to_code } from '../model-processors/whisper/common_whisper.js'; +import { + LogitsProcessorList, + SuppressTokensAtBeginLogitsProcessor, + WhisperTimeStampLogitsProcessor, +} from '../../generation/logits_process.js'; +import { medianFilter, dynamic_time_warping } from '../../utils/maths.js'; +import { mergeArrays } from '../../utils/core.js'; +import { ModelOutput } from '../output.js'; + +export class WhisperPreTrainedModel extends PreTrainedModel { + requires_attention_mask = false; + main_input_name = 'input_features'; + forward_params = [ + 'input_features', + 'attention_mask', + 'decoder_input_ids', + 'decoder_attention_mask', + 'past_key_values', + ]; +} + +/** + * WhisperModel class for training Whisper models without a language model head. + */ +export class WhisperModel extends WhisperPreTrainedModel {} + +/** + * WhisperForConditionalGeneration class for generating conditional outputs from Whisper models. + */ +export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { + _prepare_generation_config(generation_config, kwargs) { + return /** @type {WhisperGenerationConfig} */ ( + super._prepare_generation_config(generation_config, kwargs, WhisperGenerationConfig) + ); + } + + /** + * + * @param {WhisperGenerationConfig} generation_config + */ + _retrieve_init_tokens(generation_config) { + // prefix tokens are of the form: + // - Multilingual: <|startoftranscript|> <|lang_id|> <|task|> [<|notimestamps|>] + // - English-only: <|startoftranscript|> [<|notimestamps|>] + + // 1. Handle <|startoftranscript|> token + const init_tokens = [generation_config.decoder_start_token_id]; + + // 2. Handle <|lang_id|> and <|task> tokens + let language = generation_config.language; + const task = generation_config.task; + if (generation_config.is_multilingual) { + if (!language) { + // TODO: Implement language detection + console.warn('No language specified - defaulting to English (en).'); + language = 'en'; + } + + // Add language token + const language_code = whisper_language_to_code(language); + const language_token = `<|${language_code}|>`; + init_tokens.push(generation_config.lang_to_id[language_token]); + + // Add task token + // NOTE: Defaults to 'transcribe' if no task is specified + init_tokens.push(generation_config.task_to_id[task ?? 
'transcribe']); + } else if (language || task) { + throw new Error( + 'Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.', + ); + } + + // 3. Handle <|notimestamps|> token + if ( + !generation_config.return_timestamps && + generation_config.no_timestamps_token_id && + init_tokens.at(-1) !== generation_config.no_timestamps_token_id + ) { + init_tokens.push(generation_config.no_timestamps_token_id); + } else if ( + generation_config.return_timestamps && + init_tokens.at(-1) === generation_config.no_timestamps_token_id + ) { + console.warn( + '<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`.', + ); + init_tokens.pop(); + } + + // let's make sure we don't pass `null` tokens as prompt tokens + return init_tokens.filter((token) => token != null); + } + + /** + * Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids. + * @param {import('../model-processors/whisper/generation_whisper.js').WhisperGenerationFunctionParameters} options + * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. + */ + async generate({ + inputs = null, + generation_config = null, + logits_processor = null, + stopping_criteria = null, + + // Whisper-specific options (passed to kwargs) + // prompt_ids = null, + // language = null, + // task = null, + + ...kwargs + }) { + generation_config = this._prepare_generation_config(generation_config, kwargs); + + const init_tokens = kwargs.decoder_input_ids ?? this._retrieve_init_tokens(generation_config); + + if (generation_config.return_timestamps) { + logits_processor ??= new LogitsProcessorList(); + logits_processor.push(new WhisperTimeStampLogitsProcessor(generation_config, init_tokens)); + } + + if (generation_config.begin_suppress_tokens) { + logits_processor ??= new LogitsProcessorList(); + logits_processor.push( + new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, init_tokens.length), + ); + } + + if (generation_config.return_token_timestamps) { + if (!generation_config.alignment_heads) { + throw new Error( + 'Model generation config has no `alignment_heads`, token-level timestamps not available. ' + + 'See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.', + ); + } + + if (generation_config.task === 'translate') { + console.warn("Token-level timestamps may not be reliable for task 'translate'."); + } + + generation_config.output_attentions = true; + generation_config.return_dict_in_generate = true; + } + + const outputs = await super.generate({ + inputs, + generation_config, + logits_processor, + decoder_input_ids: init_tokens, + ...kwargs, + }); + + if (generation_config.return_token_timestamps) { + outputs['token_timestamps'] = this._extract_token_timestamps( + // @ts-expect-error TS2345 + outputs, + generation_config.alignment_heads, + generation_config.num_frames, + ); + } + + return outputs; + } + + /** + * Calculates token-level timestamps using the encoder-decoder cross-attentions and + * dynamic time-warping (DTW) to map each output token to a position in the input audio. + * If `num_frames` is specified, the encoder-decoder cross-attentions will be cropped before applying DTW. 
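+ *
+ * A minimal usage sketch (assuming a Whisper checkpoint whose generation config defines
+ * `alignment_heads`): this method is normally invoked internally by `generate` when
+ * `return_token_timestamps` is enabled.
+ * ```javascript
+ * const outputs = await model.generate({
+ *   ...inputs, // processor output containing `input_features`
+ *   return_token_timestamps: true,
+ * });
+ * // outputs.token_timestamps holds per-token times (in seconds) for each generated sequence
+ * ```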
+ * @param {Object} generate_outputs Outputs generated by the model + * @param {Tensor[][]} generate_outputs.cross_attentions The cross attentions output by the model + * @param {Tensor} generate_outputs.sequences The sequences output by the model + * @param {number[][]} alignment_heads Alignment heads of the model + * @param {number} [num_frames=null] Number of frames in the input audio. + * @param {number} [time_precision=0.02] Precision of the timestamps in seconds + * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token + */ + _extract_token_timestamps(generate_outputs, alignment_heads, num_frames = null, time_precision = 0.02) { + if (!generate_outputs.cross_attentions) { + throw new Error( + 'Model outputs must contain cross attentions to extract timestamps. ' + + 'This is most likely because the model was not exported with `output_attentions=True`.', + ); + } + if (num_frames == null) { + console.warn( + '`num_frames` has not been set, meaning the entire audio will be analyzed. ' + + 'This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).', + ); + } + + // @ts-expect-error TS2339 + let median_filter_width = this.config.median_filter_width; + if (median_filter_width === undefined) { + console.warn('Model config has no `median_filter_width`, using default value of 7.'); + median_filter_width = 7; + } + + // TODO: Improve batch processing + const batch = generate_outputs.cross_attentions; + // Create a list with `decoder_layers` elements, each a tensor of shape + // (batch size, attention_heads, output length, input length). + const cross_attentions = Array.from( + // @ts-expect-error TS2339 + { length: this.config.decoder_layers }, + // Concatenate the cross attentions for each layer across sequence length dimension. + (_, i) => + cat( + batch.map((x) => x[i]), + 2, + ), + ); + + const weights = stack( + alignment_heads.map(([l, h]) => { + if (l >= cross_attentions.length) { + throw new Error( + `Layer index ${l} is out of bounds for cross attentions (length ${cross_attentions.length}).`, + ); + } + return num_frames + ? cross_attentions[l].slice(null, h, null, [0, num_frames]) + : cross_attentions[l].slice(null, h); + }), + ).transpose(1, 0, 2, 3); + + const [std, calculatedMean] = std_mean(weights, -2, 0, true); + + // Normalize and smoothen the weights. + const smoothedWeights = weights.clone(); // [1, 8, seqLength, 1500] + + for (let a = 0; a < smoothedWeights.dims[0]; ++a) { + const aTensor = smoothedWeights[a]; // [8, seqLength, 1500] + + for (let b = 0; b < aTensor.dims[0]; ++b) { + const bTensor = aTensor[b]; // [seqLength, 1500] + + const stdTensorData = std[a][b][0].data; // [1500] + const meanTensorData = calculatedMean[a][b][0].data; // [1500] + + for (let c = 0; c < bTensor.dims[0]; ++c) { + let cTensorData = bTensor[c].data; // [1500] + for (let d = 0; d < cTensorData.length; ++d) { + cTensorData[d] = (cTensorData[d] - meanTensorData[d]) / stdTensorData[d]; + } + + // Apply median filter. + cTensorData.set(medianFilter(cTensorData, median_filter_width)); + } + } + } + + // Average the different cross-attention heads. + const batchedMatrices = [mean(smoothedWeights, 1)]; + + const timestampsShape = generate_outputs.sequences.dims; + + const timestamps = new Tensor( + 'float32', + new Float32Array(timestampsShape[0] * timestampsShape[1]), + timestampsShape, + ); + + // Perform dynamic time warping on each element of the batch. 
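+    // For each sequence, DTW aligns output token indices (text_indices) with input audio
+    // frame indices (time_indices) over the negated attention matrix. Wherever the token
+    // index advances (a "jump"), the paired frame marks the start of a new token, and
+    // multiplying that frame index by `time_precision` gives the token's start time in seconds.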
+ for (let batch_idx = 0; batch_idx < timestampsShape[0]; ++batch_idx) { + // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions + // as the python implementation + const matrix = batchedMatrices[batch_idx].neg().squeeze_(0); + const [text_indices, time_indices] = dynamic_time_warping(matrix.tolist()); + + const diffs = Array.from( + { length: text_indices.length - 1 }, + (v, i) => text_indices[i + 1] - text_indices[i], + ); + const jumps = mergeArrays([1], diffs).map((x) => !!x); // convert to boolean + + const jump_times = []; + for (let i = 0; i < jumps.length; ++i) { + if (jumps[i]) { + // NOTE: No point in rounding here, since we set to Float32Array later + jump_times.push(time_indices[i] * time_precision); + } + } + timestamps[batch_idx].data.set(jump_times, 1); + } + + return timestamps; + } +} + +export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration {} diff --git a/src/models/pre-trained-models/xlm-pre-trained-model.js b/src/models/pre-trained-models/xlm-pre-trained-model.js new file mode 100644 index 000000000..e477cf130 --- /dev/null +++ b/src/models/pre-trained-models/xlm-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class XLMPreTrainedModel extends PreTrainedModel {} + +/** + * The bare XLM Model transformer outputting raw hidden-states without any specific head on top. + */ +export class XLMModel extends XLMPreTrainedModel {} + +/** + * The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). + */ +export class XLMWithLMHeadModel extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class XLMForSequenceClassification extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) + */ +export class XLMForTokenClassification extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLM Model with a span classification head on top for extractive question-answering tasks + */ +export class XLMForQuestionAnswering extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/xlm-roberta-pre-trained-model.js b/src/models/pre-trained-models/xlm-roberta-pre-trained-model.js new file mode 100644 index 000000000..72eb14911 --- /dev/null +++ b/src/models/pre-trained-models/xlm-roberta-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class XLMRobertaPreTrainedModel extends PreTrainedModel {} +export class XLMRobertaModel extends XLMRobertaPreTrainedModel {} + +/** + * XLMRobertaForMaskedLM class for performing masked language modeling on XLMRoberta models. + */ +export class XLMRobertaForMaskedLM extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * XLMRobertaForSequenceClassification class for performing sequence classification on XLMRoberta models. + */ +export class XLMRobertaForSequenceClassification extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLMRobertaForTokenClassification class for performing token classification on XLMRoberta models. + */ +export class XLMRobertaForTokenClassification extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLMRobertaForQuestionAnswering class for performing question answering on XLMRoberta models. + */ +export class XLMRobertaForQuestionAnswering extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/yolos-pre-trained-model.js b/src/models/pre-trained-models/yolos-pre-trained-model.js new file mode 100644 index 000000000..6423a795a --- /dev/null +++ b/src/models/pre-trained-models/yolos-pre-trained-model.js @@ -0,0 +1,28 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ModelOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class YolosPreTrainedModel extends PreTrainedModel {} +export class YolosModel extends YolosPreTrainedModel {} +export class YolosForObjectDetection extends YolosPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new YolosObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class YolosObjectDetectionOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. 
+ * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). + */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } +} diff --git a/src/models/processors.js b/src/models/processors.js index 0348315ad..942710a23 100644 --- a/src/models/processors.js +++ b/src/models/processors.js @@ -1,24 +1,24 @@ -export * from './chatterbox/processing_chatterbox.js'; -export * from './florence2/processing_florence2.js'; -export * from './gemma3n/processing_gemma3n.js'; -export * from './grounding_dino/processing_grounding_dino.js'; -export * from './idefics3/processing_idefics3.js'; -export * from './janus/processing_janus.js'; -export * from './jina_clip/processing_jina_clip.js'; -export * from './llava/processing_llava.js'; -export * from './mgp_str/processing_mgp_str.js'; -export * from './moonshine/processing_moonshine.js'; -export * from './owlvit/processing_owlvit.js'; -export * from './phi3_v/processing_phi3_v.js'; -export * from './paligemma/processing_paligemma.js'; -export * from './pyannote/processing_pyannote.js'; -export * from './qwen2_vl/processing_qwen2_vl.js'; -export * from './sam/processing_sam.js'; -export * from './sam2/processing_sam2.js'; -export * from './smolvlm/processing_smolvlm.js'; -export * from './speecht5/processing_speecht5.js'; -export * from './ultravox/processing_ultravox.js'; -export * from './voxtral/processing_voxtral.js'; -export * from './wav2vec2/processing_wav2vec2.js'; -export * from './wav2vec2_with_lm/processing_wav2vec2_with_lm.js'; -export * from './whisper/processing_whisper.js'; +export * from './model-processors/chatterbox/processing_chatterbox.js'; +export * from './model-processors/florence2/processing_florence2.js'; +export * from './model-processors/gemma3n/processing_gemma3n.js'; +export * from './model-processors/grounding_dino/processing_grounding_dino.js'; +export * from './model-processors/idefics3/processing_idefics3.js'; +export * from './model-processors/janus/processing_janus.js'; +export * from './model-processors/jina_clip/processing_jina_clip.js'; +export * from './model-processors/llava/processing_llava.js'; +export * from './model-processors/mgp_str/processing_mgp_str.js'; +export * from './model-processors/moonshine/processing_moonshine.js'; +export * from './model-processors/owlvit/processing_owlvit.js'; +export * from './model-processors/phi3_v/processing_phi3_v.js'; +export * from './model-processors/paligemma/processing_paligemma.js'; +export * from './model-processors/pyannote/processing_pyannote.js'; +export * from './model-processors/qwen2_vl/processing_qwen2_vl.js'; +export * from './model-processors/sam/processing_sam.js'; +export * from './model-processors/sam2/processing_sam2.js'; +export * from './model-processors/smolvlm/processing_smolvlm.js'; +export * from './model-processors/speecht5/processing_speecht5.js'; +export * from './model-processors/ultravox/processing_ultravox.js'; +export * from './model-processors/voxtral/processing_voxtral.js'; +export * from './model-processors/wav2vec2/processing_wav2vec2.js'; +export * from './model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js'; +export * from './model-processors/whisper/processing_whisper.js'; 
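Note on the re-export change above: processors.js remains the public barrel, so only the internal layout moves under model-processors/. A minimal consumer sketch (the class name and relative path are illustrative, not taken from this diff):

// Downstream code keeps importing through the barrel, unaffected by the directory move.
import { WhisperProcessor } from './models/processors.js';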
diff --git a/src/models/pvt/image_processing_pvt.js b/src/models/pvt/image_processing_pvt.js deleted file mode 100644 index 702af349e..000000000 --- a/src/models/pvt/image_processing_pvt.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class PvtImageProcessor extends ImageProcessor {} diff --git a/src/models/registry.js b/src/models/registry.js new file mode 100644 index 000000000..c6fccfc7e --- /dev/null +++ b/src/models/registry.js @@ -0,0 +1,993 @@ +import { + MODEL_TYPES, + MODEL_TYPE_MAPPING, + MODEL_NAME_TO_CLASS_MAPPING, + MODEL_CLASS_TO_NAME_MAPPING, + PreTrainedModel, +} from './pre-trained-model.js'; + +import { + ASTModel, + ASTForAudioClassification, + AlbertModel, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + AlbertForMaskedLM, + ApertusModel, + ApertusForCausalLM, + ArceeModel, + ArceeForCausalLM, + BartModel, + BartForConditionalGeneration, + BartForSequenceClassification, + BeitModel, + BeitForImageClassification, + BertModel, + BertForMaskedLM, + BertForSequenceClassification, + BertForTokenClassification, + BertForQuestionAnswering, + BlenderbotModel, + BlenderbotForConditionalGeneration, + BlenderbotSmallModel, + BlenderbotSmallForConditionalGeneration, + BloomModel, + BloomForCausalLM, + CLIPModel, + CLIPTextModelWithProjection, + CLIPVisionModelWithProjection, + CLIPSegModel, + CLIPSegForImageSegmentation, + CamembertModel, + CamembertForMaskedLM, + CamembertForSequenceClassification, + CamembertForTokenClassification, + CamembertForQuestionAnswering, + ChatterboxModel, + ChineseCLIPModel, + ClapModel, + ClapTextModelWithProjection, + ClapAudioModelWithProjection, + CodeGenModel, + CodeGenForCausalLM, + CohereModel, + CohereForCausalLM, + ConvBertModel, + ConvBertForMaskedLM, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertForQuestionAnswering, + ConvNextModel, + ConvNextForImageClassification, + ConvNextV2Model, + ConvNextV2ForImageClassification, + DFineModel, + DFineForObjectDetection, + DINOv3ConvNextModel, + DINOv3ViTModel, + DPTModel, + DPTForDepthEstimation, + DacModel, + DacEncoderModel, + DacDecoderModel, + DebertaModel, + DebertaForMaskedLM, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaForQuestionAnswering, + DebertaV2Model, + DebertaV2ForMaskedLM, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2ForQuestionAnswering, + DecisionTransformerModel, + DeiTModel, + DeiTForImageClassification, + DepthAnythingForDepthEstimation, + DepthProForDepthEstimation, + DetrModel, + DetrForObjectDetection, + DetrForSegmentation, + Dinov2Model, + Dinov2ForImageClassification, + Dinov2WithRegistersModel, + Dinov2WithRegistersForImageClassification, + DistilBertModel, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + DistilBertForQuestionAnswering, + DistilBertForMaskedLM, + DonutSwinModel, + EfficientNetModel, + EfficientNetForImageClassification, + ElectraModel, + ElectraForMaskedLM, + ElectraForSequenceClassification, + ElectraForTokenClassification, + ElectraForQuestionAnswering, + Ernie4_5_Model, + Ernie4_5_ForCausalLM, + EsmModel, + EsmForMaskedLM, + EsmForSequenceClassification, + EsmForTokenClassification, + ExaoneModel, + ExaoneForCausalLM, + FalconModel, + FalconForCausalLM, + FastViTModel, + FastViTForImageClassification, + Florence2ForConditionalGeneration, + GLPNModel, + GLPNForDepthEstimation, + GPT2Model, + GPT2LMHeadModel, + GPTBigCodeModel, + 
GPTBigCodeForCausalLM, + GPTJModel, + GPTJForCausalLM, + GPTNeoModel, + GPTNeoForCausalLM, + GPTNeoXModel, + GPTNeoXForCausalLM, + Gemma2Model, + Gemma2ForCausalLM, + Gemma3Model, + Gemma3ForCausalLM, + Gemma3nForConditionalGeneration, + GemmaModel, + GemmaForCausalLM, + GlmModel, + GlmForCausalLM, + GptOssModel, + GptOssForCausalLM, + GraniteMoeHybridModel, + GraniteMoeHybridForCausalLM, + GraniteModel, + GraniteForCausalLM, + GroundingDinoForObjectDetection, + GroupViTModel, + HeliumModel, + HeliumForCausalLM, + HieraModel, + HieraForImageClassification, + HubertModel, + HubertForCTC, + HubertForSequenceClassification, + IJepaModel, + IJepaForImageClassification, + Idefics3ForConditionalGeneration, + SmolVLMForConditionalGeneration, + JAISModel, + JAISLMHeadModel, + JinaCLIPModel, + JinaCLIPTextModel, + JinaCLIPVisionModel, + Lfm2Model, + Lfm2ForCausalLM, + Llama4ForCausalLM, + LlamaModel, + LlamaForCausalLM, + LlavaForConditionalGeneration, + LlavaOnevisionForConditionalGeneration, + Moondream1ForConditionalGeneration, + LlavaQwen2ForCausalLM, + LongT5Model, + LongT5ForConditionalGeneration, + M2M100Model, + M2M100ForConditionalGeneration, + MBartModel, + MBartForConditionalGeneration, + MBartForSequenceClassification, + MBartForCausalLM, + MPNetModel, + MPNetForMaskedLM, + MPNetForSequenceClassification, + MPNetForTokenClassification, + MPNetForQuestionAnswering, + MT5Model, + MT5ForConditionalGeneration, + MarianModel, + MarianMTModel, + MaskFormerModel, + MaskFormerForInstanceSegmentation, + Metric3DForDepthEstimation, + Metric3Dv2ForDepthEstimation, + MgpstrForSceneTextRecognition, + MimiModel, + MimiEncoderModel, + MimiDecoderModel, + MistralModel, + MistralForCausalLM, + MobileBertModel, + MobileBertForMaskedLM, + MobileBertForSequenceClassification, + MobileBertForQuestionAnswering, + MobileLLMModel, + MobileLLMForCausalLM, + MobileNetV1Model, + MobileNetV1ForImageClassification, + MobileNetV1ForSemanticSegmentation, + MobileNetV2Model, + MobileNetV2ForImageClassification, + MobileNetV2ForSemanticSegmentation, + MobileNetV3Model, + MobileNetV3ForImageClassification, + MobileNetV3ForSemanticSegmentation, + MobileNetV4Model, + MobileNetV4ForImageClassification, + MobileNetV4ForSemanticSegmentation, + MobileViTModel, + MobileViTForImageClassification, + MobileViTV2Model, + MobileViTV2ForImageClassification, + ModernBertDecoderModel, + ModernBertDecoderForCausalLM, + ModernBertModel, + ModernBertForMaskedLM, + ModernBertForSequenceClassification, + ModernBertForTokenClassification, + MoonshineForConditionalGeneration, + MptModel, + MptForCausalLM, + MultiModalityCausalLM, + MusicgenForConditionalGeneration, + NanoChatModel, + NanoChatForCausalLM, + NeoBertModel, + NeoBertForMaskedLM, + NeoBertForSequenceClassification, + NeoBertForTokenClassification, + NeoBertForQuestionAnswering, + NomicBertModel, + OPTModel, + OPTForCausalLM, + Olmo2Model, + Olmo2ForCausalLM, + Olmo3Model, + Olmo3ForCausalLM, + OlmoModel, + OlmoForCausalLM, + OpenELMModel, + OpenELMForCausalLM, + OwlViTModel, + OwlViTForObjectDetection, + Owlv2Model, + Owlv2ForObjectDetection, + PaliGemmaForConditionalGeneration, + ParakeetForCTC, + PatchTSMixerModel, + PatchTSMixerForPrediction, + PatchTSTModel, + PatchTSTForPrediction, + Phi3Model, + Phi3ForCausalLM, + Phi3VForCausalLM, + PhiModel, + PhiForCausalLM, + PvtModel, + PvtForImageClassification, + PyAnnoteModel, + PyAnnoteForAudioFrameClassification, + Qwen2Model, + Qwen2ForCausalLM, + Qwen2VLForConditionalGeneration, + Qwen3Model, + Qwen3ForCausalLM, + RFDetrModel, + 
RFDetrForObjectDetection, + RTDetrModel, + RTDetrForObjectDetection, + RTDetrV2Model, + RTDetrV2ForObjectDetection, + ResNetModel, + ResNetForImageClassification, + RoFormerModel, + RoFormerForMaskedLM, + RoFormerForSequenceClassification, + RoFormerForTokenClassification, + RoFormerForQuestionAnswering, + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForTokenClassification, + RobertaForQuestionAnswering, + Sam2Model, + EdgeTamModel, + Sam3TrackerModel, + SamModel, + SapiensForSemanticSegmentation, + SapiensForDepthEstimation, + SapiensForNormalEstimation, + SegformerForImageClassification, + SegformerForSemanticSegmentation, + SiglipModel, + SiglipTextModel, + SiglipVisionModel, + SmolLM3Model, + SmolLM3ForCausalLM, + SnacModel, + SnacEncoderModel, + SnacDecoderModel, + SpeechT5ForSpeechToText, + SpeechT5ForTextToSpeech, + SpeechT5HifiGan, + SqueezeBertModel, + SqueezeBertForMaskedLM, + SqueezeBertForSequenceClassification, + SqueezeBertForQuestionAnswering, + StableLmModel, + StableLmForCausalLM, + Starcoder2Model, + Starcoder2ForCausalLM, + StyleTextToSpeech2Model, + SupertonicForConditionalGeneration, + Swin2SRModel, + Swin2SRForImageSuperResolution, + SwinModel, + SwinForImageClassification, + SwinForSemanticSegmentation, + T5Model, + T5ForConditionalGeneration, + TableTransformerModel, + TableTransformerForObjectDetection, + TrOCRForCausalLM, + UltravoxModel, + VoxtralForConditionalGeneration, + UniSpeechModel, + UniSpeechForCTC, + UniSpeechForSequenceClassification, + UniSpeechSatModel, + UniSpeechSatForCTC, + UniSpeechSatForSequenceClassification, + UniSpeechSatForAudioFrameClassification, + VaultGemmaModel, + VaultGemmaForCausalLM, + ViTMAEModel, + ViTMSNModel, + ViTMSNForImageClassification, + ViTModel, + ViTForImageClassification, + VisionEncoderDecoderModel, + VitMatteForImageMatting, + VitPoseForPoseEstimation, + VitsModel, + Wav2Vec2BertModel, + Wav2Vec2BertForCTC, + Wav2Vec2BertForSequenceClassification, + Wav2Vec2Model, + Wav2Vec2ForCTC, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForAudioFrameClassification, + WavLMModel, + WavLMForCTC, + WavLMForSequenceClassification, + WavLMForXVector, + WavLMForAudioFrameClassification, + WeSpeakerResNetModel, + WhisperModel, + WhisperForConditionalGeneration, + LiteWhisperForConditionalGeneration, + XLMModel, + XLMWithLMHeadModel, + XLMForSequenceClassification, + XLMForTokenClassification, + XLMForQuestionAnswering, + XLMRobertaModel, + XLMRobertaForMaskedLM, + XLMRobertaForSequenceClassification, + XLMRobertaForTokenClassification, + XLMRobertaForQuestionAnswering, + YolosModel, + YolosForObjectDetection, +} from './pre-trained-models/index.js'; + +const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ + ['bert', ['BertModel', BertModel]], + ['neobert', ['NeoBertModel', NeoBertModel]], + ['modernbert', ['ModernBertModel', ModernBertModel]], + ['nomic_bert', ['NomicBertModel', NomicBertModel]], + ['roformer', ['RoFormerModel', RoFormerModel]], + ['electra', ['ElectraModel', ElectraModel]], + ['esm', ['EsmModel', EsmModel]], + ['convbert', ['ConvBertModel', ConvBertModel]], + ['camembert', ['CamembertModel', CamembertModel]], + ['deberta', ['DebertaModel', DebertaModel]], + ['deberta-v2', ['DebertaV2Model', DebertaV2Model]], + ['mpnet', ['MPNetModel', MPNetModel]], + ['albert', ['AlbertModel', AlbertModel]], + ['distilbert', ['DistilBertModel', DistilBertModel]], + ['roberta', ['RobertaModel', RobertaModel]], + ['xlm', ['XLMModel', XLMModel]], + ['xlm-roberta', ['XLMRobertaModel', XLMRobertaModel]], + 
['clap', ['ClapModel', ClapModel]], + ['clip', ['CLIPModel', CLIPModel]], + ['clipseg', ['CLIPSegModel', CLIPSegModel]], + ['chinese_clip', ['ChineseCLIPModel', ChineseCLIPModel]], + ['siglip', ['SiglipModel', SiglipModel]], + ['jina_clip', ['JinaCLIPModel', JinaCLIPModel]], + ['mobilebert', ['MobileBertModel', MobileBertModel]], + ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]], + ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]], + ['wav2vec2-bert', ['Wav2Vec2BertModel', Wav2Vec2BertModel]], + ['unispeech', ['UniSpeechModel', UniSpeechModel]], + ['unispeech-sat', ['UniSpeechSatModel', UniSpeechSatModel]], + ['hubert', ['HubertModel', HubertModel]], + ['wavlm', ['WavLMModel', WavLMModel]], + ['audio-spectrogram-transformer', ['ASTModel', ASTModel]], + ['vits', ['VitsModel', VitsModel]], + ['pyannote', ['PyAnnoteModel', PyAnnoteModel]], + ['wespeaker-resnet', ['WeSpeakerResNetModel', WeSpeakerResNetModel]], + + ['detr', ['DetrModel', DetrModel]], + ['rt_detr', ['RTDetrModel', RTDetrModel]], + ['rt_detr_v2', ['RTDetrV2Model', RTDetrV2Model]], + ['rf_detr', ['RFDetrModel', RFDetrModel]], + ['d_fine', ['DFineModel', DFineModel]], + ['table-transformer', ['TableTransformerModel', TableTransformerModel]], + ['vit', ['ViTModel', ViTModel]], + ['ijepa', ['IJepaModel', IJepaModel]], + ['pvt', ['PvtModel', PvtModel]], + ['vit_msn', ['ViTMSNModel', ViTMSNModel]], + ['vit_mae', ['ViTMAEModel', ViTMAEModel]], + ['groupvit', ['GroupViTModel', GroupViTModel]], + ['fastvit', ['FastViTModel', FastViTModel]], + ['mobilevit', ['MobileViTModel', MobileViTModel]], + ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]], + ['owlvit', ['OwlViTModel', OwlViTModel]], + ['owlv2', ['Owlv2Model', Owlv2Model]], + ['beit', ['BeitModel', BeitModel]], + ['deit', ['DeiTModel', DeiTModel]], + ['hiera', ['HieraModel', HieraModel]], + ['convnext', ['ConvNextModel', ConvNextModel]], + ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]], + ['dinov2', ['Dinov2Model', Dinov2Model]], + ['dinov2_with_registers', ['Dinov2WithRegistersModel', Dinov2WithRegistersModel]], + ['dinov3_vit', ['DINOv3ViTModel', DINOv3ViTModel]], + ['dinov3_convnext', ['DINOv3ConvNextModel', DINOv3ConvNextModel]], + ['resnet', ['ResNetModel', ResNetModel]], + ['swin', ['SwinModel', SwinModel]], + ['swin2sr', ['Swin2SRModel', Swin2SRModel]], + ['donut-swin', ['DonutSwinModel', DonutSwinModel]], + ['yolos', ['YolosModel', YolosModel]], + ['dpt', ['DPTModel', DPTModel]], + ['glpn', ['GLPNModel', GLPNModel]], + + ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]], + ['efficientnet', ['EfficientNetModel', EfficientNetModel]], + + ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]], + ['patchtst', ['PatchTSTForPrediction', PatchTSTModel]], + ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerModel]], + + ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]], + ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]], + ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]], + ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]], + + ['maskformer', ['MaskFormerModel', MaskFormerModel]], + ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]], + + ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]], +]); + +const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([ + ['t5', ['T5Model', T5Model]], + ['longt5', ['LongT5Model', LongT5Model]], + ['mt5', ['MT5Model', MT5Model]], + ['bart', ['BartModel', BartModel]], + ['mbart', ['MBartModel', MBartModel]], + ['marian', 
['MarianModel', MarianModel]], + ['whisper', ['WhisperModel', WhisperModel]], + ['m2m_100', ['M2M100Model', M2M100Model]], + ['blenderbot', ['BlenderbotModel', BlenderbotModel]], + ['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]], +]); + +const MODEL_MAPPING_NAMES_AUTO_ENCODER = new Map([ + ['mimi', ['MimiModel', MimiModel]], + ['dac', ['DacModel', DacModel]], + ['snac', ['SnacModel', SnacModel]], +]); + +const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([ + ['bloom', ['BloomModel', BloomModel]], + ['jais', ['JAISModel', JAISModel]], + ['gpt2', ['GPT2Model', GPT2Model]], + ['gpt_oss', ['GptOssModel', GptOssModel]], + ['gptj', ['GPTJModel', GPTJModel]], + ['gpt_bigcode', ['GPTBigCodeModel', GPTBigCodeModel]], + ['gpt_neo', ['GPTNeoModel', GPTNeoModel]], + ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]], + ['codegen', ['CodeGenModel', CodeGenModel]], + ['llama', ['LlamaModel', LlamaModel]], + ['apertus', ['ApertusModel', ApertusModel]], + ['nanochat', ['NanoChatModel', NanoChatModel]], + ['arcee', ['ArceeModel', ArceeModel]], + ['lfm2', ['Lfm2Model', Lfm2Model]], + ['smollm3', ['SmolLM3Model', SmolLM3Model]], + ['exaone', ['ExaoneModel', ExaoneModel]], + ['olmo', ['OlmoModel', OlmoModel]], + ['olmo2', ['Olmo2Model', Olmo2Model]], + ['olmo3', ['Olmo3Model', Olmo3Model]], + ['mobilellm', ['MobileLLMModel', MobileLLMModel]], + ['granite', ['GraniteModel', GraniteModel]], + ['granitemoehybrid', ['GraniteMoeHybridModel', GraniteMoeHybridModel]], + ['cohere', ['CohereModel', CohereModel]], + ['gemma', ['GemmaModel', GemmaModel]], + ['gemma2', ['Gemma2Model', Gemma2Model]], + ['vaultgemma', ['VaultGemmaModel', VaultGemmaModel]], + ['gemma3_text', ['Gemma3Model', Gemma3Model]], + ['helium', ['HeliumModel', HeliumModel]], + ['glm', ['GlmModel', GlmModel]], + ['openelm', ['OpenELMModel', OpenELMModel]], + ['qwen2', ['Qwen2Model', Qwen2Model]], + ['qwen3', ['Qwen3Model', Qwen3Model]], + ['phi', ['PhiModel', PhiModel]], + ['phi3', ['Phi3Model', Phi3Model]], + ['mpt', ['MptModel', MptModel]], + ['opt', ['OPTModel', OPTModel]], + ['mistral', ['MistralModel', MistralModel]], + ['ernie4_5', ['Ernie4_5_Model', Ernie4_5_Model]], + ['starcoder2', ['Starcoder2Model', Starcoder2Model]], + ['falcon', ['FalconModel', FalconModel]], + ['stablelm', ['StableLmModel', StableLmModel]], + ['modernbert-decoder', ['ModernBertDecoderModel', ModernBertDecoderModel]], +]); + +export const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([ + ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]], + ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], + ['lite-whisper', ['LiteWhisperForConditionalGeneration', LiteWhisperForConditionalGeneration]], + ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]], +]); + +const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([ + ['speecht5', ['SpeechT5ForTextToSpeech', SpeechT5ForTextToSpeech]], +]); + +const MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = new Map([ + ['vits', ['VitsModel', VitsModel]], + ['musicgen', ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration]], + ['supertonic', ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration]], +]); + +const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['bert', ['BertForSequenceClassification', BertForSequenceClassification]], + ['neobert', ['NeoBertForSequenceClassification', NeoBertForSequenceClassification]], + ['modernbert', ['ModernBertForSequenceClassification', 
ModernBertForSequenceClassification]], + ['roformer', ['RoFormerForSequenceClassification', RoFormerForSequenceClassification]], + ['electra', ['ElectraForSequenceClassification', ElectraForSequenceClassification]], + ['esm', ['EsmForSequenceClassification', EsmForSequenceClassification]], + ['convbert', ['ConvBertForSequenceClassification', ConvBertForSequenceClassification]], + ['camembert', ['CamembertForSequenceClassification', CamembertForSequenceClassification]], + ['deberta', ['DebertaForSequenceClassification', DebertaForSequenceClassification]], + ['deberta-v2', ['DebertaV2ForSequenceClassification', DebertaV2ForSequenceClassification]], + ['mpnet', ['MPNetForSequenceClassification', MPNetForSequenceClassification]], + ['albert', ['AlbertForSequenceClassification', AlbertForSequenceClassification]], + ['distilbert', ['DistilBertForSequenceClassification', DistilBertForSequenceClassification]], + ['roberta', ['RobertaForSequenceClassification', RobertaForSequenceClassification]], + ['xlm', ['XLMForSequenceClassification', XLMForSequenceClassification]], + ['xlm-roberta', ['XLMRobertaForSequenceClassification', XLMRobertaForSequenceClassification]], + ['bart', ['BartForSequenceClassification', BartForSequenceClassification]], + ['mbart', ['MBartForSequenceClassification', MBartForSequenceClassification]], + ['mobilebert', ['MobileBertForSequenceClassification', MobileBertForSequenceClassification]], + ['squeezebert', ['SqueezeBertForSequenceClassification', SqueezeBertForSequenceClassification]], +]); + +const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['bert', ['BertForTokenClassification', BertForTokenClassification]], + ['neobert', ['NeoBertForTokenClassification', NeoBertForTokenClassification]], + ['modernbert', ['ModernBertForTokenClassification', ModernBertForTokenClassification]], + ['roformer', ['RoFormerForTokenClassification', RoFormerForTokenClassification]], + ['electra', ['ElectraForTokenClassification', ElectraForTokenClassification]], + ['esm', ['EsmForTokenClassification', EsmForTokenClassification]], + ['convbert', ['ConvBertForTokenClassification', ConvBertForTokenClassification]], + ['camembert', ['CamembertForTokenClassification', CamembertForTokenClassification]], + ['deberta', ['DebertaForTokenClassification', DebertaForTokenClassification]], + ['deberta-v2', ['DebertaV2ForTokenClassification', DebertaV2ForTokenClassification]], + ['mpnet', ['MPNetForTokenClassification', MPNetForTokenClassification]], + ['distilbert', ['DistilBertForTokenClassification', DistilBertForTokenClassification]], + ['roberta', ['RobertaForTokenClassification', RobertaForTokenClassification]], + ['xlm', ['XLMForTokenClassification', XLMForTokenClassification]], + ['xlm-roberta', ['XLMRobertaForTokenClassification', XLMRobertaForTokenClassification]], +]); + +export const MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = new Map([ + ['t5', ['T5ForConditionalGeneration', T5ForConditionalGeneration]], + ['longt5', ['LongT5ForConditionalGeneration', LongT5ForConditionalGeneration]], + ['mt5', ['MT5ForConditionalGeneration', MT5ForConditionalGeneration]], + ['bart', ['BartForConditionalGeneration', BartForConditionalGeneration]], + ['mbart', ['MBartForConditionalGeneration', MBartForConditionalGeneration]], + ['marian', ['MarianMTModel', MarianMTModel]], + ['m2m_100', ['M2M100ForConditionalGeneration', M2M100ForConditionalGeneration]], + ['blenderbot', ['BlenderbotForConditionalGeneration', BlenderbotForConditionalGeneration]], + ['blenderbot-small', 
['BlenderbotSmallForConditionalGeneration', BlenderbotSmallForConditionalGeneration]], +]); + +export const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([ + ['bloom', ['BloomForCausalLM', BloomForCausalLM]], + ['gpt2', ['GPT2LMHeadModel', GPT2LMHeadModel]], + ['gpt_oss', ['GptOssForCausalLM', GptOssForCausalLM]], + ['jais', ['JAISLMHeadModel', JAISLMHeadModel]], + ['gptj', ['GPTJForCausalLM', GPTJForCausalLM]], + ['gpt_bigcode', ['GPTBigCodeForCausalLM', GPTBigCodeForCausalLM]], + ['gpt_neo', ['GPTNeoForCausalLM', GPTNeoForCausalLM]], + ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]], + ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]], + ['llama', ['LlamaForCausalLM', LlamaForCausalLM]], + ['nanochat', ['NanoChatForCausalLM', NanoChatForCausalLM]], + ['apertus', ['ApertusForCausalLM', ApertusForCausalLM]], + ['llama4_text', ['Llama4ForCausalLM', Llama4ForCausalLM]], + ['arcee', ['ArceeForCausalLM', ArceeForCausalLM]], + ['lfm2', ['Lfm2ForCausalLM', Lfm2ForCausalLM]], + ['smollm3', ['SmolLM3ForCausalLM', SmolLM3ForCausalLM]], + ['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]], + ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]], + ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]], + ['olmo3', ['Olmo3ForCausalLM', Olmo3ForCausalLM]], + ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]], + ['granite', ['GraniteForCausalLM', GraniteForCausalLM]], + ['granitemoehybrid', ['GraniteMoeHybridForCausalLM', GraniteMoeHybridForCausalLM]], + ['cohere', ['CohereForCausalLM', CohereForCausalLM]], + ['gemma', ['GemmaForCausalLM', GemmaForCausalLM]], + ['gemma2', ['Gemma2ForCausalLM', Gemma2ForCausalLM]], + ['vaultgemma', ['VaultGemmaForCausalLM', VaultGemmaForCausalLM]], + ['gemma3_text', ['Gemma3ForCausalLM', Gemma3ForCausalLM]], + ['helium', ['HeliumForCausalLM', HeliumForCausalLM]], + ['glm', ['GlmForCausalLM', GlmForCausalLM]], + ['openelm', ['OpenELMForCausalLM', OpenELMForCausalLM]], + ['qwen2', ['Qwen2ForCausalLM', Qwen2ForCausalLM]], + ['qwen3', ['Qwen3ForCausalLM', Qwen3ForCausalLM]], + ['phi', ['PhiForCausalLM', PhiForCausalLM]], + ['phi3', ['Phi3ForCausalLM', Phi3ForCausalLM]], + ['mpt', ['MptForCausalLM', MptForCausalLM]], + ['opt', ['OPTForCausalLM', OPTForCausalLM]], + ['mbart', ['MBartForCausalLM', MBartForCausalLM]], + ['mistral', ['MistralForCausalLM', MistralForCausalLM]], + ['ernie4_5', ['Ernie4_5_ForCausalLM', Ernie4_5_ForCausalLM]], + ['starcoder2', ['Starcoder2ForCausalLM', Starcoder2ForCausalLM]], + ['falcon', ['FalconForCausalLM', FalconForCausalLM]], + ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]], + ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]], + ['modernbert-decoder', ['ModernBertDecoderForCausalLM', ModernBertDecoderForCausalLM]], + + // Also image-text-to-text + ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]], +]); + +const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([ + ['multi_modality', ['MultiModalityCausalLM', MultiModalityCausalLM]], +]); + +const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([ + ['bert', ['BertForMaskedLM', BertForMaskedLM]], + ['neobert', ['NeoBertForMaskedLM', NeoBertForMaskedLM]], + ['modernbert', ['ModernBertForMaskedLM', ModernBertForMaskedLM]], + ['roformer', ['RoFormerForMaskedLM', RoFormerForMaskedLM]], + ['electra', ['ElectraForMaskedLM', ElectraForMaskedLM]], + ['esm', ['EsmForMaskedLM', EsmForMaskedLM]], + ['convbert', ['ConvBertForMaskedLM', ConvBertForMaskedLM]], + ['camembert', ['CamembertForMaskedLM', CamembertForMaskedLM]], + ['deberta', ['DebertaForMaskedLM', DebertaForMaskedLM]], + 
['deberta-v2', ['DebertaV2ForMaskedLM', DebertaV2ForMaskedLM]], + ['mpnet', ['MPNetForMaskedLM', MPNetForMaskedLM]], + ['albert', ['AlbertForMaskedLM', AlbertForMaskedLM]], + ['distilbert', ['DistilBertForMaskedLM', DistilBertForMaskedLM]], + ['roberta', ['RobertaForMaskedLM', RobertaForMaskedLM]], + ['xlm', ['XLMWithLMHeadModel', XLMWithLMHeadModel]], + ['xlm-roberta', ['XLMRobertaForMaskedLM', XLMRobertaForMaskedLM]], + ['mobilebert', ['MobileBertForMaskedLM', MobileBertForMaskedLM]], + ['squeezebert', ['SqueezeBertForMaskedLM', SqueezeBertForMaskedLM]], +]); + +const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ + ['bert', ['BertForQuestionAnswering', BertForQuestionAnswering]], + ['neobert', ['NeoBertForQuestionAnswering', NeoBertForQuestionAnswering]], + ['roformer', ['RoFormerForQuestionAnswering', RoFormerForQuestionAnswering]], + ['electra', ['ElectraForQuestionAnswering', ElectraForQuestionAnswering]], + ['convbert', ['ConvBertForQuestionAnswering', ConvBertForQuestionAnswering]], + ['camembert', ['CamembertForQuestionAnswering', CamembertForQuestionAnswering]], + ['deberta', ['DebertaForQuestionAnswering', DebertaForQuestionAnswering]], + ['deberta-v2', ['DebertaV2ForQuestionAnswering', DebertaV2ForQuestionAnswering]], + ['mpnet', ['MPNetForQuestionAnswering', MPNetForQuestionAnswering]], + ['albert', ['AlbertForQuestionAnswering', AlbertForQuestionAnswering]], + ['distilbert', ['DistilBertForQuestionAnswering', DistilBertForQuestionAnswering]], + ['roberta', ['RobertaForQuestionAnswering', RobertaForQuestionAnswering]], + ['xlm', ['XLMForQuestionAnswering', XLMForQuestionAnswering]], + ['xlm-roberta', ['XLMRobertaForQuestionAnswering', XLMRobertaForQuestionAnswering]], + ['mobilebert', ['MobileBertForQuestionAnswering', MobileBertForQuestionAnswering]], + ['squeezebert', ['SqueezeBertForQuestionAnswering', SqueezeBertForQuestionAnswering]], +]); + +export const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([ + ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], + ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], + ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], +]); + +const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ + ['llava', ['LlavaForConditionalGeneration', LlavaForConditionalGeneration]], + ['llava_onevision', ['LlavaOnevisionForConditionalGeneration', LlavaOnevisionForConditionalGeneration]], + ['moondream1', ['Moondream1ForConditionalGeneration', Moondream1ForConditionalGeneration]], + ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]], + ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]], + ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], + ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], + ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]], + ['llava_qwen2', ['LlavaQwen2ForCausalLM', LlavaQwen2ForCausalLM]], + ['gemma3n', ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration]], +]); + +const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ + ['ultravox', ['UltravoxModel', UltravoxModel]], + ['voxtral', ['VoxtralForConditionalGeneration', VoxtralForConditionalGeneration]], +]); + +const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ + ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], +]); + +const 
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['vit', ['ViTForImageClassification', ViTForImageClassification]], + ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]], + ['pvt', ['PvtForImageClassification', PvtForImageClassification]], + ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]], + ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]], + ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]], + ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]], + ['beit', ['BeitForImageClassification', BeitForImageClassification]], + ['deit', ['DeiTForImageClassification', DeiTForImageClassification]], + ['hiera', ['HieraForImageClassification', HieraForImageClassification]], + ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]], + ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]], + ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]], + ['dinov2_with_registers', ['Dinov2WithRegistersForImageClassification', Dinov2WithRegistersForImageClassification]], + ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]], + ['swin', ['SwinForImageClassification', SwinForImageClassification]], + ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]], + ['efficientnet', ['EfficientNetForImageClassification', EfficientNetForImageClassification]], + ['mobilenet_v1', ['MobileNetV1ForImageClassification', MobileNetV1ForImageClassification]], + ['mobilenet_v2', ['MobileNetV2ForImageClassification', MobileNetV2ForImageClassification]], + ['mobilenet_v3', ['MobileNetV3ForImageClassification', MobileNetV3ForImageClassification]], + ['mobilenet_v4', ['MobileNetV4ForImageClassification', MobileNetV4ForImageClassification]], +]); + +const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([ + ['detr', ['DetrForObjectDetection', DetrForObjectDetection]], + ['rt_detr', ['RTDetrForObjectDetection', RTDetrForObjectDetection]], + ['rt_detr_v2', ['RTDetrV2ForObjectDetection', RTDetrV2ForObjectDetection]], + ['rf_detr', ['RFDetrForObjectDetection', RFDetrForObjectDetection]], + ['d_fine', ['DFineForObjectDetection', DFineForObjectDetection]], + ['table-transformer', ['TableTransformerForObjectDetection', TableTransformerForObjectDetection]], + ['yolos', ['YolosForObjectDetection', YolosForObjectDetection]], +]); + +const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([ + ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]], + ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]], + ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]], +]); + +const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([ + // TODO: Do not add new models here + ['detr', ['DetrForSegmentation', DetrForSegmentation]], + ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]], +]); + +const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([ + ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]], + ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]], + + ['swin', ['SwinForSemanticSegmentation', SwinForSemanticSegmentation]], + ['mobilenet_v1', ['MobileNetV1ForSemanticSegmentation', MobileNetV1ForSemanticSegmentation]], + ['mobilenet_v2', ['MobileNetV2ForSemanticSegmentation', 
MobileNetV2ForSemanticSegmentation]], + ['mobilenet_v3', ['MobileNetV3ForSemanticSegmentation', MobileNetV3ForSemanticSegmentation]], + ['mobilenet_v4', ['MobileNetV4ForSemanticSegmentation', MobileNetV4ForSemanticSegmentation]], +]); + +const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([ + ['detr', ['DetrForSegmentation', DetrForSegmentation]], + ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]], +]); + +const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([ + ['sam', ['SamModel', SamModel]], + ['sam2', ['Sam2Model', Sam2Model]], + ['edgetam', ['EdgeTamModel', EdgeTamModel]], + ['sam3_tracker', ['Sam3TrackerModel', Sam3TrackerModel]], +]); + +const MODEL_FOR_CTC_MAPPING_NAMES = new Map([ + ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]], + ['wav2vec2-bert', ['Wav2Vec2BertForCTC', Wav2Vec2BertForCTC]], + ['unispeech', ['UniSpeechForCTC', UniSpeechForCTC]], + ['unispeech-sat', ['UniSpeechSatForCTC', UniSpeechSatForCTC]], + ['wavlm', ['WavLMForCTC', WavLMForCTC]], + ['hubert', ['HubertForCTC', HubertForCTC]], + ['parakeet_ctc', ['ParakeetForCTC', ParakeetForCTC]], +]); + +const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['wav2vec2', ['Wav2Vec2ForSequenceClassification', Wav2Vec2ForSequenceClassification]], + ['wav2vec2-bert', ['Wav2Vec2BertForSequenceClassification', Wav2Vec2BertForSequenceClassification]], + ['unispeech', ['UniSpeechForSequenceClassification', UniSpeechForSequenceClassification]], + ['unispeech-sat', ['UniSpeechSatForSequenceClassification', UniSpeechSatForSequenceClassification]], + ['wavlm', ['WavLMForSequenceClassification', WavLMForSequenceClassification]], + ['hubert', ['HubertForSequenceClassification', HubertForSequenceClassification]], + ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]], +]); + +const MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = new Map([['wavlm', ['WavLMForXVector', WavLMForXVector]]]); + +const MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['unispeech-sat', ['UniSpeechSatForAudioFrameClassification', UniSpeechSatForAudioFrameClassification]], + ['wavlm', ['WavLMForAudioFrameClassification', WavLMForAudioFrameClassification]], + ['wav2vec2', ['Wav2Vec2ForAudioFrameClassification', Wav2Vec2ForAudioFrameClassification]], + ['pyannote', ['PyAnnoteForAudioFrameClassification', PyAnnoteForAudioFrameClassification]], +]); + +const MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([ + ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]], +]); + +const MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = new Map([ + ['patchtst', ['PatchTSTForPrediction', PatchTSTForPrediction]], + ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerForPrediction]], +]); + +const MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = new Map([ + ['swin2sr', ['Swin2SRForImageSuperResolution', Swin2SRForImageSuperResolution]], +]); + +const MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = new Map([ + ['dpt', ['DPTForDepthEstimation', DPTForDepthEstimation]], + ['depth_anything', ['DepthAnythingForDepthEstimation', DepthAnythingForDepthEstimation]], + ['glpn', ['GLPNForDepthEstimation', GLPNForDepthEstimation]], + ['sapiens', ['SapiensForDepthEstimation', SapiensForDepthEstimation]], + ['depth_pro', ['DepthProForDepthEstimation', DepthProForDepthEstimation]], + ['metric3d', ['Metric3DForDepthEstimation', Metric3DForDepthEstimation]], + ['metric3dv2', ['Metric3Dv2ForDepthEstimation', Metric3Dv2ForDepthEstimation]], +]); + +const 
MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES = new Map([ + ['sapiens', ['SapiensForNormalEstimation', SapiensForNormalEstimation]], +]); + +const MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES = new Map([ + ['vitpose', ['VitPoseForPoseEstimation', VitPoseForPoseEstimation]], +]); + +// NOTE: This is custom to Transformers.js, and is necessary because certain models +// (e.g., CLIP) are split into vision and text components +const MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = new Map([ + ['clip', ['CLIPVisionModelWithProjection', CLIPVisionModelWithProjection]], + ['siglip', ['SiglipVisionModel', SiglipVisionModel]], + ['jina_clip', ['JinaCLIPVisionModel', JinaCLIPVisionModel]], +]); + +const MODEL_CLASS_TYPE_MAPPING = [ + // MODEL_MAPPING_NAMES: + [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly], + [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder], + [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly], + [MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_TYPES.AutoEncoder], + + [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.DecoderOnly], + [MODEL_FOR_MULTIMODALITY_MAPPING_NAMES, MODEL_TYPES.MultiModality], + [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq], + [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText], + [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.AudioTextToText], + [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES.MaskGeneration], + [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + + // Custom: + [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], +]; + +for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) { + // @ts-ignore + for (const [name, model] of mappings.values()) { + MODEL_TYPE_MAPPING.set(name, type); + MODEL_CLASS_TO_NAME_MAPPING.set(model, name); + MODEL_NAME_TO_CLASS_MAPPING.set(name, model); + } +} + +const 
CUSTOM_MAPPING = [ + // OVERRIDE: + // TODO: Refactor to allow class to specify model + ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen], + ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V], + + ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly], + ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly], + ['JinaCLIPTextModel', JinaCLIPTextModel, MODEL_TYPES.EncoderOnly], + ['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly], + ['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly], + + ['DacEncoderModel', DacEncoderModel, MODEL_TYPES.EncoderOnly], + ['DacDecoderModel', DacDecoderModel, MODEL_TYPES.EncoderOnly], + ['MimiEncoderModel', MimiEncoderModel, MODEL_TYPES.EncoderOnly], + ['MimiDecoderModel', MimiDecoderModel, MODEL_TYPES.EncoderOnly], + ['SnacEncoderModel', SnacEncoderModel, MODEL_TYPES.EncoderOnly], + ['SnacDecoderModel', SnacDecoderModel, MODEL_TYPES.EncoderOnly], + + ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration, MODEL_TYPES.ImageAudioTextToText], + ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic], + ['ChatterboxModel', ChatterboxModel, MODEL_TYPES.Chatterbox], +]; +for (const [name, model, type] of CUSTOM_MAPPING) { + MODEL_TYPE_MAPPING.set(name, type); + MODEL_CLASS_TO_NAME_MAPPING.set(model, name); + MODEL_NAME_TO_CLASS_MAPPING.set(name, model); +} + +const CUSTOM_ARCHITECTURES = new Map([ + ['modnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], + ['birefnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], + ['isnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], + ['ben', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], +]); +for (const [name, mapping] of CUSTOM_ARCHITECTURES.entries()) { + mapping.set(name, ['PreTrainedModel', PreTrainedModel]); + MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly); + MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, name); + MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel); +} + +export { + CUSTOM_ARCHITECTURES, + MODEL_CLASS_TYPE_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, + MODEL_FOR_MASKED_LM_MAPPING_NAMES, + MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, + MODEL_FOR_CTC_MAPPING_NAMES, + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, + MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, + MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, + MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, +}; + +export * from './pre-trained-models/index.js'; diff --git a/src/models/session.js b/src/models/session.js new file mode 100644 index 000000000..ecf942946 --- /dev/null +++ b/src/models/session.js 
@@ -0,0 +1,244 @@ +import { + createInferenceSession, + deviceToExecutionProviders, + isONNXProxy, + runInferenceSession, +} from '../backends/onnx.js'; +import { getCacheShapes } from '../configs.js'; +import { + DATA_TYPES, + DEFAULT_DEVICE_DTYPE_MAPPING, + DEFAULT_DTYPE_SUFFIX_MAPPING, + isWebGpuFp16Supported, +} from '../utils/dtypes.js'; +import { apis } from '../env.js'; +import { replaceTensors } from '../utils/tensor.js'; +import { validateInputs } from './utils.js'; +import { getCoreModelFile, getModelDataFiles } from '../utils/model-loader.js'; + +/** + * Constructs an InferenceSession using a model file located at the specified path. + * @param {string} pretrained_model_name_or_path The path to the directory containing the model file. + * @param {string} fileName The name of the model file. + * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. + * @param {boolean} [is_decoder=false] Whether the model is a decoder model. + * @returns {Promise<{buffer_or_path: Uint8Array|string, session_options: Object, session_config: Object}>} A Promise that resolves to the data needed to create an InferenceSession object. + * @private + */ +async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) { + let custom_config = options.config?.['transformers.js_config'] ?? {}; + + let device = options.device ?? custom_config.device; + if (device && typeof device !== 'string') { + if (device.hasOwnProperty(fileName)) { + device = device[fileName]; + } else { + console.warn(`device not specified for "${fileName}". Using the default device.`); + device = null; + } + } + + // If the device is not specified, we use the default (supported) execution providers. + const selectedDevice = /** @type {import("../utils/devices.js").DeviceType} */ ( + device ?? (apis.IS_NODE_ENV ? 'cpu' : 'wasm') + ); + + const executionProviders = deviceToExecutionProviders(selectedDevice); + + // Update custom config with the selected device's config, if it exists + const device_config = custom_config.device_config ?? {}; + if (device_config.hasOwnProperty(selectedDevice)) { + custom_config = { + ...custom_config, + ...device_config[selectedDevice], + }; + } + + // If options.dtype is specified, we use it to choose the suffix for the model file. + // Otherwise, we use the default dtype for the device. + let dtype = options.dtype ?? custom_config.dtype; + if (typeof dtype !== 'string') { + if (dtype && dtype.hasOwnProperty(fileName)) { + dtype = dtype[fileName]; + } else { + dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32; + console.warn( + `dtype not specified for "${fileName}". Using the default dtype (${dtype}) for this device (${selectedDevice}).`, + ); + } + } + + if (dtype === DATA_TYPES.auto) { + // Try to choose the auto dtype based on the custom config + let config_dtype = custom_config.dtype; + if (typeof config_dtype !== 'string') { + config_dtype = config_dtype?.[fileName]; + } + + if (config_dtype && config_dtype !== DATA_TYPES.auto && DATA_TYPES.hasOwnProperty(config_dtype)) { + // Defined by the config, and is not "auto" + dtype = config_dtype; + } else { + // Choose default dtype based on device, falling back to fp32 + dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? 
DATA_TYPES.fp32; + } + } + + const selectedDtype = /** @type {import("../utils/dtypes.js").DataType} */ (dtype); + + if (!DEFAULT_DTYPE_SUFFIX_MAPPING.hasOwnProperty(selectedDtype)) { + throw new Error(`Invalid dtype: ${selectedDtype}. Should be one of: ${Object.keys(DATA_TYPES).join(', ')}`); + } else if ( + selectedDevice === 'webgpu' && + // NOTE: Currently, we assume that the Native WebGPU EP always supports fp16. In future, we will add a check for this. + !apis.IS_NODE_ENV && + selectedDtype === DATA_TYPES.fp16 && + !(await isWebGpuFp16Supported()) + ) { + throw new Error(`The device (${selectedDevice}) does not support fp16.`); + } + + // Only valid for models with a decoder + const kv_cache_dtype_config = custom_config.kv_cache_dtype; + const kv_cache_dtype = kv_cache_dtype_config + ? typeof kv_cache_dtype_config === 'string' + ? kv_cache_dtype_config + : (kv_cache_dtype_config[selectedDtype] ?? 'float32') + : undefined; + + if (kv_cache_dtype && !['float32', 'float16'].includes(kv_cache_dtype)) { + throw new Error(`Invalid kv_cache_dtype: ${kv_cache_dtype}. Should be one of: float32, float16`); + } + + const session_config = { + dtype: selectedDtype, + kv_cache_dtype, + device: selectedDevice, + }; + + // Construct the model file suffix + const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[selectedDtype]; + + const session_options = { ...options.session_options }; + + // Overwrite `executionProviders` if not specified + session_options.executionProviders ??= executionProviders; + + // Overwrite `freeDimensionOverrides` if specified in config and not set in session options + const free_dimension_overrides = custom_config.free_dimension_overrides; + if (free_dimension_overrides) { + session_options.freeDimensionOverrides ??= free_dimension_overrides; + } else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) { + console.warn( + `WebNN does not currently support dynamic shapes and requires 'free_dimension_overrides' to be set in config.json, preferably as a field within config["transformers.js_config"]["device_config"]["${selectedDevice}"]. ` + + `When 'free_dimension_overrides' is not set, you may experience significant performance degradation.`, + ); + } + + const bufferOrPathPromise = getCoreModelFile(pretrained_model_name_or_path, fileName, options, suffix); + + // Handle onnx external data files + const use_external_data_format = options.use_external_data_format ?? custom_config.use_external_data_format; + const externalData = await getModelDataFiles( + pretrained_model_name_or_path, + fileName, + suffix, + options, + use_external_data_format, + session_options, + ); + + if (externalData.length > 0 && !apis.IS_NODE_ENV) { + session_options.externalData = externalData; + } + + if (is_decoder && selectedDevice === 'webgpu' && kv_cache_dtype_config !== false) { + const shapes = getCacheShapes(options.config, { + prefix: 'present', + }); + if (Object.keys(shapes).length > 0 && !isONNXProxy()) { + // Only set preferredOutputLocation if shapes are present and we aren't proxying ONNX + /** @type {Record} */ + const preferredOutputLocation = {}; + for (const key in shapes) { + preferredOutputLocation[key] = 'gpu-buffer'; + } + session_options.preferredOutputLocation = preferredOutputLocation; + } + } + + const buffer_or_path = await bufferOrPathPromise; + + return { buffer_or_path, session_options, session_config }; +} + +/** + * Helper function to create multiple InferenceSession objects. 
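+ * Sessions for all listed model files are created concurrently; the entry whose name matches `decoder_name` is treated as the decoder and may receive additional KV-cache session options (e.g. GPU-resident outputs on WebGPU).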
+ *
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the model file.
+ * @param {Record<string, string>} names The names of the model files to load.
+ * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the model.
+ * @param {string} [decoder_name] The name of the decoder model, if any.
+ * @returns {Promise<Record<string, any>>} A Promise that resolves to a dictionary of InferenceSession objects.
+ * @private
+ */
+export async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = undefined) {
+    return Object.fromEntries(
+        await Promise.all(
+            Object.keys(names).map(async (name) => {
+                const { buffer_or_path, session_options, session_config } = await getSession(
+                    pretrained_model_name_or_path,
+                    names[name],
+                    options,
+                    name === decoder_name,
+                );
+                const session = await createInferenceSession(buffer_or_path, session_options, session_config);
+                return [name, session];
+            }),
+        ),
+    );
+}
+
+/**
+ * Executes an InferenceSession using the specified inputs.
+ * NOTE: `inputs` must contain at least the input names of the model.
+ * - If additional inputs are passed, they will be ignored.
+ * - If inputs are missing, an error will be thrown.
+ *
+ * @param {Object} session The InferenceSession object to run.
+ * @param {Object} inputs An object that maps input names to input tensors.
+ * @returns {Promise<Object>} A Promise that resolves to an object that maps output names to output tensors.
+ * @private
+ */
+export async function sessionRun(session, inputs) {
+    const checkedInputs = validateInputs(session, inputs);
+    try {
+        // pass the original ort tensor
+        const ortFeed = Object.fromEntries(Object.entries(checkedInputs).map(([k, v]) => [k, v.ort_tensor]));
+        const output = await runInferenceSession(session, ortFeed);
+        return replaceTensors(output);
+    } catch (e) {
+        // Error messages can be long (nested) and uninformative. For this reason,
+        // we apply minor formatting to show the most important information
+        const formatted = Object.fromEntries(
+            Object.entries(checkedInputs).map(([k, tensor]) => {
+                // Extract these properties from the underlying ORT tensor
+                const unpacked = {
+                    type: tensor.type,
+                    dims: tensor.dims,
+                    location: tensor.location,
+                };
+                if (unpacked.location !== 'gpu-buffer') {
+                    // Only return the data if it's not a GPU buffer
+                    unpacked.data = tensor.data;
+                }
+                return [k, unpacked];
+            }),
+        );
+
+        // This usually occurs when the inputs are of the wrong type.
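+        // (for example, a float32 tensor passed where the model expects int64 token ids).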
+ console.error(`An error occurred during model execution: "${e}".`); + console.error('Inputs given to model:', formatted); + throw e; + } +} diff --git a/src/models/siglip/image_processing_siglip.js b/src/models/siglip/image_processing_siglip.js deleted file mode 100644 index 4651b9c73..000000000 --- a/src/models/siglip/image_processing_siglip.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class SiglipImageProcessor extends ImageProcessor {} diff --git a/src/models/speecht5/feature_extraction_speecht5.js b/src/models/speecht5/feature_extraction_speecht5.js deleted file mode 100644 index 881f9b9ad..000000000 --- a/src/models/speecht5/feature_extraction_speecht5.js +++ /dev/null @@ -1,3 +0,0 @@ -import { FeatureExtractor } from '../../base/feature_extraction_utils.js'; - -export class SpeechT5FeatureExtractor extends FeatureExtractor {} diff --git a/src/models/utils.js b/src/models/utils.js new file mode 100644 index 000000000..6912136d2 --- /dev/null +++ b/src/models/utils.js @@ -0,0 +1,573 @@ +// JS doesn't support mixins, so we define some reused functions here, and allow "this" to be passed in +import { pick } from '../utils/core.js'; +import { cat, full_like, ones, Tensor, toI64Tensor, zeros_like, boolTensor, full } from '../utils/tensor.js'; +import { max } from '../utils/maths.js'; +import { sessionRun } from './session.js'; +import { getModelJSON } from '../utils/hub.js'; +import { isONNXProxy } from '../backends/onnx.js'; +import { Seq2SeqLMOutput } from './output.js'; + +/** + * Perform forward pass on the seq2seq model (both encoder and decoder). + * @param {Object} self The seq2seq model object. + * @param {Object} model_inputs The input object for the model containing encoder and decoder inputs. + * @returns {Promise} Promise that resolves with the output of the seq2seq model. + * @private + */ +export async function seq2seqForward(self, model_inputs) { + let { encoder_outputs, input_ids, decoder_input_ids, ...other_decoder_inputs } = model_inputs; + // Encode if needed + if (!encoder_outputs) { + const encoder_inputs = pick(model_inputs, self.sessions['model'].inputNames); + // Encoder outputs are not given, so we must compute them. + encoder_outputs = (await encoderForward(self, encoder_inputs)).last_hidden_state; + } + + other_decoder_inputs.input_ids = decoder_input_ids; + other_decoder_inputs.encoder_hidden_states = encoder_outputs; + + if (self.sessions['decoder_model_merged'].inputNames.includes('encoder_attention_mask')) { + other_decoder_inputs.encoder_attention_mask = model_inputs.attention_mask; + } + + return await decoderForward(self, other_decoder_inputs, true); +} + +/** + * Forward pass of an encoder model. + * @param {Object} self The encoder model. + * @param {Object} model_inputs The input data to be used for the forward pass. + * @returns {Promise} The model's outputs. 
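+ * If the session also expects `inputs_embeds`, `token_type_ids`, or `pixel_mask` and they were not provided, defaults are derived from `input_ids`/`pixel_values` before the session is run.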
+ * @private + */ +export async function encoderForward(self, model_inputs) { + const session = self.sessions['model']; + const encoderFeeds = pick(model_inputs, session.inputNames); + + if (session.inputNames.includes('inputs_embeds') && !encoderFeeds.inputs_embeds) { + if (!model_inputs.input_ids) { + throw new Error('Both `input_ids` and `inputs_embeds` are missing in the model inputs.'); + } + encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids }); + } + if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) { + if (!encoderFeeds.input_ids) { + throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.'); + } + // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it, + // but they weren't created by the tokenizer. + encoderFeeds.token_type_ids = zeros_like(encoderFeeds.input_ids); + } + if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) { + if (!encoderFeeds.pixel_values) { + throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.'); + } + // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it, + // but they weren't created by the processor. + const dims = encoderFeeds.pixel_values.dims; + encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]); + } + + return await sessionRun(session, encoderFeeds); +} + +export async function autoEncoderForward(self, model_inputs) { + const encoded = await self.encode(model_inputs); + const decoded = await self.decode(encoded); + return decoded; +} + +/** + * Forward pass of a decoder model. + * @param {Object} self The decoder model. + * @param {Object} model_inputs The input data to be used for the forward pass. + * @returns {Promise} The logits and past key values. + * @private + */ +export async function decoderForward(self, model_inputs, is_encoder_decoder = false) { + const session = self.sessions[is_encoder_decoder ? 'decoder_model_merged' : 'model']; + + const { past_key_values, ...new_model_inputs } = model_inputs; + + if (session.inputNames.includes('use_cache_branch')) { + new_model_inputs.use_cache_branch = boolTensor(!!past_key_values); + } + if ( + session.inputNames.includes('position_ids') && + new_model_inputs.attention_mask && + !new_model_inputs.position_ids + ) { + // NOTE: Handle a special case for paligemma/gemma3 models, where positions are 1-indexed + const start_index = ['paligemma', 'gemma3_text', 'gemma3'].includes(self.config.model_type) ? 1 : 0; + new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index); + } + + // Unpack the `past_key_values` object into model inputs + self.addPastKeyValues(new_model_inputs, past_key_values); + + // Select only the inputs that are needed for the current session + const fixed = pick(new_model_inputs, session.inputNames); + return await sessionRun(session, fixed); +} + +/** + * Abstract forward pass function for image-text-to-text or audio-text-to-text models. + * @param {Object} self The model object. + * @param {Object} params Additional parameters. + * @param {Function} [params.encode_function] The function to encode the modality values. + * @param {Function} [params.merge_function] The function to merge the modality features with the input embeddings. + * @param {string} [params.modality_input_name] The modality input name. + * @param {string} [params.modality_output_name] The modality output name. 
+ * @param {Tensor} [params.input_ids=null] + * @param {Tensor} [params.attention_mask=null] + * @param {Tensor} [params.position_ids=null] + * @param {Tensor} [params.inputs_embeds=null] + * @param {Tensor} [params.past_key_values=null] + * @param {Object} [params.generation_config=null] + * @param {Object} [params.logits_processor=null] + * @returns {Promise} The model's output tensor + * @private + */ +export async function genericTextToTextForward( + self, + { + // Generic parameters: + encode_function, + merge_function, + modality_input_name, + modality_output_name, + + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // Additional parameters + ...kwargs + }, +) { + const modality_values = kwargs[modality_input_name]; + if (!inputs_embeds) { + // 1. Extract the text embeddings. + inputs_embeds = await self.encode_text({ input_ids, ...kwargs }); + + // 2. Possibly, merge text and modality values + if (modality_values && input_ids.dims[1] !== 1) { + const modality_features = await encode_function({ + // Pass the modality values under its expected key. + // The caller knows whether this is audio or image. + [modality_input_name]: modality_values, + ...kwargs, + }); + ({ inputs_embeds, attention_mask } = merge_function({ + [modality_output_name]: modality_features, + inputs_embeds, + input_ids, + attention_mask, + })); + } else if (past_key_values && modality_values && input_ids.dims[1] === 1) { + // This branch handles the cache case. + const target_length = input_ids.dims[1]; // always 1 + const past_length = Object.values(past_key_values)[0].dims.at(-2); + + attention_mask = cat( + [ + ones([input_ids.dims[0], past_length]), + attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]]), + ], + 1, + ); + } + } + + if (!position_ids) { + if (self.config.model_type === 'qwen2_vl') { + // Special case for qwen2_vl models + // @ts-ignore + const { image_grid_thw, video_grid_thw } = kwargs; + [position_ids] = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask); + } + } + + // 3. Call the decoder forward using the updated inputs. + const outputs = await decoderForward( + self, + { + inputs_embeds, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, + true, + ); + return outputs; +} + +/** + * Forward pass of an audio-text-to-text model. + * @param {Object} self The audio-text-to-text model. + * @param {Object} params The inputs for the audio-text-to-text forward pass. + * @returns {Promise} The model's output tensor. + * @private + */ +export async function audioTextToTextForward(self, params) { + return await genericTextToTextForward(self, { + ...params, + modality_input_name: 'audio_values', + modality_output_name: 'audio_features', + encode_function: self.encode_audio.bind(self), + merge_function: self._merge_input_ids_with_audio_features.bind(self), + }); +} + +/** + * Forward pass of an image-text-to-text model. + * @param {Object} self The image-text-to-text model. + * @param {Object} params The inputs for the image-text-to-text forward pass. + * @returns {Promise} The model's output tensor. 
+ * @private + */ +export async function imageTextToTextForward(self, params) { + return await genericTextToTextForward(self, { + ...params, + modality_input_name: 'pixel_values', + modality_output_name: 'image_features', + encode_function: self.encode_image.bind(self), + merge_function: self._merge_input_ids_with_image_features.bind(self), + }); +} + +/** + * Helper function to perform the following: + * ```python + * x = attention_mask.long().cumsum(-1) - 1 + * x.masked_fill_(attention_mask == 0, 1) + * ``` + * @param {Tensor} attention_mask + * @returns {{data: BigInt64Array, dims: number[]}} + */ +export function cumsum_masked_fill(attention_mask, start_index = 0) { + const [bz, seq_len] = attention_mask.dims; + const attn_mask_data = attention_mask.data; + + const data = new BigInt64Array(attn_mask_data.length); + for (let i = 0; i < bz; ++i) { + const start = i * seq_len; + let sum = BigInt(start_index); + for (let j = 0; j < seq_len; ++j) { + const index = start + j; + if (attn_mask_data[index] === 0n) { + data[index] = BigInt(1); + } else { + // === 1n + data[index] = sum; + sum += attn_mask_data[index]; + } + } + } + return { data, dims: attention_mask.dims }; +} + +/** + * If the model supports providing position_ids, we create position_ids on the fly for batch generation, + * by computing the cumulative sum of the attention mask along the sequence length dimension. + * + * Equivalent to: + * ```python + * position_ids = attention_mask.long().cumsum(-1) - 1 + * position_ids.masked_fill_(attention_mask == 0, 1) + * if past_key_values: + * position_ids = position_ids[:, -input_ids.shape[1] :] + * ``` + */ +export function createPositionIds(model_inputs, past_key_values = null, start_index = 0) { + const { input_ids, inputs_embeds, attention_mask } = model_inputs; + + const { data, dims } = cumsum_masked_fill(attention_mask, start_index); + let position_ids = new Tensor('int64', data, dims); + if (past_key_values) { + const offset = -(input_ids ?? inputs_embeds).dims.at(1); + position_ids = position_ids.slice(null, [offset, null]); + } + return position_ids; +} + +export function decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + const past_length = model_inputs.past_key_values ? Object.values(model_inputs.past_key_values)[0].dims.at(-2) : 0; + + if (!model_inputs.attention_mask) { + // If the attention mask is not provided, we attempt to infer based on provided inputs + let dims; + for (const key of ['input_ids', 'inputs_embeds', 'position_ids']) { + if (model_inputs[key]) { + dims = model_inputs[key].dims; + break; + } + } + if (!dims) { + throw new Error('attention_mask is not provided, and unable to infer its shape from model inputs.'); + } + model_inputs.attention_mask = ones([dims[0], past_length + dims[1]]); + } + + if (model_inputs.past_key_values) { + const { input_ids, attention_mask } = model_inputs; + + // Keep only the unprocessed tokens: + // 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + // some of the inputs are exclusively passed as part of the cache (e.g. 
when passing input_embeds as + // input) + if (attention_mask && attention_mask.dims[1] > input_ids.dims[1]) { + // NOTE: not needed since we only pass the generated tokens to the next forward pass + // const offset = -(attention_mask.dims[1] - past_length); + // model_inputs.input_ids = input_ids.slice(null, [offset, null]); + } + // 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. + // We can discard input_ids based on the past_length. + else if (past_length < input_ids.dims[1]) { + // NOTE: Required for phi models. + // See https://github.com/huggingface/transformers/issues/30809#issuecomment-2111918479 for more information. + model_inputs.input_ids = input_ids.slice(null, [past_length, null]); + } + // 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + else { + } + } + + return model_inputs; +} + +export function encoder_decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + if (model_inputs.past_key_values) { + input_ids = input_ids.map((x) => [x.at(-1)]); + } + + return { + ...model_inputs, + decoder_input_ids: toI64Tensor(input_ids), + }; +} + +export function multimodal_text_to_text_prepare_inputs_for_generation(self, ...args) { + if (self.config.is_encoder_decoder) { + return encoder_decoder_prepare_inputs_for_generation(self, ...args); + } else { + return decoder_prepare_inputs_for_generation(self, ...args); + } +} + +export function multimodality_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + const has_past_key_values = !!model_inputs.past_key_values; + + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + if (has_past_key_values) { + model_inputs.input_ids = cat([model_inputs.input_ids, model_inputs.input_ids], 0); + // NOTE: attention_mask handled in generation + } else { + model_inputs.input_ids = cat( + [model_inputs.input_ids, full_like(model_inputs.input_ids, BigInt(generation_config.pad_token_id))], + 0, + ); + model_inputs.attention_mask = cat( + [model_inputs.attention_mask, full_like(model_inputs.attention_mask, 0n)], + 0, + ); + } + } + + if (has_past_key_values || !model_inputs.pixel_values) { + model_inputs.pixel_values = full([0, 0, 3, 384, 384], 1.0); + } + + if (has_past_key_values) { + const num_img_tokens = 0; + const num_text_tokens = 1; + const has_image = num_img_tokens > 0 ? 
1 : 0; + + const batch_size = 1; + model_inputs.images_seq_mask = new Tensor( + 'bool', + new Array(num_img_tokens + num_text_tokens).fill(true).fill(false, 0, num_text_tokens), + [batch_size, num_img_tokens + num_text_tokens], + ); + model_inputs.images_emb_mask = new Tensor('bool', new Array(num_img_tokens).fill(!!has_image), [ + batch_size, + 1, + num_img_tokens, + ]); + } + return model_inputs; +} + +export function chatterbox_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + if (!model_inputs.position_ids && self.sessions['embed_tokens'].inputNames.includes('position_ids')) { + // If position_ids are not provided, we create them on the fly using the position of the START_SPEECH_TOKEN + const START_SPEECH_TOKEN = 6561; + if (model_inputs.input_ids.dims[1] === 1) { + const position_ids = Array.from( + { + length: input_ids.length, + }, + (_, i) => input_ids[i].length - input_ids[i].findLastIndex((x) => x == START_SPEECH_TOKEN) - 1, + ); + model_inputs.position_ids = new Tensor('int64', position_ids, [input_ids.length, 1]); + } else { + const batched_input_ids = model_inputs.input_ids.tolist(); + const position_ids_list = batched_input_ids.map((ids) => { + let position = 0; + return ids.map((id) => (id >= START_SPEECH_TOKEN ? 0 : position++)); + }); + model_inputs.position_ids = new Tensor('int64', position_ids_list.flat(), model_inputs.input_ids.dims); + } + } + if (model_inputs.input_ids.dims[1] === 1) { + // We are in generation mode and no longer need the audio inputs + delete model_inputs.audio_values; + delete model_inputs.audio_features; + delete model_inputs.audio_tokens; + delete model_inputs.speaker_embeddings; + delete model_inputs.speaker_features; + } + return decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config); +} + +/** + * Validate model inputs + * @param {Object} session The InferenceSession object that will be run. + * @param {Object} inputs The inputs to check. + * @returns {Record} The checked inputs. + * @throws {Error} If any inputs are missing. + * @private + */ +export function validateInputs(session, inputs) { + /** + * NOTE: Create either a shallow or deep copy based on `onnx.wasm.proxy` + * @type {Record} + */ + const checkedInputs = Object.create(null); + const missingInputs = []; + for (const inputName of session.inputNames) { + const tensor = inputs[inputName]; + // Rare case where one of the model's input names corresponds to a built-in + // object name (e.g., toString), which would cause a simple (!tensor) check to fail, + // because it's not undefined but a function. + if (!(tensor instanceof Tensor)) { + missingInputs.push(inputName); + continue; + } + // NOTE: When `env.wasm.proxy is true` the tensor is moved across the Worker + // boundary, transferring ownership to the worker and invalidating the tensor. + // So, in this case, we simply sacrifice a clone for it. + checkedInputs[inputName] = isONNXProxy() ? tensor.clone() : tensor; + } + if (missingInputs.length > 0) { + throw new Error( + `An error occurred during model execution: "Missing the following inputs: ${missingInputs.join(', ')}.`, + ); + } + + const numInputsProvided = Object.keys(inputs).length; + const numInputsNeeded = session.inputNames.length; + if (numInputsProvided > numInputsNeeded) { + // No missing inputs, but too many inputs were provided. + // Warn the user and ignore the extra inputs. 
+ let ignored = Object.keys(inputs).filter((inputName) => !session.inputNames.includes(inputName));
+ console.warn(
+ `WARNING: Too many inputs were provided (${numInputsProvided} > ${numInputsNeeded}). The following inputs will be ignored: "${ignored.join(', ')}".`,
+ );
+ }
+
+ return checkedInputs;
+}
+
+export function default_merge_input_ids_with_features({
+ modality_token_id,
+ inputs_embeds,
+ modality_features,
+ input_ids,
+ attention_mask,
+}) {
+ const token_positions = input_ids.tolist().map((ids) =>
+ ids.reduce((acc, x, idx) => {
+ if (x == modality_token_id) acc.push(idx);
+ return acc;
+ }, []),
+ );
+ const n_tokens = token_positions.reduce((acc, x) => acc + x.length, 0);
+ const n_features = modality_features.dims[0];
+ if (n_tokens !== n_features) {
+ throw new Error(`Number of tokens and features do not match: tokens: ${n_tokens}, features ${n_features}`);
+ }
+
+ // Equivalent to performing a masked_scatter
+ let img = 0;
+ for (let i = 0; i < token_positions.length; ++i) {
+ const tokens = token_positions[i];
+ const embeds = inputs_embeds[i];
+ for (let j = 0; j < tokens.length; ++j) {
+ embeds[tokens[j]].data.set(modality_features[img++].data);
+ }
+ }
+ return { inputs_embeds, attention_mask };
+}
+
+export function default_merge_input_ids_with_image_features({
+ image_token_id,
+ inputs_embeds,
+ image_features,
+ input_ids,
+ attention_mask,
+}) {
+ return default_merge_input_ids_with_features({
+ modality_token_id: image_token_id,
+ inputs_embeds,
+ modality_features: image_features,
+ input_ids,
+ attention_mask,
+ });
+}
+
+export function default_merge_input_ids_with_audio_features({
+ audio_token_id,
+ inputs_embeds,
+ audio_features,
+ input_ids,
+ attention_mask,
+}) {
+ return default_merge_input_ids_with_features({
+ modality_token_id: audio_token_id,
+ inputs_embeds,
+ modality_features: audio_features,
+ input_ids,
+ attention_mask,
+ });
+}
+
+/**
+ * Helper function to load multiple optional configuration files
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the config file.
+ * @param {Record<string, string>} names The names of the config files to load.
+ * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the configs.
+ * @returns {Promise<Record<string, any>>} A Promise that resolves to a dictionary of configuration objects.
+ * @private + */ +export async function getOptionalConfigs(pretrained_model_name_or_path, names, options) { + return Object.fromEntries( + await Promise.all( + Object.keys(names).map(async (name) => { + const config = await getModelJSON(pretrained_model_name_or_path, names[name], false, options); + return [name, config]; + }), + ), + ); +} diff --git a/src/pipelines.js b/src/pipelines.js index 02392cf9d..78adf6b76 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -14,7 +14,7 @@ */ import { AutoTokenizer } from './tokenizers.js'; -import { AutoProcessor } from './models/auto/processing_auto.js'; +import { AutoProcessor } from './models/model-processors/auto/processing_auto.js'; import { AutoModel, AutoModelForSequenceClassification, diff --git a/src/pipelines/_base.js b/src/pipelines/_base.js index 4011ba33c..a8aacc8ea 100644 --- a/src/pipelines/_base.js +++ b/src/pipelines/_base.js @@ -1,5 +1,5 @@ import { PreTrainedTokenizer } from '../tokenizers.js'; -import { PreTrainedModel } from '../models.js'; +import { PreTrainedModel } from '../models/pre-trained-model.js'; import { Processor } from '../base/processing_utils.js'; import { Callable } from '../utils/generic.js'; diff --git a/src/tokenizers.js b/src/tokenizers.js index 0e69372d5..daf596312 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -31,7 +31,7 @@ import { PriorityQueue, TokenLattice, CharTrie, DictionarySplitter, LRUCache } f import { Template } from '@huggingface/jinja'; -import { WHISPER_LANGUAGE_MAPPING } from './models/whisper/common_whisper.js'; +import { WHISPER_LANGUAGE_MAPPING } from './models/model-processors/whisper/common_whisper.js'; /** * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties. diff --git a/src/transformers.js b/src/transformers.js index 2212cb251..65bb98c39 100644 --- a/src/transformers.js +++ b/src/transformers.js @@ -25,16 +25,16 @@ export * from './utils/tensor.js'; export * from './utils/maths.js'; export { FeatureExtractor } from './base/feature_extraction_utils.js'; -export * from './models/feature_extractors.js'; -export * from './models/auto/feature_extraction_auto.js'; +export * from './models/feature-extractors.js'; +export * from './models/model-processors/auto/feature_extraction_auto.js'; export { ImageProcessor } from './base/image_processors_utils.js'; -export * from './models/image_processors.js'; -export * from './models/auto/image_processing_auto.js'; +export * from './models/image-processors.js'; +export * from './models/model-processors/auto/image_processing_auto.js'; export { Processor } from './base/processing_utils.js'; export * from './models/processors.js'; -export * from './models/auto/processing_auto.js'; +export * from './models/model-processors/auto/processing_auto.js'; export * from './generation/streamers.js'; export * from './generation/stopping_criteria.js'; diff --git a/src/utils/hub.js b/src/utils/hub.js index 07c4f492c..de87f9147 100755 --- a/src/utils/hub.js +++ b/src/utils/hub.js @@ -92,45 +92,19 @@ export async function getFile(urlOrPath) { } /** - * Retrieves a file from either a remote URL using the Fetch API or from the local file system using the FileSystem API. - * If the filesystem is available and `env.useCache = true`, the file will be downloaded and cached. + * Builds the resource paths and URLs for a model file. + * Can be used to get the resource URL or path without loading the file. * * @param {string} path_or_repo_id This can be either: * - a string, the *model id* of a model repo on huggingface.co. 
* - a path to a *directory* potentially containing the file. - * @param {string} filename The name of the file to locate in `path_or_repo`. - * @param {boolean} [fatal=true] Whether to throw an error if the file is not found. + * @param {string} filename The name of the file to locate. * @param {PretrainedOptions} [options] An object containing optional parameters. - * @param {boolean} [return_path=false] Whether to return the path of the file instead of the file content. - * - * @throws Will throw an error if the file is not found and `fatal` is true. - * @returns {Promise} A Promise that resolves with the file content as a Uint8Array if `return_path` is false, or the file path as a string if `return_path` is true. + * @param {import('./cache.js').CacheInterface | null} [cache] The cache instance to use for determining cache keys. + * @returns {{ requestURL: string, localPath: string, remoteURL: string, proposedCacheKey: string, validModelId: boolean }} + * An object containing all the paths and URLs for the resource. */ -export async function getModelFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false) { - if (!env.allowLocalModels) { - // User has disabled local models, so we just make sure other settings are correct. - - if (options.local_files_only) { - throw Error( - 'Invalid configuration detected: local models are disabled (`env.allowLocalModels=false`) but you have requested to only use local models (`local_files_only=true`).', - ); - } else if (!env.allowRemoteModels) { - throw Error( - 'Invalid configuration detected: both local and remote models are disabled. Fix by setting `env.allowLocalModels` or `env.allowRemoteModels` to `true`.', - ); - } - } - - // Initiate file retrieval - dispatchCallback(options.progress_callback, { - status: 'initiate', - name: path_or_repo_id, - file: filename, - }); - - /** @type {import('./cache.js').CacheInterface | null} */ - const cache = await getCache(options?.cache_dir); - +export function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) { const revision = options.revision ?? 'main'; const requestURL = pathJoin(path_or_repo_id, filename); @@ -144,8 +118,6 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti filename, ); - /** @type {string} */ - let cacheKey; const proposedCacheKey = cache instanceof FileCache ? // Choose cache key for filesystem cache @@ -156,19 +128,125 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti : pathJoin(path_or_repo_id, revision, filename) : remoteURL; + return { + requestURL, + localPath, + remoteURL, + proposedCacheKey, + validModelId, + }; +} + +/** + * Checks if a resource exists in cache. + * + * @param {import('./cache.js').CacheInterface | null} cache The cache instance to check. + * @param {string} localPath The local path to try first. + * @param {string} proposedCacheKey The proposed cache key to try second. + * @returns {Promise} + * The cached response if found, undefined otherwise. + */ +export async function checkCachedResource(cache, localPath, proposedCacheKey) { + if (!cache) { + return undefined; + } + + // A caching system is available, so we try to get the file from it. + // 1. We first try to get from cache using the local path. In some environments (like deno), + // non-URL cache keys are not allowed. In these cases, `response` will be undefined. + // 2. If no response is found, we try to get from cache using the remote URL or file system cache. 
+ return await tryCache(cache, localPath, proposedCacheKey); +} + +/** + * Stores a resource in the cache. + * + * @param {import('./cache.js').CacheInterface} cache The cache instance to store in. + * @param {string} cacheKey The cache key to use. + * @param {Response|import('./hub/FileResponse.js').default} response The response to cache. + * @param {Uint8Array} [result] The result buffer if already read. + * @param {PretrainedOptions & { _path_or_repo_id?: string, _filename?: string }} [options] Options containing progress callback and context for progress updates. + * @returns {Promise} + */ +export async function storeCachedResource(cache, cacheKey, response, result, options = {}) { + // Check again whether request is in cache. If not, we add the response to the cache + if ((await cache.match(cacheKey)) !== undefined) { + return; + } + + if (!result) { + // We haven't yet read the response body, so we need to do so now. + // Ensure progress updates include consistent metadata. + const wrapped_progress = options.progress_callback + ? (data) => + dispatchCallback(options.progress_callback, { + status: 'progress', + name: options._path_or_repo_id, + file: options._filename, + ...data, + }) + : undefined; + await cache.put(cacheKey, /** @type {Response} */ (response), wrapped_progress); + } else if (typeof response !== 'string') { + // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files + await cache + .put( + cacheKey, + new Response(/** @type {any} */ (result), { + headers: response.headers, + }), + ) + .catch((err) => { + // Do not crash if unable to add to cache (e.g., QuotaExceededError). + // Rather, log a warning and proceed with execution. + console.warn(`Unable to add response to browser cache: ${err}.`); + }); + } +} + +/** + * Loads a resource file from local or remote sources. + * + * @param {string} path_or_repo_id This can be either: + * - a string, the *model id* of a model repo on huggingface.co. + * - a path to a *directory* potentially containing the file. + * @param {string} filename The name of the file to locate. + * @param {boolean} [fatal=true] Whether to throw an error if the file is not found. + * @param {PretrainedOptions} [options] An object containing optional parameters. + * @param {boolean} [return_path=false] Whether to return the path of the file instead of the file content. + * @param {import('./cache.js').CacheInterface | null} [cache] The cache instance to use. + * @param {{ requestURL: string, localPath: string, remoteURL: string, proposedCacheKey: string, validModelId: boolean }} [paths] Pre-built paths object. + * + * @throws Will throw an error if the file is not found and `fatal` is true. + * @returns {Promise} A Promise that resolves with the file content as a Uint8Array if `return_path` is false, or the file path as a string if `return_path` is true. + */ +export async function loadResourceFile( + path_or_repo_id, + filename, + fatal = true, + options = {}, + return_path = false, + cache = null, + paths = null, +) { + // Use pre-built paths or build them + if (!paths) { + paths = buildResourcePaths(path_or_repo_id, filename, options, cache); + } + + const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = paths; + + /** @type {string} */ + let cacheKey; + // Whether to cache the final response in the end. 
let toCacheResponse = false; /** @type {Response|import('./hub/FileResponse.js').default|undefined|string} */ let response; - if (cache) { - // A caching system is available, so we try to get the file from it. - // 1. We first try to get from cache using the local path. In some environments (like deno), - // non-URL cache keys are not allowed. In these cases, `response` will be undefined. - // 2. If no response is found, we try to get from cache using the remote URL or file system cache. - response = await tryCache(cache, localPath, proposedCacheKey); - } + // Check cache + response = await checkCachedResource(cache, localPath, proposedCacheKey); const cacheHit = response !== undefined; if (!cacheHit) { @@ -295,38 +373,17 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti // i.e., do not cache FileResponses (prevents duplication) toCacheResponse && cacheKey && - // Check again whether request is in cache. If not, we add the response to the cache - (await cache.match(cacheKey)) === undefined + typeof response !== 'string' ) { - if (!result) { - // We haven't yet read the response body, so we need to do so now. - // Ensure progress updates include consistent metadata. - const wrapped_progress = options.progress_callback - ? (data) => - dispatchCallback(options.progress_callback, { - status: 'progress', - name: path_or_repo_id, - file: filename, - ...data, - }) - : undefined; - await cache.put(cacheKey, /** @type {Response} */ (response), wrapped_progress); - } else if (typeof response !== 'string') { - // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files - await cache - .put( - cacheKey, - new Response(/** @type {any} */ (result), { - headers: response.headers, - }), - ) - .catch((err) => { - // Do not crash if unable to add to cache (e.g., QuotaExceededError). - // Rather, log a warning and proceed with execution. - console.warn(`Unable to add response to browser cache: ${err}.`); - }); - } + // Store temporary context for progress callbacks in cache storage + const extendedOptions = { + ...options, + _path_or_repo_id: path_or_repo_id, + _filename: filename, + }; + await storeCachedResource(cache, cacheKey, response, result, extendedOptions); } + dispatchCallback(options.progress_callback, { status: 'done', name: path_or_repo_id, @@ -357,6 +414,49 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti throw new Error('Unable to get model file path or buffer.'); } +/** + * Retrieves a file from either a remote URL using the Fetch API or from the local file system using the FileSystem API. + * If the filesystem is available and `env.useCache = true`, the file will be downloaded and cached. + * + * @param {string} path_or_repo_id This can be either: + * - a string, the *model id* of a model repo on huggingface.co. + * - a path to a *directory* potentially containing the file. + * @param {string} filename The name of the file to locate in `path_or_repo`. + * @param {boolean} [fatal=true] Whether to throw an error if the file is not found. + * @param {PretrainedOptions} [options] An object containing optional parameters. + * @param {boolean} [return_path=false] Whether to return the path of the file instead of the file content. + * + * @throws Will throw an error if the file is not found and `fatal` is true. + * @returns {Promise} A Promise that resolves with the file content as a Uint8Array if `return_path` is false, or the file path as a string if `return_path` is true. 
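+ *
+ * @example
+ * // A sketch: fetch a tokenizer file as raw bytes (any Hub repo id works the same way).
+ * const buffer = await getModelFile('Xenova/bert-base-uncased', 'tokenizer.json');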
+ */
+export async function getModelFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false) {
+ if (!env.allowLocalModels) {
+ // User has disabled local models, so we just make sure other settings are correct.
+
+ if (options.local_files_only) {
+ throw Error(
+ 'Invalid configuration detected: local models are disabled (`env.allowLocalModels=false`) but you have requested to only use local models (`local_files_only=true`).',
+ );
+ } else if (!env.allowRemoteModels) {
+ throw Error(
+ 'Invalid configuration detected: both local and remote models are disabled. Fix by setting `env.allowLocalModels` or `env.allowRemoteModels` to `true`.',
+ );
+ }
+ }
+
+ dispatchCallback(options.progress_callback, {
+ status: 'initiate',
+ name: path_or_repo_id,
+ file: filename,
+ });
+
+ /** @type {import('./cache.js').CacheInterface | null} */
+ const cache = await getCache(options?.cache_dir);
+ const paths = buildResourcePaths(path_or_repo_id, filename, options, cache);
+
+ return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache, paths);
+}
+
 /**
  * Fetches a text file from a given path and file name.
  *
diff --git a/src/utils/model-loader.js b/src/utils/model-loader.js
new file mode 100644
index 000000000..4bc33952d
--- /dev/null
+++ b/src/utils/model-loader.js
@@ -0,0 +1,99 @@
+import { getModelFile, MAX_EXTERNAL_DATA_CHUNKS } from './hub.js';
+import { apis } from '../env.js';
+
+/**
+ * Loads the core model file.
+ *
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the model file.
+ * @param {string} fileName The base name of the model file (without suffix or extension).
+ * @param {import('./hub.js').PretrainedModelOptions} options Additional options for loading the model.
+ * @param {string} suffix The suffix to append to the file name (e.g., '_q4', '_quantized').
+ * @returns {Promise<Uint8Array|string>} A Promise that resolves to the model file buffer or path.
+ */
+export async function getCoreModelFile(pretrained_model_name_or_path, fileName, options, suffix) {
+ const baseName = `${fileName}${suffix}.onnx`;
+ const fullPath = `${options.subfolder ?? ''}/${baseName}`;
+
+ return await getModelFile(
+ pretrained_model_name_or_path,
+ fullPath,
+ true,
+ options,
+ apis.IS_NODE_ENV,
+ );
+}
+
+/**
+ * Loads external data files for a model.
+ *
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the model files.
+ * @param {string} fileName The base name of the model file (without suffix or extension).
+ * @param {string} suffix The suffix to append to the file name (e.g., '_q4').
+ * @param {import('./hub.js').PretrainedModelOptions} options Additional options for loading the model.
+ * @param {import('./hub.js').ExternalData|Record<string, import('./hub.js').ExternalData>|undefined} use_external_data_format External data format configuration.
+ * @param {any} [session_options] Optional session options that may contain externalData configuration.
+ * @returns {Promise<Array<{path: string, data: Uint8Array}|string>>} A Promise that resolves to an array of external data files.
+ */ +export async function getModelDataFiles( + pretrained_model_name_or_path, + fileName, + suffix, + options, + use_external_data_format, + session_options = {}, +) { + const baseName = `${fileName}${suffix}.onnx`; + const return_path = apis.IS_NODE_ENV; + + /** @type {Promise[]} */ + let externalDataPromises = []; + + if (use_external_data_format) { + let external_data_format; + if (typeof use_external_data_format === 'object') { + if (use_external_data_format.hasOwnProperty(baseName)) { + external_data_format = use_external_data_format[baseName]; + } else if (use_external_data_format.hasOwnProperty(fileName)) { + external_data_format = use_external_data_format[fileName]; + } else { + external_data_format = false; + } + } else { + external_data_format = use_external_data_format; + } + + const num_chunks = +external_data_format; // (false=0, true=1, number remains the same) + if (num_chunks > MAX_EXTERNAL_DATA_CHUNKS) { + throw new Error( + `The number of external data chunks (${num_chunks}) exceeds the maximum allowed value (${MAX_EXTERNAL_DATA_CHUNKS}).`, + ); + } + for (let i = 0; i < num_chunks; ++i) { + const path = `${baseName}_data${i === 0 ? '' : '_' + i}`; + const fullPath = `${options.subfolder ?? ''}/${path}`; + externalDataPromises.push( + new Promise(async (resolve, reject) => { + const data = await getModelFile( + pretrained_model_name_or_path, + fullPath, + true, + options, + return_path, + ); + resolve(data instanceof Uint8Array ? { path, data } : path); + }), + ); + } + } else if (session_options.externalData !== undefined) { + externalDataPromises = session_options.externalData.map(async (ext) => { + // if the external data is a string, fetch the file and replace the string with its content + if (typeof ext.data === 'string') { + const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options); + return { ...ext, data: ext_buffer }; + } + return ext; + }); + } + + return Promise.all(externalDataPromises); +} \ No newline at end of file diff --git a/src/utils/tensor.js b/src/utils/tensor.js index e708c79b1..3fe0fac78 100644 --- a/src/utils/tensor.js +++ b/src/utils/tensor.js @@ -1583,3 +1583,64 @@ export function quantize_embeddings(tensor, precision) { return new Tensor(dtype, outputData, [tensor.dims[0], tensor.dims[1] / 8]); } + +/** + * Replaces ONNX Tensor objects with custom Tensor objects to support additional functions. + * @param {Object} obj The object to replace tensor objects in. + * @returns {Object} The object with tensor objects replaced by custom Tensor objects. + * @private + */ +export function replaceTensors(obj) { + for (let prop in obj) { + if (isONNXTensor(obj[prop])) { + obj[prop] = new Tensor(obj[prop]); + } else if (typeof obj[prop] === 'object') { + replaceTensors(obj[prop]); + } + } + return obj; +} + +/** + * Converts an array or Tensor of integers to an int64 Tensor. + * @param {any[]|Tensor} items The input integers to be converted. + * @returns {Tensor} The int64 Tensor with the converted values. + * @throws {Error} If the input array is empty or the input is a batched Tensor and not all sequences have the same length. 
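+ *
+ * @example
+ * // A sketch: a nested array becomes a batched int64 tensor with dims [2, 2];
+ * // a flat array such as [1, 2, 3] would instead produce dims [1, 3].
+ * const t = toI64Tensor([[1, 2], [3, 4]]);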
+ * @private + */ +export function toI64Tensor(items) { + if (items instanceof Tensor) { + return items; + } + // items is an array + if (items.length === 0) { + throw Error('items must be non-empty'); + } + + if (Array.isArray(items[0])) { + // batched + if (items.some((x) => x.length !== items[0].length)) { + throw Error( + "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.", + ); + } + + return new Tensor('int64', BigInt64Array.from(items.flat().map((x) => BigInt(x))), [ + items.length, + items[0].length, + ]); + } else { + //flat + return new Tensor('int64', BigInt64Array.from(items.map((x) => BigInt(x))), [1, items.length]); + } +} + +/** + * Creates a boolean tensor with a single value. + * @param {boolean} value The value of the tensor. + * @returns {Tensor} The boolean tensor. + * @private + */ +export function boolTensor(value) { + return new Tensor('bool', [value], [1]); +} diff --git a/webpack.config.js b/webpack.config.js deleted file mode 100644 index d1e264ac2..000000000 --- a/webpack.config.js +++ /dev/null @@ -1,224 +0,0 @@ -import { fileURLToPath } from "node:url"; -import path from "node:path"; -import fs from "node:fs"; -import webpack from "webpack"; -import TerserPlugin from "terser-webpack-plugin"; - -const __dirname = path.dirname(fileURLToPath(import.meta.url)); - -/** - * Plugin to strip the "node:" prefix from module requests. - * - * This is necessary to ensure both web and node builds work correctly, - * otherwise we would get an error like: - * ``` - * Module build failed: UnhandledSchemeError: Reading from "node:path" is not handled by plugins (Unhandled scheme). - * Webpack supports "data:" and "file:" URIs by default. - * You may need an additional plugin to handle "node:" URIs. - * ``` - * - * NOTE: We then do not need to use the `node:` prefix in the resolve.alias configuration. - */ -class StripNodePrefixPlugin extends webpack.NormalModuleReplacementPlugin { - constructor() { - super(/^node:(.+)$/, (resource) => { - resource.request = resource.request.replace(/^node:/, ""); - }); - } -} - -/** - * Plugin to post-process build files. Required to solve certain issues with ESM module output. - * See https://github.com/webpack/webpack/issues/17121 for more information. - * - * @see https://webpack.js.org/contribute/writing-a-plugin/ - */ -class PostBuildPlugin { - static completed = false; - - apply(compiler) { - compiler.hooks.done.tap("PostBuildPlugin", () => { - if (!process.env.WEBPACK_SERVE && !PostBuildPlugin.completed) { - // Ensure we only run this once - PostBuildPlugin.completed = true; - return; - } - const dist = path.join(__dirname, "dist"); - const ORT_JSEP_FILE = "ort-wasm-simd-threaded.asyncify.mjs"; - const ORT_BUNDLE_FILE = "ort.webgpu.bundle.min.mjs"; - - // 1. Copy unbundled asyncify file - { - const src = path.join( - __dirname, - "node_modules/onnxruntime-web/dist", - ORT_JSEP_FILE, - ); - const dest = path.join(dist, ORT_JSEP_FILE); - fs.copyFileSync(src, dest); - } - - // 2. Remove unnecessary files - { - const file = path.join(dist, ORT_BUNDLE_FILE); - if (fs.existsSync(file)) fs.unlinkSync(file); - } - }); - } -} - -/** - * Helper function to create webpack configurations. - * @param {Object} options Options for creating a webpack target. - * @param {string} options.name Name of output file. - * @param {string} options.suffix Suffix of output file. - * @param {string} options.type Type of library. 
- * @param {string} options.ignoreModules The list of modules to ignore. - * @param {string} options.externalModules The list of modules to set as external. - * @param {Object[]} options.plugins List of plugins to use. - * @returns {import('webpack').Configuration} One webpack target. - */ -function buildConfig({ - name = "", - suffix = ".js", - type = "module", // 'module' | 'commonjs' - ignoreModules = [], - externalModules = [], - plugins = [], -} = {}) { - const outputModule = type === "module"; - const alias = Object.fromEntries( - ignoreModules.map((module) => [module, false]), - ); - - /** @type {import('webpack').Configuration} */ - const config = { - mode: "development", - devtool: "source-map", - entry: { - [`transformers${name}`]: "./src/transformers.js", - [`transformers${name}.min`]: "./src/transformers.js", - }, - output: { - filename: `[name]${suffix}`, - path: path.join(__dirname, "dist"), - library: { - type, - }, - assetModuleFilename: "[name][ext]", - chunkFormat: false, - }, - optimization: { - minimize: true, - minimizer: [ - new TerserPlugin({ - test: new RegExp(`\\.min\\${suffix}$`), - - // Do not bundle with comments. - // See https://webpack.js.org/plugins/terser-webpack-plugin/#remove-comments for more information. - terserOptions: { - output: { - comments: false, - }, - }, - extractComments: false, - }), - ], - }, - experiments: { - outputModule, - }, - resolve: { alias }, - - externals: externalModules, - - // Development server - devServer: { - static: { - directory: __dirname, - }, - port: 8080, - }, - plugins, - }; - - if (outputModule) { - config.module = { - parser: { - javascript: { - importMeta: false, - }, - }, - }; - } else { - config.externalsType = "commonjs"; - } - - return config; -} - -// Do not bundle onnxruntime-web when packaging for Node.js. -// Instead, we use the native library (onnxruntime-node). -const NODE_IGNORE_MODULES = ["onnxruntime-web"]; - -// Do not bundle the following modules with webpack (mark as external) -// NOTE: This is necessary for both type="module" and type="commonjs", -// and will be ignored when building for web (only used for node/deno) -const NODE_EXTERNAL_MODULES = [ - "onnxruntime-common", - "onnxruntime-node", - "sharp", - "node:fs", - "node:path", - "node:url", - "node:stream", - "node:stream/promises", -]; - -// Do not bundle node-only packages when bundling for the web. -// NOTE: We can exclude the "node:" prefix for built-in modules here, -// since we apply the `StripNodePrefixPlugin` to strip it. 
-const WEB_IGNORE_MODULES = ["onnxruntime-node", "sharp", "fs", "path", "url", "stream", "stream/promises"]; - -// Do not bundle the following modules with webpack (mark as external) -const WEB_EXTERNAL_MODULES = ["onnxruntime-common", "onnxruntime-web"]; - -// Web-only build -const WEB_BUILD = buildConfig({ - name: ".web", - type: "module", - ignoreModules: WEB_IGNORE_MODULES, - externalModules: WEB_EXTERNAL_MODULES, - plugins: [new StripNodePrefixPlugin(), new PostBuildPlugin()], -}); - -// Web-only build, bundled with onnxruntime-web -const BUNDLE_BUILD = buildConfig({ - type: "module", - ignoreModules: WEB_IGNORE_MODULES, - plugins: [new StripNodePrefixPlugin(), new PostBuildPlugin()], -}); - -// Node-compatible builds -const NODE_BUILDS = [ - buildConfig({ - name: ".node", - suffix: ".mjs", - type: "module", - ignoreModules: NODE_IGNORE_MODULES, - externalModules: NODE_EXTERNAL_MODULES, - }), - buildConfig({ - name: ".node", - suffix: ".cjs", - type: "commonjs", - ignoreModules: NODE_IGNORE_MODULES, - externalModules: NODE_EXTERNAL_MODULES, - }), -]; - -// When running with `webpack serve`, only build the web target. -const BUILDS = process.env.WEBPACK_SERVE - ? [BUNDLE_BUILD] - : [BUNDLE_BUILD, WEB_BUILD, ...NODE_BUILDS]; -export default BUILDS;
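For reference, a minimal sketch of how the new position-id helper behaves on a left-padded batch. It assumes the repo-internal module paths introduced in this diff (`src/models/utils.js`, `src/utils/tensor.js`), run from the repository root; these helpers are not part of the public package API.

```js
// Minimal sketch (assumed repo-internal imports; paths relative to the repository root).
import { Tensor } from './src/utils/tensor.js';
import { createPositionIds } from './src/models/utils.js';

// Two sequences of length 5; the first is left-padded with two padding tokens.
const attention_mask = new Tensor('int64', BigInt64Array.from([0n, 0n, 1n, 1n, 1n, 1n, 1n, 1n, 1n, 1n]), [2, 5]);

// Padded positions are filled with 1, real tokens receive their cumulative index:
//   row 0 -> [1, 1, 0, 1, 2]
//   row 1 -> [0, 1, 2, 3, 4]
const position_ids = createPositionIds({ attention_mask });
console.log(position_ids.tolist());
```

This mirrors the `cumsum` / `masked_fill_` recipe quoted in the JSDoc of `createPositionIds` above.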