diff --git a/.prettierignore b/.prettierignore index e66246bae..92c713b9d 100644 --- a/.prettierignore +++ b/.prettierignore @@ -2,6 +2,6 @@ .github dist docs -scripts +scripts/**/*.py types *.md diff --git a/.prettierrc b/.prettierrc index a9e0acb4f..39d89ff84 100644 --- a/.prettierrc +++ b/.prettierrc @@ -14,6 +14,13 @@ "tabWidth": 2, "printWidth": 10000000 } + }, + { + "files": ["scripts/**/*.{js,mjs,cjs}"], + "options": { + "tabWidth": 2, + "printWidth": 120 + } } ] } diff --git a/package-lock.json b/package-lock.json index 81c2b6aeb..8c03c290b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,15 +18,13 @@ "@types/jest": "^30.0.0", "@types/node": "^24.0.11", "@webgpu/types": "^0.1.64", + "esbuild": "^0.27.2", "jest": "^30.0.4", "jest-environment-node": "^30.0.4", "jsdoc-to-markdown": "^9.1.1", "prettier": "3.4.2", "typescript": "^5.8.3", - "wavefile": "11.0.0", - "webpack": "^5.99.9", - "webpack-cli": "^6.0.1", - "webpack-dev-server": "^5.2.2" + "wavefile": "11.0.0" } }, "node_modules/@babel/code-frame": { @@ -595,16 +593,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@discoveryjs/json-ext": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/@discoveryjs/json-ext/-/json-ext-0.6.3.tgz", - "integrity": "sha512-4B4OijXeVNOPZlYA2oEwWOTkzyltLao+xbotHQeqN++Rv27Y6s818+n2Qkp8q+Fxhn0t/5lA5X1Mxktud8eayQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.17.0" - } - }, "node_modules/@emnapi/core": { "version": "1.7.1", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.7.1.tgz", @@ -639,6 +627,448 @@ "tslib": "^2.4.0" } }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.2.tgz", + "integrity": "sha512-GZMB+a0mOMZs4MpDbj8RJp4cw+w1WV5NYD6xzgvzUJ5Ek2jerwfO2eADyI6ExDSUED+1X8aMbegahsJi+8mgpw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.2.tgz", + "integrity": "sha512-DVNI8jlPa7Ujbr1yjU2PfUSRtAUZPG9I1RwW4F4xFB1Imiu2on0ADiI/c3td+KmDtVKNbi+nffGDQMfcIMkwIA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.2.tgz", + "integrity": "sha512-pvz8ZZ7ot/RBphf8fv60ljmaoydPU12VuXHImtAs0XhLLw+EXBi2BLe3OYSBslR4rryHvweW5gmkKFwTiFy6KA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.2.tgz", + "integrity": "sha512-z8Ank4Byh4TJJOh4wpz8g2vDy75zFL0TlZlkUkEwYXuPSgX8yzep596n6mT7905kA9uHZsf/o2OJZubl2l3M7A==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.2.tgz", + "integrity": "sha512-davCD2Zc80nzDVRwXTcQP/28fiJbcOwvdolL0sOiOsbwBa72kegmVU0Wrh1MYrbuCL98Omp5dVhQFWRKR2ZAlg==", + "cpu": [ + 
"arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.2.tgz", + "integrity": "sha512-ZxtijOmlQCBWGwbVmwOF/UCzuGIbUkqB1faQRf5akQmxRJ1ujusWsb3CVfk/9iZKr2L5SMU5wPBi1UWbvL+VQA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.2.tgz", + "integrity": "sha512-lS/9CN+rgqQ9czogxlMcBMGd+l8Q3Nj1MFQwBZJyoEKI50XGxwuzznYdwcav6lpOGv5BqaZXqvBSiB/kJ5op+g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.2.tgz", + "integrity": "sha512-tAfqtNYb4YgPnJlEFu4c212HYjQWSO/w/h/lQaBK7RbwGIkBOuNKQI9tqWzx7Wtp7bTPaGC6MJvWI608P3wXYA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.2.tgz", + "integrity": "sha512-vWfq4GaIMP9AIe4yj1ZUW18RDhx6EPQKjwe7n8BbIecFtCQG4CfHGaHuh7fdfq+y3LIA2vGS/o9ZBGVxIDi9hw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.2.tgz", + "integrity": "sha512-hYxN8pr66NsCCiRFkHUAsxylNOcAQaxSSkHMMjcpx0si13t1LHFphxJZUiGwojB1a/Hd5OiPIqDdXONia6bhTw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.2.tgz", + "integrity": "sha512-MJt5BRRSScPDwG2hLelYhAAKh9imjHK5+NE/tvnRLbIqUWa+0E9N4WNMjmp/kXXPHZGqPLxggwVhz7QP8CTR8w==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.2.tgz", + "integrity": "sha512-lugyF1atnAT463aO6KPshVCJK5NgRnU4yb3FUumyVz+cGvZbontBgzeGFO1nF+dPueHD367a2ZXe1NtUkAjOtg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.2.tgz", + "integrity": "sha512-nlP2I6ArEBewvJ2gjrrkESEZkB5mIoaTswuqNFRv/WYd+ATtUpe9Y09RnJvgvdag7he0OWgEZWhviS1OTOKixw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.27.2", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.2.tgz", + "integrity": "sha512-C92gnpey7tUQONqg1n6dKVbx3vphKtTHJaNG2Ok9lGwbZil6DrfyecMsp9CrmXGQJmZ7iiVXvvZH6Ml5hL6XdQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.2.tgz", + "integrity": "sha512-B5BOmojNtUyN8AXlK0QJyvjEZkWwy/FKvakkTDCziX95AowLZKR6aCDhG7LeF7uMCXEJqwa8Bejz5LTPYm8AvA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.2.tgz", + "integrity": "sha512-p4bm9+wsPwup5Z8f4EpfN63qNagQ47Ua2znaqGH6bqLlmJ4bx97Y9JdqxgGZ6Y8xVTixUnEkoKSHcpRlDnNr5w==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.2.tgz", + "integrity": "sha512-uwp2Tip5aPmH+NRUwTcfLb+W32WXjpFejTIOWZFw/v7/KnpCDKG66u4DLcurQpiYTiYwQ9B7KOeMJvLCu/OvbA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.2.tgz", + "integrity": "sha512-Kj6DiBlwXrPsCRDeRvGAUb/LNrBASrfqAIok+xB0LxK8CHqxZ037viF13ugfsIpePH93mX7xfJp97cyDuTZ3cw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.2.tgz", + "integrity": "sha512-HwGDZ0VLVBY3Y+Nw0JexZy9o/nUAWq9MlV7cahpaXKW6TOzfVno3y3/M8Ga8u8Yr7GldLOov27xiCnqRZf0tCA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.2.tgz", + "integrity": "sha512-DNIHH2BPQ5551A7oSHD0CKbwIA/Ox7+78/AWkbS5QoRzaqlev2uFayfSxq68EkonB+IKjiuxBFoV8ESJy8bOHA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.2.tgz", + "integrity": "sha512-/it7w9Nb7+0KFIzjalNJVR5bOzA9Vay+yIPLVHfIQYG/j+j9VTH84aNB8ExGKPU4AzfaEvN9/V4HV+F+vo8OEg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.2.tgz", + "integrity": "sha512-LRBbCmiU51IXfeXk59csuX/aSaToeG7w48nMwA6049Y4J4+VbWALAuXcs+qcD04rHDuSCSRKdmY63sruDS5qag==", + "cpu": [ + 
"arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.2.tgz", + "integrity": "sha512-kMtx1yqJHTmqaqHPAzKCAkDaKsffmXkPHThSfRwZGyuqyIeBvf08KSsYXl+abf5HDAPMJIPnbBfXvP2ZC2TfHg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.2.tgz", + "integrity": "sha512-Yaf78O/B3Kkh+nKABUF++bvJv5Ijoy9AN1ww904rOXZFLWVc5OLOfL56W+C8F9xn5JQZa3UX6m+IktJnIb1Jjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.2.tgz", + "integrity": "sha512-Iuws0kxo4yusk7sw70Xa2E2imZU5HoixzxfGCdxwBdhiDgt9vX9VUCBhqcwY7/uh//78A1hMkkROMJq9l27oLQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.2.tgz", + "integrity": "sha512-sRdU18mcKf7F+YgheI/zGf5alZatMUTKj/jNS6l744f9u3WFu4v7twcUI9vu4mknF4Y9aDlblIie0IM+5xxaqQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@huggingface/jinja": { "version": "0.5.1", "license": "MIT", @@ -1103,17 +1533,6 @@ "node": ">=6.0.0" } }, - "node_modules/@jridgewell/source-map": { - "version": "0.3.10", - "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.10.tgz", - "integrity": "sha512-0pPkgz9dY+bijgistcTTJ5mR+ocqRXLuhXHYdzoMmmoJ2C9S46RCm2GMUbatPEUK9Yjy26IrAy8D/M00lLkv+Q==", - "dev": true, - "license": "MIT", - "dependencies": { - "@jridgewell/gen-mapping": "^0.3.5", - "@jridgewell/trace-mapping": "^0.3.25" - } - }, "node_modules/@jridgewell/sourcemap-codec": { "version": "1.5.4", "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.4.tgz", @@ -1143,62 +1562,6 @@ "node": ">=v12.0.0" } }, - "node_modules/@jsonjoy.com/base64": { - "version": "1.1.2", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, - "node_modules/@jsonjoy.com/json-pack": { - "version": "1.1.0", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@jsonjoy.com/base64": "^1.1.1", - "@jsonjoy.com/util": "^1.1.2", - "hyperdyperid": "^1.2.0", - "thingies": "^1.20.0" - }, - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, - "node_modules/@jsonjoy.com/util": { - "version": "1.5.0", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, - 
"node_modules/@leichtgewicht/ip-codec": { - "version": "2.0.5", - "dev": true, - "license": "MIT" - }, "node_modules/@napi-rs/wasm-runtime": { "version": "0.2.12", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", @@ -1353,139 +1716,46 @@ "node_modules/@types/babel__core": { "version": "7.20.5", "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", - "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.20.7", - "@babel/types": "^7.20.7", - "@types/babel__generator": "*", - "@types/babel__template": "*", - "@types/babel__traverse": "*" - } - }, - "node_modules/@types/babel__generator": { - "version": "7.27.0", - "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", - "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.0.0" - } - }, - "node_modules/@types/babel__template": { - "version": "7.4.4", - "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", - "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.1.0", - "@babel/types": "^7.0.0" - } - }, - "node_modules/@types/babel__traverse": { - "version": "7.28.0", - "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", - "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.28.2" - } - }, - "node_modules/@types/body-parser": { - "version": "1.19.2", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/connect": "*", - "@types/node": "*" - } - }, - "node_modules/@types/bonjour": { - "version": "3.5.13", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, - "node_modules/@types/connect": { - "version": "3.4.35", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, - "node_modules/@types/connect-history-api-fallback": { - "version": "1.5.4", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/express-serve-static-core": "*", - "@types/node": "*" - } - }, - "node_modules/@types/eslint": { - "version": "9.6.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/estree": "*", - "@types/json-schema": "*" - } - }, - "node_modules/@types/eslint-scope": { - "version": "3.7.7", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/eslint": "*", - "@types/estree": "*" - } - }, - "node_modules/@types/estree": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", - "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/express": { - "version": "4.17.21", + "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", "dev": true, "license": "MIT", "dependencies": { - "@types/body-parser": "*", - "@types/express-serve-static-core": "^4.17.33", - "@types/qs": "*", - "@types/serve-static": "*" + "@babel/parser": 
"^7.20.7", + "@babel/types": "^7.20.7", + "@types/babel__generator": "*", + "@types/babel__template": "*", + "@types/babel__traverse": "*" } }, - "node_modules/@types/express-serve-static-core": { - "version": "4.17.33", + "node_modules/@types/babel__generator": { + "version": "7.27.0", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", + "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", "dev": true, "license": "MIT", "dependencies": { - "@types/node": "*", - "@types/qs": "*", - "@types/range-parser": "*" + "@babel/types": "^7.0.0" } }, - "node_modules/@types/http-errors": { - "version": "2.0.4", + "node_modules/@types/babel__template": { + "version": "7.4.4", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", + "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } }, - "node_modules/@types/http-proxy": { - "version": "1.17.10", + "node_modules/@types/babel__traverse": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", + "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", "dev": true, "license": "MIT", "dependencies": { - "@types/node": "*" + "@babel/types": "^7.28.2" } }, "node_modules/@types/istanbul-lib-coverage": { @@ -1526,13 +1796,6 @@ "pretty-format": "^30.0.0" } }, - "node_modules/@types/json-schema": { - "version": "7.0.15", - "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", - "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/linkify-it": { "version": "5.0.0", "dev": true, @@ -1552,11 +1815,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@types/mime": { - "version": "1.3.5", - "dev": true, - "license": "MIT" - }, "node_modules/@types/node": { "version": "24.0.14", "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.14.tgz", @@ -1566,64 +1824,6 @@ "undici-types": "~7.8.0" } }, - "node_modules/@types/node-forge": { - "version": "1.3.11", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, - "node_modules/@types/qs": { - "version": "6.9.7", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/range-parser": { - "version": "1.2.4", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/retry": { - "version": "0.12.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/send": { - "version": "0.17.4", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/mime": "^1", - "@types/node": "*" - } - }, - "node_modules/@types/serve-index": { - "version": "1.9.4", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/express": "*" - } - }, - "node_modules/@types/serve-static": { - "version": "1.15.7", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/http-errors": "*", - "@types/node": "*", - "@types/send": "*" - } - }, - "node_modules/@types/sockjs": { - "version": "0.3.36", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, "node_modules/@types/stack-utils": { "version": "2.0.3", "resolved": 
"https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz", @@ -1631,14 +1831,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@types/ws": { - "version": "8.5.13", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*" - } - }, "node_modules/@types/yargs": { "version": "17.0.33", "resolved": "https://registry.npmjs.org/@types/yargs/-/yargs-17.0.33.tgz", @@ -1854,316 +2046,90 @@ ], "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-x64-musl": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-musl/-/resolver-binding-linux-x64-musl-1.11.1.tgz", - "integrity": "sha512-rV0YSoyhK2nZ4vEswT/QwqzqQXw5I6CjoaYMOX0TqBlWhojUf8P94mvI7nuJTeaCkkds3QE4+zS8Ko+GdXuZtA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-wasm32-wasi": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-wasm32-wasi/-/resolver-binding-wasm32-wasi-1.11.1.tgz", - "integrity": "sha512-5u4RkfxJm+Ng7IWgkzi3qrFOvLvQYnPBmjmZQ8+szTK/b31fQCnleNl1GgEt7nIsZRIf5PLhPwT0WM+q45x/UQ==", - "cpu": [ - "wasm32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "@napi-rs/wasm-runtime": "^0.2.11" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@unrs/resolver-binding-win32-arm64-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-arm64-msvc/-/resolver-binding-win32-arm64-msvc-1.11.1.tgz", - "integrity": "sha512-nRcz5Il4ln0kMhfL8S3hLkxI85BXs3o8EYoattsJNdsX4YUU89iOkVn7g0VHSRxFuVMdM4Q1jEpIId1Ihim/Uw==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@unrs/resolver-binding-win32-ia32-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-ia32-msvc/-/resolver-binding-win32-ia32-msvc-1.11.1.tgz", - "integrity": "sha512-DCEI6t5i1NmAZp6pFonpD5m7i6aFrpofcp4LA2i8IIq60Jyo28hamKBxNrZcyOwVOZkgsRp9O2sXWBWP8MnvIQ==", - "cpu": [ - "ia32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@unrs/resolver-binding-win32-x64-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-x64-msvc/-/resolver-binding-win32-x64-msvc-1.11.1.tgz", - "integrity": "sha512-lrW200hZdbfRtztbygyaq/6jP6AKE8qQN2KvPcJ+x7wiD038YtnYtZ82IMNJ69GJibV7bwL3y9FgK+5w/pYt6g==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@webassemblyjs/ast": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/helper-numbers": "1.13.2", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2" - } - }, - "node_modules/@webassemblyjs/floating-point-hex-parser": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-api-error": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-buffer": { - "version": "1.14.1", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-numbers": { - "version": "1.13.2", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/floating-point-hex-parser": "1.13.2", - "@webassemblyjs/helper-api-error": "1.13.2", - "@xtuc/long": 
"4.2.2" - } - }, - "node_modules/@webassemblyjs/helper-wasm-bytecode": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/helper-wasm-section": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-buffer": "1.14.1", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/wasm-gen": "1.14.1" - } - }, - "node_modules/@webassemblyjs/ieee754": { - "version": "1.13.2", - "dev": true, - "license": "MIT", - "dependencies": { - "@xtuc/ieee754": "^1.2.0" - } - }, - "node_modules/@webassemblyjs/leb128": { - "version": "1.13.2", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@xtuc/long": "4.2.2" - } - }, - "node_modules/@webassemblyjs/utf8": { - "version": "1.13.2", - "dev": true, - "license": "MIT" - }, - "node_modules/@webassemblyjs/wasm-edit": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-buffer": "1.14.1", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/helper-wasm-section": "1.14.1", - "@webassemblyjs/wasm-gen": "1.14.1", - "@webassemblyjs/wasm-opt": "1.14.1", - "@webassemblyjs/wasm-parser": "1.14.1", - "@webassemblyjs/wast-printer": "1.14.1" - } - }, - "node_modules/@webassemblyjs/wasm-gen": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/ieee754": "1.13.2", - "@webassemblyjs/leb128": "1.13.2", - "@webassemblyjs/utf8": "1.13.2" - } - }, - "node_modules/@webassemblyjs/wasm-opt": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-buffer": "1.14.1", - "@webassemblyjs/wasm-gen": "1.14.1", - "@webassemblyjs/wasm-parser": "1.14.1" - } - }, - "node_modules/@webassemblyjs/wasm-parser": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@webassemblyjs/helper-api-error": "1.13.2", - "@webassemblyjs/helper-wasm-bytecode": "1.13.2", - "@webassemblyjs/ieee754": "1.13.2", - "@webassemblyjs/leb128": "1.13.2", - "@webassemblyjs/utf8": "1.13.2" - } - }, - "node_modules/@webassemblyjs/wast-printer": { - "version": "1.14.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@webassemblyjs/ast": "1.14.1", - "@xtuc/long": "4.2.2" - } - }, - "node_modules/@webgpu/types": { - "version": "0.1.64", - "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.64.tgz", - "integrity": "sha512-84kRIAGV46LJTlJZWxShiOrNL30A+9KokD7RB3dRCIqODFjodS5tCD5yyiZ8kIReGVZSDfA3XkkwyyOIF6K62A==", - "dev": true, - "license": "BSD-3-Clause" - }, - "node_modules/@webpack-cli/configtest": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@webpack-cli/configtest/-/configtest-3.0.1.tgz", - "integrity": "sha512-u8d0pJ5YFgneF/GuvEiDA61Tf1VDomHHYMjv/wc9XzYj7nopltpG96nXN5dJRstxZhcNpV1g+nT6CydO7pHbjA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18.12.0" - }, - "peerDependencies": { - "webpack": "^5.82.0", - "webpack-cli": "6.x.x" - } - }, - "node_modules/@webpack-cli/info": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@webpack-cli/info/-/info-3.0.1.tgz", - "integrity": "sha512-coEmDzc2u/ffMvuW9aCjoRzNSPDl/XLuhPdlFRpT9tZHmJ/039az33CE7uH+8s0uL1j5ZNtfdv0HkfaKRBGJsQ==", - "dev": true, - "license": "MIT", - "engines": { - 
"node": ">=18.12.0" - }, - "peerDependencies": { - "webpack": "^5.82.0", - "webpack-cli": "6.x.x" - } - }, - "node_modules/@webpack-cli/serve": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/@webpack-cli/serve/-/serve-3.0.1.tgz", - "integrity": "sha512-sbgw03xQaCLiT6gcY/6u3qBDn01CWw/nbaXl3gTdTFuJJ75Gffv3E3DBpgvY2fkkrdS1fpjaXNOmJlnbtKauKg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18.12.0" - }, - "peerDependencies": { - "webpack": "^5.82.0", - "webpack-cli": "6.x.x" - }, - "peerDependenciesMeta": { - "webpack-dev-server": { - "optional": true - } - } - }, - "node_modules/@xtuc/ieee754": { - "version": "1.2.0", - "dev": true, - "license": "BSD-3-Clause" + "optional": true, + "os": [ + "linux" + ] }, - "node_modules/@xtuc/long": { - "version": "4.2.2", + "node_modules/@unrs/resolver-binding-linux-x64-musl": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-musl/-/resolver-binding-linux-x64-musl-1.11.1.tgz", + "integrity": "sha512-rV0YSoyhK2nZ4vEswT/QwqzqQXw5I6CjoaYMOX0TqBlWhojUf8P94mvI7nuJTeaCkkds3QE4+zS8Ko+GdXuZtA==", + "cpu": [ + "x64" + ], "dev": true, - "license": "Apache-2.0" + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] }, - "node_modules/accepts": { - "version": "1.3.8", + "node_modules/@unrs/resolver-binding-wasm32-wasi": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-wasm32-wasi/-/resolver-binding-wasm32-wasi-1.11.1.tgz", + "integrity": "sha512-5u4RkfxJm+Ng7IWgkzi3qrFOvLvQYnPBmjmZQ8+szTK/b31fQCnleNl1GgEt7nIsZRIf5PLhPwT0WM+q45x/UQ==", + "cpu": [ + "wasm32" + ], "dev": true, "license": "MIT", + "optional": true, "dependencies": { - "mime-types": "~2.1.34", - "negotiator": "0.6.3" + "@napi-rs/wasm-runtime": "^0.2.11" }, "engines": { - "node": ">= 0.6" + "node": ">=14.0.0" } }, - "node_modules/acorn": { - "version": "8.15.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", - "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", + "node_modules/@unrs/resolver-binding-win32-arm64-msvc": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-arm64-msvc/-/resolver-binding-win32-arm64-msvc-1.11.1.tgz", + "integrity": "sha512-nRcz5Il4ln0kMhfL8S3hLkxI85BXs3o8EYoattsJNdsX4YUU89iOkVn7g0VHSRxFuVMdM4Q1jEpIId1Ihim/Uw==", + "cpu": [ + "arm64" + ], "dev": true, "license": "MIT", - "bin": { - "acorn": "bin/acorn" - }, - "engines": { - "node": ">=0.4.0" - } + "optional": true, + "os": [ + "win32" + ] }, - "node_modules/acorn-import-phases": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/acorn-import-phases/-/acorn-import-phases-1.0.4.tgz", - "integrity": "sha512-wKmbr/DDiIXzEOiWrTTUcDm24kQ2vGfZQvM2fwg2vXqR5uW6aapr7ObPtj1th32b9u90/Pf4AItvdTh42fBmVQ==", + "node_modules/@unrs/resolver-binding-win32-ia32-msvc": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-ia32-msvc/-/resolver-binding-win32-ia32-msvc-1.11.1.tgz", + "integrity": "sha512-DCEI6t5i1NmAZp6pFonpD5m7i6aFrpofcp4LA2i8IIq60Jyo28hamKBxNrZcyOwVOZkgsRp9O2sXWBWP8MnvIQ==", + "cpu": [ + "ia32" + ], "dev": true, "license": "MIT", - "engines": { - "node": ">=10.13.0" - }, - "peerDependencies": { - "acorn": "^8.14.0" - } + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@unrs/resolver-binding-win32-x64-msvc": { + "version": "1.11.1", + "resolved": 
"https://registry.npmjs.org/@unrs/resolver-binding-win32-x64-msvc/-/resolver-binding-win32-x64-msvc-1.11.1.tgz", + "integrity": "sha512-lrW200hZdbfRtztbygyaq/6jP6AKE8qQN2KvPcJ+x7wiD038YtnYtZ82IMNJ69GJibV7bwL3y9FgK+5w/pYt6g==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@webgpu/types": { + "version": "0.1.64", + "resolved": "https://registry.npmjs.org/@webgpu/types/-/types-0.1.64.tgz", + "integrity": "sha512-84kRIAGV46LJTlJZWxShiOrNL30A+9KokD7RB3dRCIqODFjodS5tCD5yyiZ8kIReGVZSDfA3XkkwyyOIF6K62A==", + "dev": true, + "license": "BSD-3-Clause" }, "node_modules/adm-zip": { "version": "0.5.16", @@ -2174,52 +2140,6 @@ "node": ">=12.0" } }, - "node_modules/ajv": { - "version": "8.17.1", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz", - "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3", - "fast-uri": "^3.0.1", - "json-schema-traverse": "^1.0.0", - "require-from-string": "^2.0.2" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" - } - }, - "node_modules/ajv-formats": { - "version": "2.1.1", - "dev": true, - "license": "MIT", - "dependencies": { - "ajv": "^8.0.0" - }, - "peerDependencies": { - "ajv": "^8.0.0" - }, - "peerDependenciesMeta": { - "ajv": { - "optional": true - } - } - }, - "node_modules/ajv-keywords": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz", - "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3" - }, - "peerDependencies": { - "ajv": "^8.8.2" - } - }, "node_modules/ansi-escapes": { "version": "4.3.2", "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.2.tgz", @@ -2236,17 +2156,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/ansi-html-community": { - "version": "0.0.8", - "dev": true, - "engines": [ - "node >= 0.8.0" - ], - "license": "Apache-2.0", - "bin": { - "ansi-html": "bin/ansi-html" - } - }, "node_modules/ansi-regex": { "version": "6.2.2", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.2.2.tgz", @@ -2405,59 +2314,11 @@ "dev": true, "license": "MIT" }, - "node_modules/batch": { - "version": "0.6.1", - "dev": true, - "license": "MIT" - }, - "node_modules/binary-extensions": { - "version": "2.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/bluebird": { "version": "3.7.2", "dev": true, "license": "MIT" }, - "node_modules/body-parser": { - "version": "1.20.3", - "dev": true, - "license": "MIT", - "dependencies": { - "bytes": "3.1.2", - "content-type": "~1.0.5", - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "on-finished": "2.4.1", - "qs": "6.13.0", - "raw-body": "2.5.2", - "type-is": "~1.6.18", - "unpipe": "1.0.0" - }, - "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" - } - }, - "node_modules/bonjour-service": { - "version": "1.3.0", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3", - "multicast-dns": "^7.2.5" - } - }, "node_modules/boolean": { "version": "3.2.0", "license": "MIT" @@ -2531,30 +2392,6 
@@ "dev": true, "license": "MIT" }, - "node_modules/bundle-name": { - "version": "4.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "run-applescript": "^7.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/bytes": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", - "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/cache-point": { "version": "3.0.1", "dev": true, @@ -2574,24 +2411,6 @@ } } }, - "node_modules/call-bind": { - "version": "1.0.7", - "dev": true, - "license": "MIT", - "dependencies": { - "es-define-property": "^1.0.0", - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.4", - "set-function-length": "^1.2.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/callsites": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", @@ -2681,37 +2500,6 @@ "node": ">=10" } }, - "node_modules/chokidar": { - "version": "3.6.0", - "dev": true, - "license": "MIT", - "dependencies": { - "anymatch": "~3.1.2", - "braces": "~3.0.2", - "glob-parent": "~5.1.2", - "is-binary-path": "~2.1.0", - "is-glob": "~4.0.1", - "normalize-path": "~3.0.0", - "readdirp": "~3.6.0" - }, - "engines": { - "node": ">= 8.10.0" - }, - "funding": { - "url": "https://paulmillr.com/funding/" - }, - "optionalDependencies": { - "fsevents": "~2.3.2" - } - }, - "node_modules/chrome-trace-event": { - "version": "1.0.3", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.0" - } - }, "node_modules/ci-info": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-4.3.0.tgz", @@ -2813,21 +2601,6 @@ "url": "https://github.com/chalk/wrap-ansi?sponsor=1" } }, - "node_modules/clone-deep": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-4.0.1.tgz", - "integrity": "sha512-neHB9xuzh/wk0dIHweyAXv2aPGZIVk3pLMe+/RNzINf17fe0OG96QroktYAUm7SM1PBnzTabaLboqqxDyMU+SQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "is-plain-object": "^2.0.4", - "kind-of": "^6.0.2", - "shallow-clone": "^3.0.0" - }, - "engines": { - "node": ">=6" - } - }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -2879,11 +2652,6 @@ "simple-swizzle": "^0.2.2" } }, - "node_modules/colorette": { - "version": "2.0.20", - "dev": true, - "license": "MIT" - }, "node_modules/command-line-args": { "version": "6.0.1", "dev": true, @@ -2920,109 +2688,35 @@ "node": ">=12.20.0" } }, - "node_modules/commander": { - "version": "2.20.3", - "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", - "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", - "dev": true, - "license": "MIT" - }, "node_modules/common-sequence": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12.17" - } - }, - "node_modules/compressible": { - "version": "2.0.18", - "dev": true, - "license": "MIT", - "dependencies": { - "mime-db": ">= 1.43.0 < 2" - }, - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/compression": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/compression/-/compression-1.8.1.tgz", 
- "integrity": "sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==", - "dev": true, - "license": "MIT", - "dependencies": { - "bytes": "3.1.2", - "compressible": "~2.0.18", - "debug": "2.6.9", - "negotiator": "~0.6.4", - "on-headers": "~1.1.0", - "safe-buffer": "5.2.1", - "vary": "~1.1.2" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/compression/node_modules/negotiator": { - "version": "0.6.4", - "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.4.tgz", - "integrity": "sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true, - "license": "MIT" - }, - "node_modules/config-master": { - "version": "3.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "walk-back": "^2.0.1" - } - }, - "node_modules/config-master/node_modules/walk-back": { - "version": "2.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/connect-history-api-fallback": { - "version": "2.0.0", + "version": "3.0.0", "dev": true, "license": "MIT", "engines": { - "node": ">=0.8" + "node": ">=12.17" } }, - "node_modules/content-disposition": { - "version": "0.5.4", + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true, + "license": "MIT" + }, + "node_modules/config-master": { + "version": "3.1.0", "dev": true, "license": "MIT", "dependencies": { - "safe-buffer": "5.2.1" - }, - "engines": { - "node": ">= 0.6" + "walk-back": "^2.0.1" } }, - "node_modules/content-type": { - "version": "1.0.5", + "node_modules/config-master/node_modules/walk-back": { + "version": "2.0.1", "dev": true, "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">=0.10.0" } }, "node_modules/convert-source-map": { @@ -3032,24 +2726,6 @@ "dev": true, "license": "MIT" }, - "node_modules/cookie": { - "version": "0.7.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/cookie-signature": { - "version": "1.0.6", - "dev": true, - "license": "MIT" - }, - "node_modules/core-util-is": { - "version": "1.0.3", - "dev": true, - "license": "MIT" - }, "node_modules/cross-spawn": { "version": "7.0.6", "dev": true, @@ -3071,14 +2747,6 @@ "node": ">=12.17" } }, - "node_modules/debug": { - "version": "2.6.9", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "2.0.0" - } - }, "node_modules/dedent": { "version": "1.7.0", "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.7.0.tgz", @@ -3104,32 +2772,6 @@ "node": ">=0.10.0" } }, - "node_modules/default-browser": { - "version": "5.2.1", - "dev": true, - "license": "MIT", - "dependencies": { - "bundle-name": "^4.1.0", - "default-browser-id": "^5.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/default-browser-id": { - "version": "5.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": 
"https://github.com/sponsors/sindresorhus" - } - }, "node_modules/define-data-property": { "version": "1.1.4", "license": "MIT", @@ -3145,17 +2787,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/define-lazy-prop": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/define-properties": { "version": "1.2.1", "license": "MIT", @@ -3171,23 +2802,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/depd": { - "version": "2.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/destroy": { - "version": "1.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" - } - }, "node_modules/detect-libc": { "version": "2.0.4", "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.4.tgz", @@ -3236,17 +2850,6 @@ } } }, - "node_modules/dns-packet": { - "version": "5.6.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@leichtgewicht/ip-codec": "^2.0.1" - }, - "engines": { - "node": ">=6" - } - }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -3254,11 +2857,6 @@ "dev": true, "license": "MIT" }, - "node_modules/ee-first": { - "version": "1.1.1", - "dev": true, - "license": "MIT" - }, "node_modules/electron-to-chromium": { "version": "1.5.50", "dev": true, @@ -3284,28 +2882,6 @@ "dev": true, "license": "MIT" }, - "node_modules/encodeurl": { - "version": "2.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/enhanced-resolve": { - "version": "5.18.2", - "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.18.2.tgz", - "integrity": "sha512-6Jw4sE1maoRJo3q8MsSIn2onJFbLTOjY9hlx4DZXmOKvLRd1Ok2kXmAGXaafL2+ijsJZ1ClYbl/pmqr9+k4iUQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "graceful-fs": "^4.2.4", - "tapable": "^2.2.0" - }, - "engines": { - "node": ">=10.13.0" - } - }, "node_modules/entities": { "version": "4.5.0", "dev": true, @@ -3317,19 +2893,6 @@ "url": "https://github.com/fb55/entities?sponsor=1" } }, - "node_modules/envinfo": { - "version": "7.14.0", - "resolved": "https://registry.npmjs.org/envinfo/-/envinfo-7.14.0.tgz", - "integrity": "sha512-CO40UI41xDQzhLB1hWyqUKgFhs250pNcGbyGKe1l/e4FSaI/+YE4IMG76GDt0In67WLPACIITC+sOi08x4wIvg==", - "dev": true, - "license": "MIT", - "bin": { - "envinfo": "dist/cli.js" - }, - "engines": { - "node": ">=4" - } - }, "node_modules/error-ex": { "version": "1.3.4", "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.4.tgz", @@ -3364,15 +2927,52 @@ "node": ">= 0.4" } }, - "node_modules/es-module-lexer": { - "version": "1.2.1", - "dev": true, - "license": "MIT" - }, "node_modules/es6-error": { "version": "4.1.1", "license": "MIT" }, + "node_modules/esbuild": { + "version": "0.27.2", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.2.tgz", + "integrity": "sha512-HyNQImnsOC7X9PMNaCIeAm4ISCQXs5a5YasTXVliKv4uuBo1dKrG0A+uQS8M5eXjVMnLg3WgXaKvprHlFJQffw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.27.2", + "@esbuild/android-arm": "0.27.2", + "@esbuild/android-arm64": "0.27.2", + "@esbuild/android-x64": "0.27.2", + 
"@esbuild/darwin-arm64": "0.27.2", + "@esbuild/darwin-x64": "0.27.2", + "@esbuild/freebsd-arm64": "0.27.2", + "@esbuild/freebsd-x64": "0.27.2", + "@esbuild/linux-arm": "0.27.2", + "@esbuild/linux-arm64": "0.27.2", + "@esbuild/linux-ia32": "0.27.2", + "@esbuild/linux-loong64": "0.27.2", + "@esbuild/linux-mips64el": "0.27.2", + "@esbuild/linux-ppc64": "0.27.2", + "@esbuild/linux-riscv64": "0.27.2", + "@esbuild/linux-s390x": "0.27.2", + "@esbuild/linux-x64": "0.27.2", + "@esbuild/netbsd-arm64": "0.27.2", + "@esbuild/netbsd-x64": "0.27.2", + "@esbuild/openbsd-arm64": "0.27.2", + "@esbuild/openbsd-x64": "0.27.2", + "@esbuild/openharmony-arm64": "0.27.2", + "@esbuild/sunos-x64": "0.27.2", + "@esbuild/win32-arm64": "0.27.2", + "@esbuild/win32-ia32": "0.27.2", + "@esbuild/win32-x64": "0.27.2" + } + }, "node_modules/escalade": { "version": "3.2.0", "dev": true, @@ -3381,11 +2981,6 @@ "node": ">=6" } }, - "node_modules/escape-html": { - "version": "1.0.3", - "dev": true, - "license": "MIT" - }, "node_modules/escape-string-regexp": { "version": "2.0.0", "dev": true, @@ -3394,18 +2989,6 @@ "node": ">=8" } }, - "node_modules/eslint-scope": { - "version": "5.1.1", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "esrecurse": "^4.3.0", - "estraverse": "^4.1.1" - }, - "engines": { - "node": ">=8.0.0" - } - }, "node_modules/esprima": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", @@ -3420,54 +3003,6 @@ "node": ">=4" } }, - "node_modules/esrecurse": { - "version": "4.3.0", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "estraverse": "^5.2.0" - }, - "engines": { - "node": ">=4.0" - } - }, - "node_modules/esrecurse/node_modules/estraverse": { - "version": "5.3.0", - "dev": true, - "license": "BSD-2-Clause", - "engines": { - "node": ">=4.0" - } - }, - "node_modules/estraverse": { - "version": "4.3.0", - "dev": true, - "license": "BSD-2-Clause", - "engines": { - "node": ">=4.0" - } - }, - "node_modules/etag": { - "version": "1.8.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/eventemitter3": { - "version": "4.0.7", - "dev": true, - "license": "MIT" - }, - "node_modules/events": { - "version": "3.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.8.x" - } - }, "node_modules/execa": { "version": "5.1.1", "resolved": "https://registry.npmjs.org/execa/-/execa-5.1.1.tgz", @@ -3527,61 +3062,6 @@ "node": "^18.14.0 || ^20.0.0 || ^22.0.0 || >=24.0.0" } }, - "node_modules/express": { - "version": "4.21.2", - "dev": true, - "license": "MIT", - "dependencies": { - "accepts": "~1.3.8", - "array-flatten": "1.1.1", - "body-parser": "1.20.3", - "content-disposition": "0.5.4", - "content-type": "~1.0.4", - "cookie": "0.7.1", - "cookie-signature": "1.0.6", - "debug": "2.6.9", - "depd": "2.0.0", - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "finalhandler": "1.3.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", - "merge-descriptors": "1.0.3", - "methods": "~1.1.2", - "on-finished": "2.4.1", - "parseurl": "~1.3.3", - "path-to-regexp": "0.1.12", - "proxy-addr": "~2.0.7", - "qs": "6.13.0", - "range-parser": "~1.2.1", - "safe-buffer": "5.2.1", - "send": "0.19.0", - "serve-static": "1.16.2", - "setprototypeof": "1.2.0", - "statuses": "2.0.1", - "type-is": "~1.6.18", - "utils-merge": "1.0.1", - "vary": "~1.1.2" - }, - "engines": { - "node": ">= 0.10.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" - } - }, - 
"node_modules/express/node_modules/array-flatten": { - "version": "1.1.1", - "dev": true, - "license": "MIT" - }, - "node_modules/fast-deep-equal": { - "version": "3.1.3", - "dev": true, - "license": "MIT" - }, "node_modules/fast-glob": { "version": "3.3.2", "dev": true, @@ -3604,19 +3084,6 @@ "dev": true, "license": "MIT" }, - "node_modules/fast-uri": { - "version": "3.0.3", - "dev": true, - "license": "BSD-3-Clause" - }, - "node_modules/fastest-levenshtein": { - "version": "1.0.16", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4.9.1" - } - }, "node_modules/fastq": { "version": "1.17.1", "dev": true, @@ -3625,17 +3092,6 @@ "reusify": "^1.0.4" } }, - "node_modules/faye-websocket": { - "version": "0.11.4", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "websocket-driver": ">=0.5.1" - }, - "engines": { - "node": ">=0.8.0" - } - }, "node_modules/fb-watchman": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/fb-watchman/-/fb-watchman-2.0.2.tgz", @@ -3677,23 +3133,6 @@ "node": ">=8" } }, - "node_modules/finalhandler": { - "version": "1.3.1", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "2.6.9", - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "on-finished": "2.4.1", - "parseurl": "~1.3.3", - "statuses": "2.0.1", - "unpipe": "~1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/find-replace": { "version": "5.0.2", "dev": true, @@ -3722,39 +3161,10 @@ "node": ">=8" } }, - "node_modules/flat": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/flat/-/flat-5.0.2.tgz", - "integrity": "sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==", - "dev": true, - "license": "BSD-3-Clause", - "bin": { - "flat": "cli.js" - } - }, "node_modules/flatbuffers": { "version": "25.1.24", "license": "Apache-2.0" }, - "node_modules/follow-redirects": { - "version": "1.15.6", - "dev": true, - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], - "license": "MIT", - "engines": { - "node": ">=4.0" - }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } - } - }, "node_modules/foreground-child": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", @@ -3772,22 +3182,6 @@ "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/forwarded": { - "version": "0.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/fresh": { - "version": "0.5.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, "node_modules/fs.realpath": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", @@ -3909,11 +3303,6 @@ "node": ">= 6" } }, - "node_modules/glob-to-regexp": { - "version": "0.4.1", - "dev": true, - "license": "BSD-2-Clause" - }, "node_modules/global-agent": { "version": "3.0.0", "license": "BSD-3-Clause", @@ -3962,11 +3351,6 @@ "version": "1.0.9", "license": "ISC" }, - "node_modules/handle-thing": { - "version": "2.0.1", - "dev": true, - "license": "MIT" - }, "node_modules/handlebars": { "version": "4.7.8", "dev": true, @@ -3983,19 +3367,8 @@ "engines": { "node": ">=0.4.7" }, - "optionalDependencies": { - "uglify-js": "^3.1.4" - } - }, - "node_modules/has": { - "version": "1.0.3", - "dev": true, - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.1" - }, - "engines": { - "node": ">= 0.4.0" + "optionalDependencies": { + "uglify-js": 
"^3.1.4" } }, "node_modules/has-flag": { @@ -4046,44 +3419,6 @@ "node": ">= 0.4" } }, - "node_modules/hpack.js": { - "version": "2.1.6", - "dev": true, - "license": "MIT", - "dependencies": { - "inherits": "^2.0.1", - "obuf": "^1.0.0", - "readable-stream": "^2.0.1", - "wbuf": "^1.1.0" - } - }, - "node_modules/hpack.js/node_modules/readable-stream": { - "version": "2.3.8", - "dev": true, - "license": "MIT", - "dependencies": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.3", - "isarray": "~1.0.0", - "process-nextick-args": "~2.0.0", - "safe-buffer": "~5.1.1", - "string_decoder": "~1.1.1", - "util-deprecate": "~1.0.1" - } - }, - "node_modules/hpack.js/node_modules/safe-buffer": { - "version": "5.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/hpack.js/node_modules/string_decoder": { - "version": "1.1.1", - "dev": true, - "license": "MIT", - "dependencies": { - "safe-buffer": "~5.1.0" - } - }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -4091,67 +3426,6 @@ "dev": true, "license": "MIT" }, - "node_modules/http-deceiver": { - "version": "1.2.7", - "dev": true, - "license": "MIT" - }, - "node_modules/http-errors": { - "version": "2.0.0", - "dev": true, - "license": "MIT", - "dependencies": { - "depd": "2.0.0", - "inherits": "2.0.4", - "setprototypeof": "1.2.0", - "statuses": "2.0.1", - "toidentifier": "1.0.1" - }, - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/http-parser-js": { - "version": "0.5.8", - "dev": true, - "license": "MIT" - }, - "node_modules/http-proxy": { - "version": "1.18.1", - "dev": true, - "license": "MIT", - "dependencies": { - "eventemitter3": "^4.0.0", - "follow-redirects": "^1.0.0", - "requires-port": "^1.0.0" - }, - "engines": { - "node": ">=8.0.0" - } - }, - "node_modules/http-proxy-middleware": { - "version": "2.0.9", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/http-proxy": "^1.17.8", - "http-proxy": "^1.18.1", - "is-glob": "^4.0.1", - "is-plain-obj": "^3.0.0", - "micromatch": "^4.0.2" - }, - "engines": { - "node": ">=12.0.0" - }, - "peerDependencies": { - "@types/express": "^4.17.13" - }, - "peerDependenciesMeta": { - "@types/express": { - "optional": true - } - } - }, "node_modules/human-signals": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz", @@ -4162,25 +3436,6 @@ "node": ">=10.17.0" } }, - "node_modules/hyperdyperid": { - "version": "1.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.18" - } - }, - "node_modules/iconv-lite": { - "version": "0.4.24", - "dev": true, - "license": "MIT", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/import-local": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/import-local/-/import-local-3.2.0.tgz", @@ -4228,62 +3483,10 @@ "dev": true, "license": "ISC" }, - "node_modules/interpret": { - "version": "3.1.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.13.0" - } - }, - "node_modules/ipaddr.js": { - "version": "2.2.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 10" - } - }, "node_modules/is-arrayish": { "version": "0.3.2", "license": "MIT" }, - "node_modules/is-binary-path": { - "version": "2.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "binary-extensions": "^2.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/is-core-module": { - "version": "2.12.0", - "dev": 
true, - "license": "MIT", - "dependencies": { - "has": "^1.0.3" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/is-docker": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "bin": { - "is-docker": "cli.js" - }, - "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/is-extglob": { "version": "2.1.1", "dev": true, @@ -4323,34 +3526,6 @@ "node": ">=0.10.0" } }, - "node_modules/is-inside-container": { - "version": "1.0.0", - "dev": true, - "license": "MIT", - "dependencies": { - "is-docker": "^3.0.0" - }, - "bin": { - "is-inside-container": "cli.js" - }, - "engines": { - "node": ">=14.16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/is-network-error": { - "version": "1.1.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/is-number": { "version": "7.0.0", "dev": true, @@ -4359,30 +3534,6 @@ "node": ">=0.12.0" } }, - "node_modules/is-plain-obj": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/is-plain-object": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", - "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", - "dev": true, - "license": "MIT", - "dependencies": { - "isobject": "^3.0.1" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-stream": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", @@ -4396,40 +3547,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/is-wsl": { - "version": "3.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "is-inside-container": "^1.0.0" - }, - "engines": { - "node": ">=16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/isarray": { - "version": "1.0.0", - "dev": true, - "license": "MIT" - }, "node_modules/isexe": { "version": "2.0.0", "dev": true, "license": "ISC" }, - "node_modules/isobject": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", - "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/istanbul-lib-coverage": { "version": "3.2.2", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", @@ -5336,13 +4458,6 @@ "dev": true, "license": "MIT" }, - "node_modules/json-schema-traverse": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", - "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", - "dev": true, - "license": "MIT" - }, "node_modules/json-stringify-safe": { "version": "5.0.1", "license": "ISC" @@ -5360,16 +4475,6 @@ "node": ">=6" } }, - "node_modules/kind-of": { - "version": "6.0.3", - "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.3.tgz", - "integrity": 
"sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/klaw": { "version": "3.0.0", "dev": true, @@ -5378,15 +4483,6 @@ "graceful-fs": "^4.1.9" } }, - "node_modules/launch-editor": { - "version": "2.9.1", - "dev": true, - "license": "MIT", - "dependencies": { - "picocolors": "^1.0.0", - "shell-quote": "^1.8.1" - } - }, "node_modules/leven": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", @@ -5412,14 +4508,6 @@ "uc.micro": "^2.0.0" } }, - "node_modules/loader-runner": { - "version": "4.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.11.5" - } - }, "node_modules/locate-path": { "version": "5.0.0", "dev": true, @@ -5547,103 +4635,31 @@ "dev": true, "license": "MIT" }, - "node_modules/media-typer": { - "version": "0.3.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/memfs": { - "version": "4.14.1", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "@jsonjoy.com/json-pack": "^1.0.3", - "@jsonjoy.com/util": "^1.3.0", - "tree-dump": "^1.0.1", - "tslib": "^2.0.0" - }, - "engines": { - "node": ">= 4.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - } - }, - "node_modules/merge-descriptors": { - "version": "1.0.3", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/merge-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", - "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", - "dev": true, - "license": "MIT" - }, - "node_modules/merge2": { - "version": "1.4.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 8" - } - }, - "node_modules/methods": { - "version": "1.1.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/micromatch": { - "version": "4.0.8", - "dev": true, - "license": "MIT", - "dependencies": { - "braces": "^3.0.3", - "picomatch": "^2.3.1" - }, - "engines": { - "node": ">=8.6" - } - }, - "node_modules/mime": { - "version": "1.6.0", - "dev": true, - "license": "MIT", - "bin": { - "mime": "cli.js" - }, - "engines": { - "node": ">=4" - } + "integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==", + "dev": true, + "license": "MIT" }, - "node_modules/mime-db": { - "version": "1.52.0", + "node_modules/merge2": { + "version": "1.4.1", "dev": true, "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">= 8" } }, - "node_modules/mime-types": { - "version": "2.1.35", + "node_modules/micromatch": { + "version": "4.0.8", "dev": true, "license": "MIT", "dependencies": { - "mime-db": "1.52.0" + "braces": "^3.0.3", + "picomatch": "^2.3.1" }, "engines": { - "node": ">= 0.6" + "node": ">=8.6" } }, "node_modules/mimic-fn": { @@ -5656,11 +4672,6 @@ "node": ">=6" } }, - "node_modules/minimalistic-assert": { - "version": "1.0.1", - "dev": true, - "license": "ISC" - }, "node_modules/minimatch": { "version": "9.0.5", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.5.tgz", @@ -5706,23 +4717,6 @@ "node": ">=10" } }, - "node_modules/ms": { - "version": "2.0.0", - "dev": true, - "license": "MIT" - }, - "node_modules/multicast-dns": { - "version": "7.2.5", - "dev": true, - "license": 
"MIT", - "dependencies": { - "dns-packet": "^5.2.2", - "thunky": "^1.0.2" - }, - "bin": { - "multicast-dns": "cli.js" - } - }, "node_modules/napi-postinstall": { "version": "0.3.4", "resolved": "https://registry.npmjs.org/napi-postinstall/-/napi-postinstall-0.3.4.tgz", @@ -5746,27 +4740,11 @@ "dev": true, "license": "MIT" }, - "node_modules/negotiator": { - "version": "0.6.3", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, "node_modules/neo-async": { "version": "2.6.2", "dev": true, "license": "MIT" }, - "node_modules/node-forge": { - "version": "1.3.1", - "dev": true, - "license": "(BSD-3-Clause OR GPL-2.0)", - "engines": { - "node": ">= 6.13.0" - } - }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -5800,17 +4778,6 @@ "node": ">=8" } }, - "node_modules/object-inspect": { - "version": "1.13.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/object-keys": { "version": "1.1.1", "license": "MIT", @@ -5826,32 +4793,6 @@ "node": ">=8.0.0" } }, - "node_modules/obuf": { - "version": "1.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/on-finished": { - "version": "2.4.1", - "dev": true, - "license": "MIT", - "dependencies": { - "ee-first": "1.1.1" - }, - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/on-headers": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.1.0.tgz", - "integrity": "sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -5915,23 +4856,6 @@ "protobufjs": "^7.2.4" } }, - "node_modules/open": { - "version": "10.1.0", - "dev": true, - "license": "MIT", - "dependencies": { - "default-browser": "^5.2.1", - "define-lazy-prop": "^3.0.0", - "is-inside-container": "^1.0.0", - "is-wsl": "^3.1.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/p-limit": { "version": "2.3.0", "dev": true, @@ -5957,22 +4881,6 @@ "node": ">=8" } }, - "node_modules/p-retry": { - "version": "6.2.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/retry": "0.12.2", - "is-network-error": "^1.0.0", - "retry": "^0.13.1" - }, - "engines": { - "node": ">=16.17" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/p-try": { "version": "2.2.0", "dev": true, @@ -6007,14 +4915,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/parseurl": { - "version": "1.3.3", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/path-exists": { "version": "4.0.0", "dev": true, @@ -6041,11 +4941,6 @@ "node": ">=8" } }, - "node_modules/path-parse": { - "version": "1.0.7", - "dev": true, - "license": "MIT" - }, "node_modules/path-scurry": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.1.tgz", @@ -6070,11 +4965,6 @@ "dev": true, "license": "ISC" }, - "node_modules/path-to-regexp": { - "version": "0.1.12", - "dev": true, - "license": "MIT" - }, "node_modules/picocolors": { "version": "1.1.1", "dev": true, @@ -6158,11 +5048,6 @@ "url": 
"https://github.com/chalk/ansi-styles?sponsor=1" } }, - "node_modules/process-nextick-args": { - "version": "2.0.1", - "dev": true, - "license": "MIT" - }, "node_modules/protobufjs": { "version": "7.2.6", "hasInstallScript": true, @@ -6185,26 +5070,6 @@ "node": ">=12.0.0" } }, - "node_modules/proxy-addr": { - "version": "2.0.7", - "dev": true, - "license": "MIT", - "dependencies": { - "forwarded": "0.2.0", - "ipaddr.js": "1.9.1" - }, - "engines": { - "node": ">= 0.10" - } - }, - "node_modules/proxy-addr/node_modules/ipaddr.js": { - "version": "1.9.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.10" - } - }, "node_modules/punycode.js": { "version": "2.3.1", "dev": true, @@ -6230,20 +5095,6 @@ ], "license": "MIT" }, - "node_modules/qs": { - "version": "6.13.0", - "dev": true, - "license": "BSD-3-Clause", - "dependencies": { - "side-channel": "^1.0.6" - }, - "engines": { - "node": ">=0.6" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/queue-microtask": { "version": "1.2.3", "dev": true, @@ -6263,38 +5114,6 @@ ], "license": "MIT" }, - "node_modules/randombytes": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", - "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "safe-buffer": "^5.1.0" - } - }, - "node_modules/range-parser": { - "version": "1.2.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/raw-body": { - "version": "2.5.2", - "dev": true, - "license": "MIT", - "dependencies": { - "bytes": "3.1.2", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "unpipe": "1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/react-is": { "version": "18.3.1", "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.3.1.tgz", @@ -6302,41 +5121,6 @@ "dev": true, "license": "MIT" }, - "node_modules/readable-stream": { - "version": "3.6.1", - "dev": true, - "license": "MIT", - "dependencies": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/readdirp": { - "version": "3.6.0", - "dev": true, - "license": "MIT", - "dependencies": { - "picomatch": "^2.2.1" - }, - "engines": { - "node": ">=8.10.0" - } - }, - "node_modules/rechoir": { - "version": "0.8.0", - "dev": true, - "license": "MIT", - "dependencies": { - "resolve": "^1.20.0" - }, - "engines": { - "node": ">= 10.13.0" - } - }, "node_modules/require-directory": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", @@ -6347,19 +5131,6 @@ "node": ">=0.10.0" } }, - "node_modules/require-from-string": { - "version": "2.0.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/requires-port": { - "version": "1.0.0", - "dev": true, - "license": "MIT" - }, "node_modules/requizzle": { "version": "0.2.4", "dev": true, @@ -6368,22 +5139,6 @@ "lodash": "^4.17.21" } }, - "node_modules/resolve": { - "version": "1.22.2", - "dev": true, - "license": "MIT", - "dependencies": { - "is-core-module": "^2.11.0", - "path-parse": "^1.0.7", - "supports-preserve-symlinks-flag": "^1.0.0" - }, - "bin": { - "resolve": "bin/resolve" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/resolve-cwd": { "version": "3.0.0", "dev": true, @@ -6403,14 +5158,6 
@@ "node": ">=8" } }, - "node_modules/retry": { - "version": "0.13.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, "node_modules/reusify": { "version": "1.0.4", "dev": true, @@ -6439,17 +5186,6 @@ "version": "1.1.3", "license": "BSD-3-Clause" }, - "node_modules/run-applescript": { - "version": "7.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, "node_modules/run-parallel": { "version": "1.2.0", "dev": true, @@ -6472,117 +5208,20 @@ "queue-microtask": "^1.2.2" } }, - "node_modules/safe-buffer": { - "version": "5.2.1", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" - }, - "node_modules/safer-buffer": { - "version": "2.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/schema-utils": { - "version": "4.3.2", - "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.2.tgz", - "integrity": "sha512-Gn/JaSk/Mt9gYubxTtSn/QCV4em9mpAPiR1rqy/Ocu19u/G9J5WWdNoUT4SiV6mFC3y6cxyFcFwdzPM3FgxGAQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/json-schema": "^7.0.9", - "ajv": "^8.9.0", - "ajv-formats": "^2.1.1", - "ajv-keywords": "^5.1.0" - }, - "engines": { - "node": ">= 10.13.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - } - }, - "node_modules/select-hose": { - "version": "2.0.0", - "dev": true, - "license": "MIT" - }, - "node_modules/selfsigned": { - "version": "2.4.1", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node-forge": "^1.3.0", - "node-forge": "^1" - }, - "engines": { - "node": ">=10" - } - }, "node_modules/semver": { "version": "7.7.2", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==", - "license": "ISC", - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/semver-compare": { - "version": "1.0.0", - "license": "MIT" - }, - "node_modules/send": { - "version": "0.19.0", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", - "mime": "1.6.0", - "ms": "2.1.3", - "on-finished": "2.4.1", - "range-parser": "~1.2.1", - "statuses": "2.0.1" + "license": "ISC", + "bin": { + "semver": "bin/semver.js" }, "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/send/node_modules/encodeurl": { - "version": "1.0.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" + "node": ">=10" } }, - "node_modules/send/node_modules/ms": { - "version": "2.1.3", - "dev": true, + "node_modules/semver-compare": { + "version": "1.0.0", "license": "MIT" }, "node_modules/serialize-error": { @@ -6608,121 +5247,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/serialize-javascript": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-6.0.2.tgz", - "integrity": "sha512-Saa1xPByTTq2gdeFZYLLo+RFE35NHZkAbqZeWNd3BpzppeVisAqpDjcp8dyf6uIvEqJRd46jemmyA4iFIeVk8g==", - "dev": true, - "license": "BSD-3-Clause", - 
"dependencies": { - "randombytes": "^2.1.0" - } - }, - "node_modules/serve-index": { - "version": "1.9.1", - "dev": true, - "license": "MIT", - "dependencies": { - "accepts": "~1.3.4", - "batch": "0.6.1", - "debug": "2.6.9", - "escape-html": "~1.0.3", - "http-errors": "~1.6.2", - "mime-types": "~2.1.17", - "parseurl": "~1.3.2" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/serve-index/node_modules/depd": { - "version": "1.1.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/serve-index/node_modules/http-errors": { - "version": "1.6.3", - "dev": true, - "license": "MIT", - "dependencies": { - "depd": "~1.1.2", - "inherits": "2.0.3", - "setprototypeof": "1.1.0", - "statuses": ">= 1.4.0 < 2" - }, - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/serve-index/node_modules/inherits": { - "version": "2.0.3", - "dev": true, - "license": "ISC" - }, - "node_modules/serve-index/node_modules/setprototypeof": { - "version": "1.1.0", - "dev": true, - "license": "ISC" - }, - "node_modules/serve-index/node_modules/statuses": { - "version": "1.5.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/serve-static": { - "version": "1.16.2", - "dev": true, - "license": "MIT", - "dependencies": { - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "parseurl": "~1.3.3", - "send": "0.19.0" - }, - "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/set-function-length": { - "version": "1.2.2", - "dev": true, - "license": "MIT", - "dependencies": { - "define-data-property": "^1.1.4", - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", - "get-intrinsic": "^1.2.4", - "gopd": "^1.0.1", - "has-property-descriptors": "^1.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/setprototypeof": { - "version": "1.2.0", - "dev": true, - "license": "ISC" - }, - "node_modules/shallow-clone": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-3.0.1.tgz", - "integrity": "sha512-/6KqX+GVUdqPuPPd2LxDDxzX6CAbjJehAAOKlNpqqUpAqPM6HeL8f+o3a+JsyGjn2lv0WY8UsTgUJjU9Ok55NA==", - "dev": true, - "license": "MIT", - "dependencies": { - "kind-of": "^6.0.2" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/sharp": { "version": "0.34.3", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.3.tgz", @@ -6784,34 +5308,6 @@ "node": ">=8" } }, - "node_modules/shell-quote": { - "version": "1.8.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/side-channel": { - "version": "1.0.6", - "dev": true, - "license": "MIT", - "dependencies": { - "call-bind": "^1.0.7", - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.4", - "object-inspect": "^1.13.1" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/signal-exit": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", @@ -6842,16 +5338,6 @@ "node": ">=8" } }, - "node_modules/sockjs": { - "version": "0.3.24", - "dev": true, - "license": "MIT", - "dependencies": { - "faye-websocket": "^0.11.3", - "uuid": "^8.3.2", - "websocket-driver": "^0.7.4" - } - }, "node_modules/sort-array": { "version": "5.0.0", "dev": true, @@ -6891,76 +5377,6 @@ "source-map": "^0.6.0" } }, - "node_modules/spdy": { - "version": "4.0.2", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": 
"^4.1.0", - "handle-thing": "^2.0.0", - "http-deceiver": "^1.2.7", - "select-hose": "^2.0.0", - "spdy-transport": "^3.0.0" - }, - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/spdy-transport": { - "version": "3.0.0", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "^4.1.0", - "detect-node": "^2.0.4", - "hpack.js": "^2.1.6", - "obuf": "^1.1.2", - "readable-stream": "^3.0.6", - "wbuf": "^1.7.3" - } - }, - "node_modules/spdy-transport/node_modules/debug": { - "version": "4.3.4", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "2.1.2" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, - "node_modules/spdy-transport/node_modules/ms": { - "version": "2.1.2", - "dev": true, - "license": "MIT" - }, - "node_modules/spdy/node_modules/debug": { - "version": "4.3.4", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "2.1.2" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, - "node_modules/spdy/node_modules/ms": { - "version": "2.1.2", - "dev": true, - "license": "MIT" - }, "node_modules/sprintf-js": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", @@ -6981,22 +5397,6 @@ "node": ">=10" } }, - "node_modules/statuses": { - "version": "2.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/string_decoder": { - "version": "1.3.0", - "dev": true, - "license": "MIT", - "dependencies": { - "safe-buffer": "~5.2.0" - } - }, "node_modules/string-length": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", @@ -7151,180 +5551,63 @@ "node_modules/strip-final-newline": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/strip-final-newline/-/strip-final-newline-2.0.0.tgz", - "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/strip-json-comments": { - "version": "3.1.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/supports-color": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", - "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", - "dev": true, - "license": "MIT", - "dependencies": { - "has-flag": "^4.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/supports-preserve-symlinks-flag": { - "version": "1.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/synckit": { - "version": "0.11.11", - "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.11.tgz", - "integrity": "sha512-MeQTA1r0litLUf0Rp/iisCaL8761lKAZHaimlbGK4j0HysC4PLfqygQj9srcs0m2RdtDYnF8UuYyKpbjHYp7Jw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@pkgr/core": "^0.2.9" - }, - "engines": { - "node": "^14.18.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/synckit" - } - }, - "node_modules/table-layout": { - "version": "4.1.1", - "dev": true, - "license": "MIT", - "dependencies": { - "array-back": "^6.2.2", - "wordwrapjs": "^5.1.0" - }, - 
"engines": { - "node": ">=12.17" - } - }, - "node_modules/tapable": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.2.tgz", - "integrity": "sha512-Re10+NauLTMCudc7T5WLFLAwDhQ0JWdrMK+9B2M8zR5hRExKmsRDCBA7/aV/pNJFltmBFO5BAMlQFi/vq3nKOg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/terser": { - "version": "5.43.1", - "resolved": "https://registry.npmjs.org/terser/-/terser-5.43.1.tgz", - "integrity": "sha512-+6erLbBm0+LROX2sPXlUYx/ux5PyE9K/a92Wrt6oA+WDAoFTdpHE5tCYCI5PNzq2y8df4rA+QgHLJuR4jNymsg==", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "@jridgewell/source-map": "^0.3.3", - "acorn": "^8.14.0", - "commander": "^2.20.0", - "source-map-support": "~0.5.20" - }, - "bin": { - "terser": "bin/terser" - }, + "integrity": "sha512-BrpvfNAE3dcvq7ll3xVumzjKjZQ5tI1sEUIKr3Uoks0XUl45St3FlatVqef9prk4jRDzhW6WZg+3bk93y6pLjA==", + "dev": true, + "license": "MIT", "engines": { - "node": ">=10" + "node": ">=6" } }, - "node_modules/terser-webpack-plugin": { - "version": "5.3.14", - "resolved": "https://registry.npmjs.org/terser-webpack-plugin/-/terser-webpack-plugin-5.3.14.tgz", - "integrity": "sha512-vkZjpUjb6OMS7dhV+tILUW6BhpDR7P2L/aQSAv+Uwk+m8KATX9EccViHTJR2qDtACKPIYndLGCyl3FMo+r2LMw==", + "node_modules/strip-json-comments": { + "version": "3.1.1", "dev": true, "license": "MIT", - "dependencies": { - "@jridgewell/trace-mapping": "^0.3.25", - "jest-worker": "^27.4.5", - "schema-utils": "^4.3.0", - "serialize-javascript": "^6.0.2", - "terser": "^5.31.1" - }, "engines": { - "node": ">= 10.13.0" + "node": ">=8" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.1.0" - }, - "peerDependenciesMeta": { - "@swc/core": { - "optional": true - }, - "esbuild": { - "optional": true - }, - "uglify-js": { - "optional": true - } + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/terser-webpack-plugin/node_modules/jest-worker": { - "version": "27.5.1", - "resolved": "https://registry.npmjs.org/jest-worker/-/jest-worker-27.5.1.tgz", - "integrity": "sha512-7vuh85V5cdDofPyxn58nrPjBktZo0u9x1g8WtjQol+jZDaE+fhN+cIvTj11GndBnMnyfrUOG1sZQxCdjKh+DKg==", + "node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", "dev": true, "license": "MIT", "dependencies": { - "@types/node": "*", - "merge-stream": "^2.0.0", - "supports-color": "^8.0.0" + "has-flag": "^4.0.0" }, "engines": { - "node": ">= 10.13.0" + "node": ">=8" } }, - "node_modules/terser-webpack-plugin/node_modules/supports-color": { - "version": "8.1.1", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", - "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", + "node_modules/synckit": { + "version": "0.11.11", + "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.11.tgz", + "integrity": "sha512-MeQTA1r0litLUf0Rp/iisCaL8761lKAZHaimlbGK4j0HysC4PLfqygQj9srcs0m2RdtDYnF8UuYyKpbjHYp7Jw==", "dev": true, "license": "MIT", "dependencies": { - "has-flag": "^4.0.0" + "@pkgr/core": "^0.2.9" }, "engines": { - "node": ">=10" + "node": "^14.18.0 || >=16.0.0" }, "funding": { - "url": "https://github.com/chalk/supports-color?sponsor=1" + "url": "https://opencollective.com/synckit" } }, - 
"node_modules/terser/node_modules/source-map-support": { - "version": "0.5.21", - "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", - "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "node_modules/table-layout": { + "version": "4.1.1", "dev": true, "license": "MIT", "dependencies": { - "buffer-from": "^1.0.0", - "source-map": "^0.6.0" + "array-back": "^6.2.2", + "wordwrapjs": "^5.1.0" + }, + "engines": { + "node": ">=12.17" } }, "node_modules/test-exclude": { @@ -7388,22 +5671,6 @@ "node": "*" } }, - "node_modules/thingies": { - "version": "1.21.0", - "dev": true, - "license": "Unlicense", - "engines": { - "node": ">=10.18" - }, - "peerDependencies": { - "tslib": "^2" - } - }, - "node_modules/thunky": { - "version": "1.1.0", - "dev": true, - "license": "MIT" - }, "node_modules/tmpl": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz", @@ -7422,33 +5689,11 @@ "node": ">=8.0" } }, - "node_modules/toidentifier": { - "version": "1.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.6" - } - }, - "node_modules/tree-dump": { - "version": "1.0.2", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=10.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/streamich" - }, - "peerDependencies": { - "tslib": "2" - } - }, "node_modules/tslib": { "version": "2.6.3", "dev": true, - "license": "0BSD" + "license": "0BSD", + "optional": true }, "node_modules/type-detect": { "version": "4.0.8", @@ -7473,18 +5718,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/type-is": { - "version": "1.6.18", - "dev": true, - "license": "MIT", - "dependencies": { - "media-typer": "0.3.0", - "mime-types": "~2.1.24" - }, - "engines": { - "node": ">= 0.6" - } - }, "node_modules/typescript": { "version": "5.8.3", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", @@ -7535,14 +5768,6 @@ "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==", "license": "MIT" }, - "node_modules/unpipe": { - "version": "1.0.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/unrs-resolver": { "version": "1.11.1", "resolved": "https://registry.npmjs.org/unrs-resolver/-/unrs-resolver-1.11.1.tgz", @@ -7607,27 +5832,6 @@ "browserslist": ">= 4.21.0" } }, - "node_modules/util-deprecate": { - "version": "1.0.2", - "dev": true, - "license": "MIT" - }, - "node_modules/utils-merge": { - "version": "1.0.1", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.4.0" - } - }, - "node_modules/uuid": { - "version": "8.3.2", - "dev": true, - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, "node_modules/v8-to-istanbul": { "version": "9.3.0", "resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.3.0.tgz", @@ -7643,14 +5847,6 @@ "node": ">=10.12.0" } }, - "node_modules/vary": { - "version": "1.1.2", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/walk-back": { "version": "5.1.1", "dev": true, @@ -7669,18 +5865,6 @@ "makeerror": "1.0.12" } }, - "node_modules/watchpack": { - "version": "2.4.2", - "dev": true, - "license": "MIT", - "dependencies": { - "glob-to-regexp": "^0.4.1", - "graceful-fs": "^4.1.2" - }, - "engines": { - "node": ">=10.13.0" - } - }, "node_modules/wavefile": { "version": "11.0.0", "dev": true, @@ 
-7692,246 +5876,6 @@ "node": ">=8" } }, - "node_modules/wbuf": { - "version": "1.7.3", - "dev": true, - "license": "MIT", - "dependencies": { - "minimalistic-assert": "^1.0.0" - } - }, - "node_modules/webpack": { - "version": "5.100.2", - "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.100.2.tgz", - "integrity": "sha512-QaNKAvGCDRh3wW1dsDjeMdDXwZm2vqq3zn6Pvq4rHOEOGSaUMgOOjG2Y9ZbIGzpfkJk9ZYTHpDqgDfeBDcnLaw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/eslint-scope": "^3.7.7", - "@types/estree": "^1.0.8", - "@types/json-schema": "^7.0.15", - "@webassemblyjs/ast": "^1.14.1", - "@webassemblyjs/wasm-edit": "^1.14.1", - "@webassemblyjs/wasm-parser": "^1.14.1", - "acorn": "^8.15.0", - "acorn-import-phases": "^1.0.3", - "browserslist": "^4.24.0", - "chrome-trace-event": "^1.0.2", - "enhanced-resolve": "^5.17.2", - "es-module-lexer": "^1.2.1", - "eslint-scope": "5.1.1", - "events": "^3.2.0", - "glob-to-regexp": "^0.4.1", - "graceful-fs": "^4.2.11", - "json-parse-even-better-errors": "^2.3.1", - "loader-runner": "^4.2.0", - "mime-types": "^2.1.27", - "neo-async": "^2.6.2", - "schema-utils": "^4.3.2", - "tapable": "^2.1.1", - "terser-webpack-plugin": "^5.3.11", - "watchpack": "^2.4.1", - "webpack-sources": "^3.3.3" - }, - "bin": { - "webpack": "bin/webpack.js" - }, - "engines": { - "node": ">=10.13.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependenciesMeta": { - "webpack-cli": { - "optional": true - } - } - }, - "node_modules/webpack-cli": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/webpack-cli/-/webpack-cli-6.0.1.tgz", - "integrity": "sha512-MfwFQ6SfwinsUVi0rNJm7rHZ31GyTcpVE5pgVA3hwFRb7COD4TzjUUwhGWKfO50+xdc2MQPuEBBJoqIMGt3JDw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@discoveryjs/json-ext": "^0.6.1", - "@webpack-cli/configtest": "^3.0.1", - "@webpack-cli/info": "^3.0.1", - "@webpack-cli/serve": "^3.0.1", - "colorette": "^2.0.14", - "commander": "^12.1.0", - "cross-spawn": "^7.0.3", - "envinfo": "^7.14.0", - "fastest-levenshtein": "^1.0.12", - "import-local": "^3.0.2", - "interpret": "^3.1.1", - "rechoir": "^0.8.0", - "webpack-merge": "^6.0.1" - }, - "bin": { - "webpack-cli": "bin/cli.js" - }, - "engines": { - "node": ">=18.12.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.82.0" - }, - "peerDependenciesMeta": { - "webpack-bundle-analyzer": { - "optional": true - }, - "webpack-dev-server": { - "optional": true - } - } - }, - "node_modules/webpack-cli/node_modules/commander": { - "version": "12.1.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz", - "integrity": "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - } - }, - "node_modules/webpack-dev-middleware": { - "version": "7.4.2", - "dev": true, - "license": "MIT", - "dependencies": { - "colorette": "^2.0.10", - "memfs": "^4.6.0", - "mime-types": "^2.1.31", - "on-finished": "^2.4.1", - "range-parser": "^1.2.1", - "schema-utils": "^4.0.0" - }, - "engines": { - "node": ">= 18.12.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.0.0" - }, - "peerDependenciesMeta": { - "webpack": { - "optional": true - } - } - }, - "node_modules/webpack-dev-server": { - "version": "5.2.2", - "dev": 
true, - "license": "MIT", - "dependencies": { - "@types/bonjour": "^3.5.13", - "@types/connect-history-api-fallback": "^1.5.4", - "@types/express": "^4.17.21", - "@types/express-serve-static-core": "^4.17.21", - "@types/serve-index": "^1.9.4", - "@types/serve-static": "^1.15.5", - "@types/sockjs": "^0.3.36", - "@types/ws": "^8.5.10", - "ansi-html-community": "^0.0.8", - "bonjour-service": "^1.2.1", - "chokidar": "^3.6.0", - "colorette": "^2.0.10", - "compression": "^1.7.4", - "connect-history-api-fallback": "^2.0.0", - "express": "^4.21.2", - "graceful-fs": "^4.2.6", - "http-proxy-middleware": "^2.0.9", - "ipaddr.js": "^2.1.0", - "launch-editor": "^2.6.1", - "open": "^10.0.3", - "p-retry": "^6.2.0", - "schema-utils": "^4.2.0", - "selfsigned": "^2.4.1", - "serve-index": "^1.9.1", - "sockjs": "^0.3.24", - "spdy": "^4.0.2", - "webpack-dev-middleware": "^7.4.2", - "ws": "^8.18.0" - }, - "bin": { - "webpack-dev-server": "bin/webpack-dev-server.js" - }, - "engines": { - "node": ">= 18.12.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependencies": { - "webpack": "^5.0.0" - }, - "peerDependenciesMeta": { - "webpack": { - "optional": true - }, - "webpack-cli": { - "optional": true - } - } - }, - "node_modules/webpack-merge": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/webpack-merge/-/webpack-merge-6.0.1.tgz", - "integrity": "sha512-hXXvrjtx2PLYx4qruKl+kyRSLc52V+cCvMxRjmKwoA+CBbbF5GfIBtR6kCvl0fYGqTUPKB+1ktVmTHqMOzgCBg==", - "dev": true, - "license": "MIT", - "dependencies": { - "clone-deep": "^4.0.1", - "flat": "^5.0.2", - "wildcard": "^2.0.1" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/webpack-sources": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.3.3.tgz", - "integrity": "sha512-yd1RBzSGanHkitROoPFd6qsrxt+oFhg/129YzheDGqeustzX0vTZJZsSsQjVQC4yzBQ56K55XU8gaNCtIzOnTg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.13.0" - } - }, - "node_modules/websocket-driver": { - "version": "0.7.4", - "dev": true, - "license": "Apache-2.0", - "dependencies": { - "http-parser-js": ">=0.5.1", - "safe-buffer": ">=5.1.0", - "websocket-extensions": ">=0.1.1" - }, - "engines": { - "node": ">=0.8.0" - } - }, - "node_modules/websocket-extensions": { - "version": "0.1.4", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=0.8.0" - } - }, "node_modules/which": { "version": "2.0.2", "dev": true, @@ -7946,13 +5890,6 @@ "node": ">= 8" } }, - "node_modules/wildcard": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/wildcard/-/wildcard-2.0.1.tgz", - "integrity": "sha512-CC1bOL87PIWSBhDcTrdeLo6eGT7mCFtrg0uIJtqJUFyK+eJnzl8A1niH56uu7KMa5XFrtiV+AQuHO3n7DsHnLQ==", - "dev": true, - "license": "MIT" - }, "node_modules/wordwrap": { "version": "1.0.0", "dev": true, @@ -8082,26 +6019,6 @@ "node": "^14.17.0 || ^16.13.0 || >=18.0.0" } }, - "node_modules/ws": { - "version": "8.18.0", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10.0.0" - }, - "peerDependencies": { - "bufferutil": "^4.0.1", - "utf-8-validate": ">=5.0.2" - }, - "peerDependenciesMeta": { - "bufferutil": { - "optional": true - }, - "utf-8-validate": { - "optional": true - } - } - }, "node_modules/xmlcreate": { "version": "2.0.4", "dev": true, diff --git a/package.json b/package.json index ed843857d..2f856d89b 100644 --- a/package.json +++ b/package.json @@ -25,8 +25,8 @@ "format": "prettier --write .", "format:check": "prettier --check .", 
"typegen": "tsc --build", - "dev": "webpack serve --no-client-overlay", - "build": "webpack && npm run typegen", + "dev": "node scripts/esbuild/dev.mjs", + "build": "node scripts/esbuild/build.mjs && npm run typegen", "test": "node --experimental-vm-modules --expose-gc node_modules/jest/bin/jest.js --verbose --logHeapUsage", "readme": "python ./docs/scripts/build_readme.py", "docs-api": "node ./docs/scripts/generate.js", @@ -64,15 +64,13 @@ "@types/jest": "^30.0.0", "@types/node": "^24.0.11", "@webgpu/types": "^0.1.64", + "esbuild": "^0.27.2", "jest": "^30.0.4", "jest-environment-node": "^30.0.4", "jsdoc-to-markdown": "^9.1.1", "prettier": "3.4.2", "typescript": "^5.8.3", - "wavefile": "11.0.0", - "webpack": "^5.99.9", - "webpack-cli": "^6.0.1", - "webpack-dev-server": "^5.2.2" + "wavefile": "11.0.0" }, "files": [ "src", diff --git a/scripts/esbuild/build.mjs b/scripts/esbuild/build.mjs new file mode 100644 index 000000000..8fa39c0c8 --- /dev/null +++ b/scripts/esbuild/build.mjs @@ -0,0 +1,130 @@ +import { build as esbuild } from "esbuild"; +import path from "node:path"; +import { stripNodePrefixPlugin } from "./build/plugins/stripNodePrefixPlugin.mjs"; +import { ignoreModulesPlugin } from "./build/plugins/ignoreModulesPlugin.mjs"; +import { postBuildPlugin } from "./build/plugins/postBuildPlugin.mjs"; +import { externalNodeBuiltinsPlugin } from "./build/plugins/externalNodeBuiltinsPlugin.mjs"; +import { + NODE_IGNORE_MODULES, + NODE_EXTERNAL_MODULES, + WEB_IGNORE_MODULES, + WEB_EXTERNAL_MODULES, + OUT_DIR, + ROOT_DIR, + getEsbuildProdConfig, +} from "./build/constants.mjs"; +import { reportSize } from "./build/reportSize.mjs"; +import prepareOutDir from "./build/prepareOutDir.mjs"; + +/** + * + * Helper function to create build configurations. + * Equivalent to webpack's buildConfig function. + */ +async function buildTarget({ + name = "", + suffix = ".js", + format = "esm", // 'esm' | 'cjs' + ignoreModules = [], + externalModules = [], + usePostBuild = false, +}) { + const platform = format === "cjs" ? 
"node" : "neutral"; + + const regularFile = `transformers${name}${suffix}`; + const minFile = `transformers${name}.min${suffix}`; + + const plugins = []; + // Add ignoreModulesPlugin FIRST so it can catch modules before stripNodePrefixPlugin marks them as external + if (ignoreModules.length > 0) { + plugins.push(ignoreModulesPlugin(ignoreModules)); + } + plugins.push(stripNodePrefixPlugin()); + plugins.push(externalNodeBuiltinsPlugin()); + if (usePostBuild) { + plugins.push(postBuildPlugin(OUT_DIR, ROOT_DIR)); + } + + console.log(`\nBuilding ${regularFile}...`); + await esbuild({ + ...getEsbuildProdConfig(ROOT_DIR), + platform, + format, + outfile: path.join(OUT_DIR, regularFile), + external: externalModules, + plugins, + }); + reportSize(path.join(OUT_DIR, regularFile)); + + console.log(`\nBuilding ${minFile}...`); + await esbuild({ + ...getEsbuildProdConfig(ROOT_DIR), + platform, + format, + outfile: path.join(OUT_DIR, minFile), + minify: true, + external: externalModules, + plugins, + legalComments: "none", + }); + reportSize(path.join(OUT_DIR, minFile)); +} + +console.log("\nBuilding transformers.js with esbuild...\n"); + +const startTime = performance.now(); + +try { + prepareOutDir(OUT_DIR); + + // Bundle build - bundles everything except ignored modules + console.log("\n=== Bundle Build (ESM) ==="); + await buildTarget({ + name: "", + suffix: ".js", + format: "esm", + ignoreModules: WEB_IGNORE_MODULES, + externalModules: [], + usePostBuild: true, + }); + + // Web build - external onnxruntime libs + console.log("\n=== Web Build (ESM) ==="); + await buildTarget({ + name: ".web", + suffix: ".js", + format: "esm", + ignoreModules: WEB_IGNORE_MODULES, + externalModules: WEB_EXTERNAL_MODULES, + usePostBuild: false, + }); + + // Node ESM build + console.log("\n=== Node Build (ESM) ==="); + await buildTarget({ + name: ".node", + suffix: ".mjs", + format: "esm", + ignoreModules: NODE_IGNORE_MODULES, + externalModules: NODE_EXTERNAL_MODULES, + usePostBuild: false, + }); + + // Node CJS build + console.log("\n=== Node Build (CJS) ==="); + await buildTarget({ + name: ".node", + suffix: ".cjs", + format: "cjs", + ignoreModules: NODE_IGNORE_MODULES, + externalModules: NODE_EXTERNAL_MODULES, + usePostBuild: false, + }); + + const endTime = performance.now(); + const duration = (endTime - startTime).toFixed(2); + console.log(`\nAll builds completed successfully in ${duration}ms!\n`); +} catch (error) { + console.error("\nBuild failed:", error); + process.exit(1); +} diff --git a/scripts/esbuild/build/constants.mjs b/scripts/esbuild/build/constants.mjs new file mode 100644 index 000000000..14c80ae2f --- /dev/null +++ b/scripts/esbuild/build/constants.mjs @@ -0,0 +1,38 @@ +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +export const DIST_FOLDER = "dist"; +export const NODE_IGNORE_MODULES = ["onnxruntime-web"]; +export const NODE_EXTERNAL_MODULES = [ + "onnxruntime-common", + "onnxruntime-node", + "sharp", + // node:* modules are handled by externalNodeBuiltinsPlugin +]; + +export const WEB_IGNORE_MODULES = ["onnxruntime-node", "sharp", "fs", "path", "url", "stream", "stream/promises"]; +export const WEB_EXTERNAL_MODULES = ["onnxruntime-common", "onnxruntime-web"]; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +export const ROOT_DIR = path.join(__dirname, "../../.."); +export const OUT_DIR = path.join(ROOT_DIR, DIST_FOLDER); + +export const getEsbuildDevConfig = (rootDir) => ({ + bundle: true, + treeShaking: true, + logLevel: "info", + entryPoints: 
[path.join(rootDir, "src/transformers.js")], + platform: "neutral", + format: "esm", + sourcemap: true, + logOverride: { + // Suppress import.meta warning for CJS builds - it's handled gracefully in the code + "empty-import-meta": "silent", + }, +}); + +export const getEsbuildProdConfig = (rootDir) => ({ + ...getEsbuildDevConfig(rootDir), + logLevel: "warning", + sourcemap: false, +}); diff --git a/scripts/esbuild/build/httpServer.mjs b/scripts/esbuild/build/httpServer.mjs new file mode 100644 index 000000000..8c86514de --- /dev/null +++ b/scripts/esbuild/build/httpServer.mjs @@ -0,0 +1,74 @@ +import { createServer } from "node:http"; +import { existsSync, readFileSync, statSync } from "node:fs"; +import path from "node:path"; + +const MIME_TYPES = { + ".html": "text/html", + ".js": "text/javascript", + ".mjs": "text/javascript", + ".css": "text/css", + ".json": "application/json", + ".wasm": "application/wasm", + ".png": "image/png", + ".jpg": "image/jpeg", + ".gif": "image/gif", + ".svg": "image/svg+xml", + ".ico": "image/x-icon", +}; + +export const startServer = (dir, PORT = 8080) => + new Promise((resolve) => { + const server = createServer((req, res) => { + // Enable CORS + res.setHeader("Access-Control-Allow-Origin", "*"); + res.setHeader("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); + res.setHeader("Access-Control-Allow-Headers", "Content-Type"); + + if (req.method === "OPTIONS") { + res.writeHead(204); + res.end(); + return; + } + + let filePath = req.url === "/" ? "/index.html" : req.url; + filePath = filePath.split("?")[0]; // Remove query params + + // Try to serve from outdir first, then fall back to rootDir + let fullPath = path.join(dir, filePath); + + // Check if file exists + if (!existsSync(fullPath)) { + res.writeHead(404, { "Content-Type": "text/plain" }); + res.end("404 Not Found"); + return; + } + + // Check if it's a directory + const stat = statSync(fullPath); + if (stat.isDirectory()) { + fullPath = path.join(fullPath, "index.html"); + if (!existsSync(fullPath)) { + res.writeHead(404, { "Content-Type": "text/plain" }); + res.end("404 Not Found"); + return; + } + } + + // Get MIME type + const ext = path.extname(fullPath); + const mimeType = MIME_TYPES[ext] || "application/octet-stream"; + + try { + const content = readFileSync(fullPath); + res.writeHead(200, { "Content-Type": mimeType }); + res.end(content); + } catch (error) { + res.writeHead(500, { "Content-Type": "text/plain" }); + res.end("500 Internal Server Error"); + } + }); + + server.listen(PORT, () => { + resolve(server); + }); + }); diff --git a/scripts/esbuild/build/plugins/externalNodeBuiltinsPlugin.mjs b/scripts/esbuild/build/plugins/externalNodeBuiltinsPlugin.mjs new file mode 100644 index 000000000..ce0b28cee --- /dev/null +++ b/scripts/esbuild/build/plugins/externalNodeBuiltinsPlugin.mjs @@ -0,0 +1,14 @@ +/** + * Plugin to automatically mark all node:* imports as external. + * This prevents having to manually list all Node.js built-in modules. 
+ */ +export const externalNodeBuiltinsPlugin = () => ({ + name: "external-node-builtins", + setup(build) { + // Mark all node:* imports as external + build.onResolve({ filter: /^node:/ }, (args) => ({ + path: args.path, + external: true, + })); + }, +}); diff --git a/scripts/esbuild/build/plugins/ignoreModulesPlugin.mjs b/scripts/esbuild/build/plugins/ignoreModulesPlugin.mjs new file mode 100644 index 000000000..c9d57e6c6 --- /dev/null +++ b/scripts/esbuild/build/plugins/ignoreModulesPlugin.mjs @@ -0,0 +1,33 @@ +/** + * Plugin to ignore/exclude certain modules by returning an empty module. + * Equivalent to webpack's resolve.alias with false value. + */ +export const ignoreModulesPlugin = (modules = []) => ({ + name: "ignore-modules", + setup(build) { + // Escape special regex characters in module names + const escapeRegex = (str) => str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + const escapedModules = modules.map(escapeRegex); + + // Match both "module" and "node:module" patterns + const patterns = escapedModules.flatMap((mod) => [mod, `node:${mod}`]); + const filter = new RegExp(`^(${patterns.join("|")})$`); + + build.onResolve({ filter }, (args) => { + return { path: args.path, namespace: "ignore-modules" }; + }); + build.onLoad({ filter: /.*/, namespace: "ignore-modules" }, () => { + return { + contents: ` + const noop = () => {}; + const emptyObj = {}; + export default emptyObj; + export const Readable = { fromWeb: noop }; + export const pipeline = noop; + export const createWriteStream = noop; + export const createReadStream = noop; + `, + }; + }); + }, +}); diff --git a/scripts/esbuild/build/plugins/postBuildPlugin.mjs b/scripts/esbuild/build/plugins/postBuildPlugin.mjs new file mode 100644 index 000000000..8effd6970 --- /dev/null +++ b/scripts/esbuild/build/plugins/postBuildPlugin.mjs @@ -0,0 +1,38 @@ +import path from "node:path"; +import { copyFileSync, unlinkSync, existsSync } from "node:fs"; + +/** + * Plugin to post-process build files. + * Equivalent to webpack's PostBuildPlugin. + */ +export const postBuildPlugin = (distDir, rootDir) => { + // it should copy the files only once. In watch mode for example it should not rerun every time + let completed = false; + return { + name: "post-build", + setup(build) { + build.onEnd(() => { + if (completed) return; + completed = true; + + const ORT_JSEP_FILE = "ort-wasm-simd-threaded.jsep.mjs"; + const ORT_BUNDLE_FILE = "ort.bundle.min.mjs"; + + // 1. Remove unnecessary files + const file = path.join(distDir, ORT_BUNDLE_FILE); + if (existsSync(file)) unlinkSync(file); + + // 2. Copy unbundled JSEP file + try { + const ORT_SOURCE_DIR = path.join(rootDir, "node_modules/onnxruntime-web/dist"); + const src = path.join(ORT_SOURCE_DIR, ORT_JSEP_FILE); + const dest = path.join(distDir, ORT_JSEP_FILE); + copyFileSync(src, dest); + console.log(`Copied ${ORT_JSEP_FILE}`); + } catch (error) { + console.warn(`!!! 
Warning: Could not copy ${ORT_JSEP_FILE}:`, error.message); + } + }); + }, + }; +}; diff --git a/scripts/esbuild/build/plugins/rebuildPlugin.mjs b/scripts/esbuild/build/plugins/rebuildPlugin.mjs new file mode 100644 index 000000000..c385292d4 --- /dev/null +++ b/scripts/esbuild/build/plugins/rebuildPlugin.mjs @@ -0,0 +1,26 @@ +/** + * Plugin to log rebuild events with timing + */ +export const rebuildPlugin = (name) => { + let startTime = 0; + + return { + name: "rebuild-logger", + setup(build) { + build.onStart(() => { + startTime = performance.now(); + }); + + build.onEnd((result) => { + const endTime = performance.now(); + const duration = (endTime - startTime).toFixed(2); + + if (result.errors.length > 0) { + console.log(`\n${name} - Build failed with ${result.errors.length} error(s) in ${duration}ms`); + } else { + console.log(`\n${name} - Rebuilt in ${duration}ms`); + } + }); + }, + }; +}; diff --git a/scripts/esbuild/build/plugins/stripNodePrefixPlugin.mjs b/scripts/esbuild/build/plugins/stripNodePrefixPlugin.mjs new file mode 100644 index 000000000..1a4cab983 --- /dev/null +++ b/scripts/esbuild/build/plugins/stripNodePrefixPlugin.mjs @@ -0,0 +1,15 @@ +/** + * Plugin to strip the "node:" prefix from module requests. + * Equivalent to webpack's StripNodePrefixPlugin. + */ +export const stripNodePrefixPlugin = () => ({ + name: "strip-node-prefix", + setup(build) { + build.onResolve({ filter: /^node:/ }, (args) => { + return { + path: args.path.replace(/^node:/, ""), + external: true, + }; + }); + }, +}); diff --git a/scripts/esbuild/build/prepareOutDir.mjs b/scripts/esbuild/build/prepareOutDir.mjs new file mode 100644 index 000000000..5f9302d31 --- /dev/null +++ b/scripts/esbuild/build/prepareOutDir.mjs @@ -0,0 +1,9 @@ +import { existsSync, mkdirSync, rmSync } from "node:fs"; + +export default function prepareOutDir(dir) { + if (existsSync(dir)) { + rmSync(dir, { recursive: true, force: true }); + } + + mkdirSync(dir, { recursive: true }); +} diff --git a/scripts/esbuild/build/reportSize.mjs b/scripts/esbuild/build/reportSize.mjs new file mode 100644 index 000000000..b806f6b3f --- /dev/null +++ b/scripts/esbuild/build/reportSize.mjs @@ -0,0 +1,16 @@ +import { readFileSync } from "node:fs"; +import { gzipSync } from "node:zlib"; + +export const formatSize = (bytes) => { + if (bytes < 1024) return `${bytes}b`; + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)}kb`; + return `${(bytes / (1024 * 1024)).toFixed(2)}mb`; +}; + +export const reportSize = (outfile) => { + const content = readFileSync(outfile); + const size = content.length; + const gzipSize = gzipSync(content).length; + + console.log(`\n${outfile}\n${formatSize(size)} (gzip: ${formatSize(gzipSize)})`); +}; diff --git a/scripts/esbuild/dev.mjs b/scripts/esbuild/dev.mjs new file mode 100644 index 000000000..c879185cf --- /dev/null +++ b/scripts/esbuild/dev.mjs @@ -0,0 +1,69 @@ +import { context } from "esbuild"; +import path from "node:path"; +import { postBuildPlugin } from "./build/plugins/postBuildPlugin.mjs"; +import { stripNodePrefixPlugin } from "./build/plugins/stripNodePrefixPlugin.mjs"; +import { ignoreModulesPlugin } from "./build/plugins/ignoreModulesPlugin.mjs"; +import { rebuildPlugin } from "./build/plugins/rebuildPlugin.mjs"; +import { externalNodeBuiltinsPlugin } from "./build/plugins/externalNodeBuiltinsPlugin.mjs"; +import { getEsbuildDevConfig, OUT_DIR, ROOT_DIR, WEB_IGNORE_MODULES } from "./build/constants.mjs"; +import { startServer } from "./build/httpServer.mjs"; +import prepareOutDir from 
"./build/prepareOutDir.mjs"; + +const startTime = performance.now(); + +prepareOutDir(OUT_DIR); + +console.log("\n=== BUILD ==="); +console.log("Building transformers.js with esbuild in watch mode..."); + +// Create build contexts for watch mode +const bundleContext = await context({ + ...getEsbuildDevConfig(ROOT_DIR), + outfile: path.join(OUT_DIR, "transformers.js"), + plugins: [ + ignoreModulesPlugin(WEB_IGNORE_MODULES), + stripNodePrefixPlugin(), + externalNodeBuiltinsPlugin(), + postBuildPlugin(OUT_DIR, ROOT_DIR), + rebuildPlugin("Bundle"), + ], +}); + +const webContext = await context({ + ...getEsbuildDevConfig(ROOT_DIR), + outfile: path.join(OUT_DIR, "transformers.web.js"), + external: ["onnxruntime-common", "onnxruntime-web"], + plugins: [ + ignoreModulesPlugin(WEB_IGNORE_MODULES), + stripNodePrefixPlugin(), + externalNodeBuiltinsPlugin(), + rebuildPlugin("Web"), + ], +}); + +console.log("\nInitial build starting..."); + +await Promise.all([bundleContext.watch(), webContext.watch()]); + +const endTime = performance.now(); +const duration = (endTime - startTime).toFixed(2); +console.log(`\nAll builds completed successfully in ${duration}ms!`); + +const PORT = 8080; + +console.log("\n=== SERVE ==="); +const server = await startServer(OUT_DIR, PORT); + +console.log(`\nServer running at http://localhost:${PORT}/`); +console.log(`Serving files from: ${OUT_DIR}`); + +console.log(`\nWatching for changes...\n`); + +// Keep process alive and cleanup +process.on("SIGINT", async () => { + console.log("\n\nStopping watch mode and server..."); + server.close(); + await bundleContext.dispose(); + await webContext.dispose(); + process.exit(0); +}); diff --git a/src/generation/logits_process.js b/src/generation/logits_process.js index 3bdff2a2f..ecd99b5e8 100644 --- a/src/generation/logits_process.js +++ b/src/generation/logits_process.js @@ -239,7 +239,7 @@ export class SuppressTokensAtBeginLogitsProcessor extends LogitsProcessor { export class WhisperTimeStampLogitsProcessor extends LogitsProcessor { /** * Constructs a new WhisperTimeStampLogitsProcessor. - * @param {import('../models/whisper/generation_whisper.js').WhisperGenerationConfig} generate_config The config object passed to the `generate()` method of a transformer model. + * @param {import('../models/model-processors/whisper/generation_whisper.js').WhisperGenerationConfig} generate_config The config object passed to the `generate()` method of a transformer model. * @param {number[]} init_tokens The initial tokens of the input sequence. 
*/ constructor(generate_config, init_tokens) { diff --git a/src/models.js b/src/models.js index 377bd7296..806f1f29a 100644 --- a/src/models.js +++ b/src/models.js @@ -37,8063 +37,46 @@ * @module models */ -import { AutoConfig, getCacheShapes } from './configs.js'; +import { AutoConfig } from './configs.js'; +import { PreTrainedModel } from './models/pre-trained-model.js'; -import { - deviceToExecutionProviders, - createInferenceSession, - isONNXTensor, - isONNXProxy, - runInferenceSession, -} from './backends/onnx.js'; -import { - DATA_TYPES, - DEFAULT_DEVICE_DTYPE_MAPPING, - DEFAULT_DTYPE_SUFFIX_MAPPING, - isWebGpuFp16Supported, -} from './utils/dtypes.js'; - -import { Callable } from './utils/generic.js'; - -import { mergeArrays, pick } from './utils/core.js'; - -import { getModelFile, getModelJSON, MAX_EXTERNAL_DATA_CHUNKS } from './utils/hub.js'; - -import { GITHUB_ISSUE_URL } from './utils/constants.js'; - -import { - LogitsProcessorList, - ForcedBOSTokenLogitsProcessor, - ForcedEOSTokenLogitsProcessor, - SuppressTokensAtBeginLogitsProcessor, - WhisperTimeStampLogitsProcessor, - NoRepeatNGramLogitsProcessor, - RepetitionPenaltyLogitsProcessor, - NoBadWordsLogitsProcessor, - MinLengthLogitsProcessor, - MinNewTokensLengthLogitsProcessor, - TemperatureLogitsWarper, - ClassifierFreeGuidanceLogitsProcessor, -} from './generation/logits_process.js'; - -import { GenerationConfig } from './generation/configuration_utils.js'; - -import { - cat, - mean, - zeros, - zeros_like, - ones, - ones_like, - full, - full_like, - stack, - std_mean, - Tensor, - DataTypeMap, - randn, -} from './utils/tensor.js'; -import { RawImage } from './utils/image.js'; - -import { dynamic_time_warping, max, medianFilter } from './utils/maths.js'; -import { EosTokenCriteria, MaxLengthCriteria, StoppingCriteriaList } from './generation/stopping_criteria.js'; -import { LogitsSampler } from './generation/logits_sampler.js'; -import { apis, env } from './env.js'; - -import { WhisperGenerationConfig } from './models/whisper/generation_whisper.js'; -import { whisper_language_to_code } from './models/whisper/common_whisper.js'; - -////////////////////////////////////////////////// -// Model types: used internally -const MODEL_TYPES = { - EncoderOnly: 0, - EncoderDecoder: 1, - Seq2Seq: 2, - Vision2Seq: 3, - DecoderOnly: 4, - MaskGeneration: 5, - ImageTextToText: 6, - Musicgen: 7, - MultiModality: 8, - Phi3V: 9, - AudioTextToText: 10, - AutoEncoder: 11, - ImageAudioTextToText: 12, - Supertonic: 13, - Chatterbox: 14, -}; -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Helper functions - -// NOTE: These will be populated fully later -const MODEL_TYPE_MAPPING = new Map(); -const MODEL_NAME_TO_CLASS_MAPPING = new Map(); -const MODEL_CLASS_TO_NAME_MAPPING = new Map(); - -/** - * Constructs an InferenceSession using a model file located at the specified path. - * @param {string} pretrained_model_name_or_path The path to the directory containing the model file. - * @param {string} fileName The name of the model file. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. - * @param {boolean} [is_decoder=false] Whether the model is a decoder model. - * @returns {Promise<{buffer_or_path: Uint8Array|string, session_options: Object, session_config: Object}>} A Promise that resolves to the data needed to create an InferenceSession object. 
- * @private - */ -async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) { - let custom_config = options.config?.['transformers.js_config'] ?? {}; - - let device = options.device ?? custom_config.device; - if (device && typeof device !== 'string') { - if (device.hasOwnProperty(fileName)) { - device = device[fileName]; - } else { - console.warn(`device not specified for "${fileName}". Using the default device.`); - device = null; - } - } - - // If the device is not specified, we use the default (supported) execution providers. - const selectedDevice = /** @type {import("./utils/devices.js").DeviceType} */ ( - device ?? (apis.IS_NODE_ENV ? 'cpu' : 'wasm') - ); - - const executionProviders = deviceToExecutionProviders(selectedDevice); - - // Update custom config with the selected device's config, if it exists - const device_config = custom_config.device_config ?? {}; - if (device_config.hasOwnProperty(selectedDevice)) { - custom_config = { - ...custom_config, - ...device_config[selectedDevice], - }; - } - - // If options.dtype is specified, we use it to choose the suffix for the model file. - // Otherwise, we use the default dtype for the device. - let dtype = options.dtype ?? custom_config.dtype; - if (typeof dtype !== 'string') { - if (dtype && dtype.hasOwnProperty(fileName)) { - dtype = dtype[fileName]; - } else { - dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32; - console.warn( - `dtype not specified for "${fileName}". Using the default dtype (${dtype}) for this device (${selectedDevice}).`, - ); - } - } - - if (dtype === DATA_TYPES.auto) { - // Try to choose the auto dtype based on the custom config - let config_dtype = custom_config.dtype; - if (typeof config_dtype !== 'string') { - config_dtype = config_dtype?.[fileName]; - } - - if (config_dtype && config_dtype !== DATA_TYPES.auto && DATA_TYPES.hasOwnProperty(config_dtype)) { - // Defined by the config, and is not "auto" - dtype = config_dtype; - } else { - // Choose default dtype based on device, falling back to fp32 - dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32; - } - } - - const selectedDtype = /** @type {import("./utils/dtypes.js").DataType} */ (dtype); - - if (!DEFAULT_DTYPE_SUFFIX_MAPPING.hasOwnProperty(selectedDtype)) { - throw new Error(`Invalid dtype: ${selectedDtype}. Should be one of: ${Object.keys(DATA_TYPES).join(', ')}`); - } else if ( - selectedDevice === 'webgpu' && - // NOTE: Currently, we assume that the Native WebGPU EP always supports fp16. In future, we will add a check for this. - !apis.IS_NODE_ENV && - selectedDtype === DATA_TYPES.fp16 && - !(await isWebGpuFp16Supported()) - ) { - throw new Error(`The device (${selectedDevice}) does not support fp16.`); - } - - // Only valid for models with a decoder - const kv_cache_dtype_config = custom_config.kv_cache_dtype; - const kv_cache_dtype = kv_cache_dtype_config - ? typeof kv_cache_dtype_config === 'string' - ? kv_cache_dtype_config - : (kv_cache_dtype_config[selectedDtype] ?? 'float32') - : undefined; - - if (kv_cache_dtype && !['float32', 'float16'].includes(kv_cache_dtype)) { - throw new Error(`Invalid kv_cache_dtype: ${kv_cache_dtype}. Should be one of: float32, float16`); - } - - const session_config = { - dtype: selectedDtype, - kv_cache_dtype, - device: selectedDevice, - }; - - // Construct the model file name - const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[selectedDtype]; - const baseName = `${fileName}${suffix}.onnx`; - const modelFileName = `${options.subfolder ?? 
''}/${baseName}`; - - const session_options = { ...options.session_options }; - - // Overwrite `executionProviders` if not specified - session_options.executionProviders ??= executionProviders; - - // Overwrite `freeDimensionOverrides` if specified in config and not set in session options - const free_dimension_overrides = custom_config.free_dimension_overrides; - if (free_dimension_overrides) { - session_options.freeDimensionOverrides ??= free_dimension_overrides; - } else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) { - console.warn( - `WebNN does not currently support dynamic shapes and requires 'free_dimension_overrides' to be set in config.json, preferably as a field within config["transformers.js_config"]["device_config"]["${selectedDevice}"]. ` + - `When 'free_dimension_overrides' is not set, you may experience significant performance degradation.`, - ); - } - - const return_path = apis.IS_NODE_ENV && env.useFSCache; - const bufferOrPathPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options, return_path); - - // Handle onnx external data files - const use_external_data_format = options.use_external_data_format ?? custom_config.use_external_data_format; - /** @type {Promise[]} */ - let externalDataPromises = []; - if (use_external_data_format) { - let external_data_format; - if (typeof use_external_data_format === 'object') { - if (use_external_data_format.hasOwnProperty(baseName)) { - external_data_format = use_external_data_format[baseName]; - } else if (use_external_data_format.hasOwnProperty(fileName)) { - external_data_format = use_external_data_format[fileName]; - } else { - external_data_format = false; - } - } else { - external_data_format = use_external_data_format; - } - - const num_chunks = +external_data_format; // (false=0, true=1, number remains the same) - if (num_chunks > MAX_EXTERNAL_DATA_CHUNKS) { - throw new Error( - `The number of external data chunks (${num_chunks}) exceeds the maximum allowed value (${MAX_EXTERNAL_DATA_CHUNKS}).`, - ); - } - for (let i = 0; i < num_chunks; ++i) { - const path = `${baseName}_data${i === 0 ? '' : '_' + i}`; - const fullPath = `${options.subfolder ?? ''}/${path}`; - externalDataPromises.push( - new Promise(async (resolve, reject) => { - const data = await getModelFile( - pretrained_model_name_or_path, - fullPath, - true, - options, - return_path, - ); - resolve(data instanceof Uint8Array ? 
{ path, data } : path); - }), - ); - } - } else if (session_options.externalData !== undefined) { - externalDataPromises = session_options.externalData.map(async (ext) => { - // if the external data is a string, fetch the file and replace the string with its content - // @ts-expect-error TS2339 - if (typeof ext.data === 'string') { - // @ts-expect-error TS2339 - const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options); - // @ts-expect-error TS2698 - return { ...ext, data: ext_buffer }; - } - return ext; - }); - } - - if (externalDataPromises.length > 0) { - const externalData = await Promise.all(externalDataPromises); - if (!apis.IS_NODE_ENV) { - session_options.externalData = externalData; - } - } - - if (is_decoder && selectedDevice === 'webgpu' && kv_cache_dtype_config !== false) { - const shapes = getCacheShapes(options.config, { - prefix: 'present', - }); - if (Object.keys(shapes).length > 0 && !isONNXProxy()) { - // Only set preferredOutputLocation if shapes are present and we aren't proxying ONNX - /** @type {Record} */ - const preferredOutputLocation = {}; - for (const key in shapes) { - preferredOutputLocation[key] = 'gpu-buffer'; - } - session_options.preferredOutputLocation = preferredOutputLocation; - } - } - - const buffer_or_path = await bufferOrPathPromise; - - return { buffer_or_path, session_options, session_config }; -} - -/** - * Helper function to create multiple InferenceSession objects. - * - * @param {string} pretrained_model_name_or_path The path to the directory containing the model file. - * @param {Record} names The names of the model files to load. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. - * @param {string} [decoder_name] The name of the decoder model, if any. - * @returns {Promise>} A Promise that resolves to a dictionary of InferenceSession objects. - * @private - */ -async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = undefined) { - return Object.fromEntries( - await Promise.all( - Object.keys(names).map(async (name) => { - const { buffer_or_path, session_options, session_config } = await getSession( - pretrained_model_name_or_path, - names[name], - options, - name === decoder_name, - ); - const session = await createInferenceSession(buffer_or_path, session_options, session_config); - return [name, session]; - }), - ), - ); -} - -/** - * Helper function to load multiple optional configuration files - * @param {string} pretrained_model_name_or_path The path to the directory containing the config file. - * @param {Record} names The names of the config files to load. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the configs. - * @returns {Promise>} A Promise that resolves to a dictionary of configuration objects. - * @private - */ -async function getOptionalConfigs(pretrained_model_name_or_path, names, options) { - return Object.fromEntries( - await Promise.all( - Object.keys(names).map(async (name) => { - const config = await getModelJSON(pretrained_model_name_or_path, names[name], false, options); - return [name, config]; - }), - ), - ); -} - -/** - * Validate model inputs - * @param {Object} session The InferenceSession object that will be run. - * @param {Object} inputs The inputs to check. - * @returns {Record} The checked inputs. - * @throws {Error} If any inputs are missing. 
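// A minimal standalone sketch of the file-name and external-data naming logic above.
// The '_q4' suffix below is a hypothetical example; the real value comes from
// DEFAULT_DTYPE_SUFFIX_MAPPING, which is not shown in this hunk.
function resolveModelFiles(fileName, suffix, use_external_data_format) {
  const baseName = `${fileName}${suffix}.onnx`;
  // Same `+x` coercion as above: false -> 0 chunks, true -> 1 chunk, a number stays as-is.
  const num_chunks = +use_external_data_format;
  const chunks = [];
  for (let i = 0; i < num_chunks; ++i) {
    chunks.push(`${baseName}_data${i === 0 ? '' : '_' + i}`);
  }
  return { baseName, chunks };
}
// Example: a merged decoder exported with a hypothetical 4-bit suffix and 2 external data chunks.
console.log(resolveModelFiles('decoder_model_merged', '_q4', 2));
// -> { baseName: 'decoder_model_merged_q4.onnx',
//      chunks: ['decoder_model_merged_q4.onnx_data', 'decoder_model_merged_q4.onnx_data_1'] }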
- * @private - */ -function validateInputs(session, inputs) { - /** - * NOTE: Create either a shallow or deep copy based on `onnx.wasm.proxy` - * @type {Record} - */ - const checkedInputs = Object.create(null); - const missingInputs = []; - for (const inputName of session.inputNames) { - const tensor = inputs[inputName]; - // Rare case where one of the model's input names corresponds to a built-in - // object name (e.g., toString), which would cause a simple (!tensor) check to fail, - // because it's not undefined but a function. - if (!(tensor instanceof Tensor)) { - missingInputs.push(inputName); - continue; - } - // NOTE: When `env.wasm.proxy is true` the tensor is moved across the Worker - // boundary, transferring ownership to the worker and invalidating the tensor. - // So, in this case, we simply sacrifice a clone for it. - checkedInputs[inputName] = isONNXProxy() ? tensor.clone() : tensor; - } - if (missingInputs.length > 0) { - throw new Error( - `An error occurred during model execution: "Missing the following inputs: ${missingInputs.join(', ')}.`, - ); - } - - const numInputsProvided = Object.keys(inputs).length; - const numInputsNeeded = session.inputNames.length; - if (numInputsProvided > numInputsNeeded) { - // No missing inputs, but too many inputs were provided. - // Warn the user and ignore the extra inputs. - let ignored = Object.keys(inputs).filter((inputName) => !session.inputNames.includes(inputName)); - console.warn( - `WARNING: Too many inputs were provided (${numInputsProvided} > ${numInputsNeeded}). The following inputs will be ignored: "${ignored.join(', ')}".`, - ); - } - - return checkedInputs; -} - -/** - * Executes an InferenceSession using the specified inputs. - * NOTE: `inputs` must contain at least the input names of the model. - * - If additional inputs are passed, they will be ignored. - * - If inputs are missing, an error will be thrown. - * - * @param {Object} session The InferenceSession object to run. - * @param {Object} inputs An object that maps input names to input tensors. - * @returns {Promise} A Promise that resolves to an object that maps output names to output tensors. - * @private - */ -async function sessionRun(session, inputs) { - const checkedInputs = validateInputs(session, inputs); - try { - // pass the original ort tensor - const ortFeed = Object.fromEntries(Object.entries(checkedInputs).map(([k, v]) => [k, v.ort_tensor])); - const output = await runInferenceSession(session, ortFeed); - return replaceTensors(output); - } catch (e) { - // Error messages can be long (nested) and uninformative. For this reason, - // we apply minor formatting to show the most important information - const formatted = Object.fromEntries( - Object.entries(checkedInputs).map(([k, tensor]) => { - // Extract these properties from the underlying ORT tensor - const unpacked = { - type: tensor.type, - dims: tensor.dims, - location: tensor.location, - }; - if (unpacked.location !== 'gpu-buffer') { - // Only return the data if it's not a GPU buffer - unpacked.data = tensor.data; - } - return [k, unpacked]; - }), - ); - - // This usually occurs when the inputs are of the wrong type. - console.error(`An error occurred during model execution: "${e}".`); - console.error('Inputs given to model:', formatted); - throw e; - } -} - -/** - * Replaces ONNX Tensor objects with custom Tensor objects to support additional functions. - * @param {Object} obj The object to replace tensor objects in. 
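// A minimal sketch of the pitfall noted in validateInputs above: when a model input is named
// after a built-in object property (e.g. 'toString'), a plain `!inputs[name]` check misses it,
// because the lookup finds the inherited function instead of `undefined`.
const exampleInputs = {}; // caller provided no tensors at all
console.log(!exampleInputs['input_ids']); // true  -> correctly detected as missing
console.log(!exampleInputs['toString']);  // false -> looks "present", but it is Object.prototype.toString
// Hence the `instanceof Tensor` check above, which rejects anything that is not a real tensor.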
- * @returns {Object} The object with tensor objects replaced by custom Tensor objects. - * @private - */ -function replaceTensors(obj) { - for (let prop in obj) { - if (isONNXTensor(obj[prop])) { - obj[prop] = new Tensor(obj[prop]); - } else if (typeof obj[prop] === 'object') { - replaceTensors(obj[prop]); - } - } - return obj; -} - -/** - * Converts an array or Tensor of integers to an int64 Tensor. - * @param {any[]|Tensor} items The input integers to be converted. - * @returns {Tensor} The int64 Tensor with the converted values. - * @throws {Error} If the input array is empty or the input is a batched Tensor and not all sequences have the same length. - * @private - */ -function toI64Tensor(items) { - if (items instanceof Tensor) { - return items; - } - // items is an array - if (items.length === 0) { - throw Error('items must be non-empty'); - } - - if (Array.isArray(items[0])) { - // batched - if (items.some((x) => x.length !== items[0].length)) { - throw Error( - "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.", - ); - } - - return new Tensor('int64', BigInt64Array.from(items.flat().map((x) => BigInt(x))), [ - items.length, - items[0].length, - ]); - } else { - //flat - return new Tensor('int64', BigInt64Array.from(items.map((x) => BigInt(x))), [1, items.length]); - } -} - -/** - * Creates a boolean tensor with a single value. - * @param {boolean} value The value of the tensor. - * @returns {Tensor} The boolean tensor. - * @private - */ -function boolTensor(value) { - return new Tensor('bool', [value], [1]); -} - -// JS doesn't support mixins, so we define some reused functions here, and allow "this" to be passed in -/** - * Perform forward pass on the seq2seq model (both encoder and decoder). - * @param {Object} self The seq2seq model object. - * @param {Object} model_inputs The input object for the model containing encoder and decoder inputs. - * @returns {Promise} Promise that resolves with the output of the seq2seq model. - * @private - */ -async function seq2seqForward(self, model_inputs) { - let { encoder_outputs, input_ids, decoder_input_ids, ...other_decoder_inputs } = model_inputs; - // Encode if needed - if (!encoder_outputs) { - const encoder_inputs = pick(model_inputs, self.sessions['model'].inputNames); - // Encoder outputs are not given, so we must compute them. - encoder_outputs = (await encoderForward(self, encoder_inputs)).last_hidden_state; - } - - other_decoder_inputs.input_ids = decoder_input_ids; - other_decoder_inputs.encoder_hidden_states = encoder_outputs; - - if (self.sessions['decoder_model_merged'].inputNames.includes('encoder_attention_mask')) { - other_decoder_inputs.encoder_attention_mask = model_inputs.attention_mask; - } - - const decoderResults = await decoderForward(self, other_decoder_inputs, true); - - return decoderResults; -} - -/** - * Forward pass of an encoder model. - * @param {Object} self The encoder model. - * @param {Object} model_inputs The input data to be used for the forward pass. - * @returns {Promise} The model's outputs. 
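// A minimal standalone sketch of the shape and typing rules toI64Tensor enforces above,
// using a plain BigInt64Array instead of the library's Tensor wrapper:
function toInt64Data(items) {
  if (items.length === 0) throw new Error('items must be non-empty');
  if (Array.isArray(items[0])) {
    // Batched input: every row must have the same length, otherwise padding/truncation is needed.
    if (items.some((x) => x.length !== items[0].length)) {
      throw new Error('All sequences must have the same length');
    }
    return { data: BigInt64Array.from(items.flat().map(BigInt)), dims: [items.length, items[0].length] };
  }
  // Flat input gets an implicit batch dimension of 1.
  return { data: BigInt64Array.from(items.map(BigInt)), dims: [1, items.length] };
}
console.log(toInt64Data([1, 2, 3]).dims);        // -> [1, 3]
console.log(toInt64Data([[1, 2], [3, 4]]).dims); // -> [2, 2]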
- * @private - */ -async function encoderForward(self, model_inputs) { - const session = self.sessions['model']; - const encoderFeeds = pick(model_inputs, session.inputNames); - - if (session.inputNames.includes('inputs_embeds') && !encoderFeeds.inputs_embeds) { - if (!model_inputs.input_ids) { - throw new Error('Both `input_ids` and `inputs_embeds` are missing in the model inputs.'); - } - encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids }); - } - if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) { - if (!encoderFeeds.input_ids) { - throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.'); - } - // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it, - // but they weren't created by the tokenizer. - encoderFeeds.token_type_ids = zeros_like(encoderFeeds.input_ids); - } - if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) { - if (!encoderFeeds.pixel_values) { - throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.'); - } - // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it, - // but they weren't created by the processor. - const dims = encoderFeeds.pixel_values.dims; - encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]); - } - - return await sessionRun(session, encoderFeeds); -} - -async function autoEncoderForward(self, model_inputs) { - const encoded = await self.encode(model_inputs); - const decoded = await self.decode(encoded); - return decoded; -} - -/** - * Forward pass of a decoder model. - * @param {Object} self The decoder model. - * @param {Object} model_inputs The input data to be used for the forward pass. - * @returns {Promise} The logits and past key values. - * @private - */ -async function decoderForward(self, model_inputs, is_encoder_decoder = false) { - const session = self.sessions[is_encoder_decoder ? 'decoder_model_merged' : 'model']; - - const { past_key_values, ...new_model_inputs } = model_inputs; - - if (session.inputNames.includes('use_cache_branch')) { - new_model_inputs.use_cache_branch = boolTensor(!!past_key_values); - } - if ( - session.inputNames.includes('position_ids') && - new_model_inputs.attention_mask && - !new_model_inputs.position_ids - ) { - // NOTE: Handle a special case for paligemma/gemma3 models, where positions are 1-indexed - const start_index = ['paligemma', 'gemma3_text', 'gemma3'].includes(self.config.model_type) ? 
1 : 0; - new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index); - } - - // Unpack the `past_key_values` object into model inputs - self.addPastKeyValues(new_model_inputs, past_key_values); - - // Select only the inputs that are needed for the current session - const fixed = pick(new_model_inputs, session.inputNames); - return await sessionRun(session, fixed); -} - -function default_merge_input_ids_with_features({ - modality_token_id, - inputs_embeds, - modality_features, - input_ids, - attention_mask, -}) { - const token_positions = input_ids.tolist().map((ids) => - ids.reduce((acc, x, idx) => { - if (x == modality_token_id) acc.push(idx); - return acc; - }, []), - ); - const n_tokens = token_positions.reduce((acc, x) => acc + x.length, 0); - const n_features = modality_features.dims[0]; - if (n_tokens !== n_features) { - throw new Error(`Number of tokens and features do not match: tokens: ${n_tokens}, features ${n_features}`); - } - - // Equivalent to performing a masked_scatter - let img = 0; - for (let i = 0; i < token_positions.length; ++i) { - const tokens = token_positions[i]; - const embeds = inputs_embeds[i]; - for (let j = 0; j < tokens.length; ++j) { - embeds[tokens[j]].data.set(modality_features[img++].data); - } - } - return { inputs_embeds, attention_mask }; -} - -function default_merge_input_ids_with_image_features({ - image_token_id, - inputs_embeds, - image_features, - input_ids, - attention_mask, -}) { - return default_merge_input_ids_with_features({ - modality_token_id: image_token_id, - inputs_embeds, - modality_features: image_features, - input_ids, - attention_mask, - }); -} - -function default_merge_input_ids_with_audio_features({ - audio_token_id, - inputs_embeds, - audio_features, - input_ids, - attention_mask, -}) { - return default_merge_input_ids_with_features({ - modality_token_id: audio_token_id, - inputs_embeds, - modality_features: audio_features, - input_ids, - attention_mask, - }); -} - -/** - * Abstract forward pass function for image-text-to-text or audio-text-to-text models. - * @param {Object} self The model object. - * @param {Object} params Additional parameters. - * @param {Function} [params.encode_function] The function to encode the modality values. - * @param {Function} [params.merge_function] The function to merge the modality features with the input embeddings. - * @param {string} [params.modality_input_name] The modality input name. - * @param {string} [params.modality_output_name] The modality output name. - * @param {Tensor} [params.input_ids=null] - * @param {Tensor} [params.attention_mask=null] - * @param {Tensor} [params.position_ids=null] - * @param {Tensor} [params.inputs_embeds=null] - * @param {Tensor} [params.past_key_values=null] - * @param {Object} [params.generation_config=null] - * @param {Object} [params.logits_processor=null] - * @returns {Promise} The model's output tensor - * @private - */ -async function genericTextToTextForward( - self, - { - // Generic parameters: - encode_function, - merge_function, - modality_input_name, - modality_output_name, - - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // Additional parameters - ...kwargs - }, -) { - const modality_values = kwargs[modality_input_name]; - if (!inputs_embeds) { - // 1. 
Extract the text embeddings. - inputs_embeds = await self.encode_text({ input_ids, ...kwargs }); - - // 2. Possibly, merge text and modality values - if (modality_values && input_ids.dims[1] !== 1) { - const modality_features = await encode_function({ - // Pass the modality values under its expected key. - // The caller knows whether this is audio or image. - [modality_input_name]: modality_values, - ...kwargs, - }); - ({ inputs_embeds, attention_mask } = merge_function({ - [modality_output_name]: modality_features, - inputs_embeds, - input_ids, - attention_mask, - })); - } else if (past_key_values && modality_values && input_ids.dims[1] === 1) { - // This branch handles the cache case. - const target_length = input_ids.dims[1]; // always 1 - const past_length = Object.values(past_key_values)[0].dims.at(-2); - - attention_mask = cat( - [ - ones([input_ids.dims[0], past_length]), - attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]]), - ], - 1, - ); - } - } - - if (!position_ids) { - if (self.config.model_type === 'qwen2_vl') { - // Special case for qwen2_vl models - // @ts-ignore - const { image_grid_thw, video_grid_thw } = kwargs; - [position_ids] = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask); - } - } - - // 3. Call the decoder forward using the updated inputs. - const outputs = await decoderForward( - self, - { - inputs_embeds, - past_key_values, - attention_mask, - position_ids, - generation_config, - logits_processor, - }, - true, - ); - return outputs; -} - -/** - * Forward pass of an audio-text-to-text model. - * @param {Object} self The audio-text-to-text model. - * @param {Object} params The inputs for the audio-text-to-text forward pass. - * @returns {Promise} The model's output tensor. - * @private - */ -async function audioTextToTextForward(self, params) { - return await genericTextToTextForward(self, { - ...params, - modality_input_name: 'audio_values', - modality_output_name: 'audio_features', - encode_function: self.encode_audio.bind(self), - merge_function: self._merge_input_ids_with_audio_features.bind(self), - }); -} - -/** - * Forward pass of an image-text-to-text model. - * @param {Object} self The image-text-to-text model. - * @param {Object} params The inputs for the image-text-to-text forward pass. - * @returns {Promise} The model's output tensor. 
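// A minimal standalone sketch of the feature merge used by the helpers above: embeddings at
// the positions of the modality placeholder token are overwritten, in order, by the encoded
// modality features. Plain arrays stand in for the library's Tensor objects, and the token id
// 32000 is just an example value.
function mergeFeatures(inputIds, inputsEmbeds, features, modalityTokenId) {
  let f = 0;
  for (let i = 0; i < inputIds.length; ++i) {
    for (let j = 0; j < inputIds[i].length; ++j) {
      if (inputIds[i][j] === modalityTokenId) {
        inputsEmbeds[i][j] = features[f++]; // masked_scatter equivalent
      }
    }
  }
  if (f !== features.length) throw new Error(`tokens (${f}) and features (${features.length}) do not match`);
  return inputsEmbeds;
}
// One image placeholder followed by two text tokens:
console.log(mergeFeatures([[32000, 5, 7]], [[[0, 0], [1, 1], [2, 2]]], [[9, 9]], 32000));
// -> [[[9, 9], [1, 1], [2, 2]]]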
- * @private - */ -async function imageTextToTextForward(self, params) { - return await genericTextToTextForward(self, { - ...params, - modality_input_name: 'pixel_values', - modality_output_name: 'image_features', - encode_function: self.encode_image.bind(self), - merge_function: self._merge_input_ids_with_image_features.bind(self), - }); -} - -/** - * Helper function to perform the following: - * ```python - * x = attention_mask.long().cumsum(-1) - 1 - * x.masked_fill_(attention_mask == 0, 1) - * ``` - * @param {Tensor} attention_mask - * @returns {{data: BigInt64Array, dims: number[]}} - */ -function cumsum_masked_fill(attention_mask, start_index = 0) { - const [bz, seq_len] = attention_mask.dims; - const attn_mask_data = attention_mask.data; - - const data = new BigInt64Array(attn_mask_data.length); - for (let i = 0; i < bz; ++i) { - const start = i * seq_len; - let sum = BigInt(start_index); - for (let j = 0; j < seq_len; ++j) { - const index = start + j; - if (attn_mask_data[index] === 0n) { - data[index] = BigInt(1); - } else { - // === 1n - data[index] = sum; - sum += attn_mask_data[index]; - } - } - } - return { data, dims: attention_mask.dims }; -} - -/** - * If the model supports providing position_ids, we create position_ids on the fly for batch generation, - * by computing the cumulative sum of the attention mask along the sequence length dimension. - * - * Equivalent to: - * ```python - * position_ids = attention_mask.long().cumsum(-1) - 1 - * position_ids.masked_fill_(attention_mask == 0, 1) - * if past_key_values: - * position_ids = position_ids[:, -input_ids.shape[1] :] - * ``` - */ -function createPositionIds(model_inputs, past_key_values = null, start_index = 0) { - const { input_ids, inputs_embeds, attention_mask } = model_inputs; - - const { data, dims } = cumsum_masked_fill(attention_mask, start_index); - let position_ids = new Tensor('int64', data, dims); - if (past_key_values) { - const offset = -(input_ids ?? inputs_embeds).dims.at(1); - position_ids = position_ids.slice(null, [offset, null]); - } - return position_ids; -} - -function decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - const past_length = model_inputs.past_key_values ? Object.values(model_inputs.past_key_values)[0].dims.at(-2) : 0; - - if (!model_inputs.attention_mask) { - // If the attention mask is not provided, we attempt to infer based on provided inputs - let dims; - for (const key of ['input_ids', 'inputs_embeds', 'position_ids']) { - if (model_inputs[key]) { - dims = model_inputs[key].dims; - break; - } - } - if (!dims) { - throw new Error('attention_mask is not provided, and unable to infer its shape from model inputs.'); - } - model_inputs.attention_mask = ones([dims[0], past_length + dims[1]]); - } - - if (model_inputs.past_key_values) { - const { input_ids, attention_mask } = model_inputs; - - // Keep only the unprocessed tokens: - // 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - // some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - // input) - if (attention_mask && attention_mask.dims[1] > input_ids.dims[1]) { - // NOTE: not needed since we only pass the generated tokens to the next forward pass - // const offset = -(attention_mask.dims[1] - past_length); - // model_inputs.input_ids = input_ids.slice(null, [offset, null]); - } - // 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
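// A minimal standalone sketch of the cumsum-with-masked-fill above, applied to a left-padded
// batch (plain numbers instead of BigInt64Array for readability):
function positionIds(attentionMask, startIndex = 0) {
  return attentionMask.map((row) => {
    let sum = startIndex;
    return row.map((m) => (m === 0 ? 1 : sum++));
  });
}
// Two sequences, the second left-padded by two tokens:
console.log(positionIds([
  [1, 1, 1, 1],
  [0, 0, 1, 1],
]));
// -> [[0, 1, 2, 3],
//     [1, 1, 0, 1]]  (padded positions are filled with 1, real tokens count from start_index)
// For the 1-indexed models handled above (paligemma/gemma3), startIndex = 1 would yield [1, 2, 3, 4].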
- // We can discard input_ids based on the past_length. - else if (past_length < input_ids.dims[1]) { - // NOTE: Required for phi models. - // See https://github.com/huggingface/transformers/issues/30809#issuecomment-2111918479 for more information. - model_inputs.input_ids = input_ids.slice(null, [past_length, null]); - } - // 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - else { - } - } - - return model_inputs; -} - -function encoder_decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - if (model_inputs.past_key_values) { - input_ids = input_ids.map((x) => [x.at(-1)]); - } - - return { - ...model_inputs, - decoder_input_ids: toI64Tensor(input_ids), - }; -} - -function multimodal_text_to_text_prepare_inputs_for_generation(self, ...args) { - if (self.config.is_encoder_decoder) { - return encoder_decoder_prepare_inputs_for_generation(self, ...args); - } else { - return decoder_prepare_inputs_for_generation(self, ...args); - } -} - -function multimodality_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - const has_past_key_values = !!model_inputs.past_key_values; - - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - if (has_past_key_values) { - model_inputs.input_ids = cat([model_inputs.input_ids, model_inputs.input_ids], 0); - // NOTE: attention_mask handled in generation - } else { - model_inputs.input_ids = cat( - [model_inputs.input_ids, full_like(model_inputs.input_ids, BigInt(generation_config.pad_token_id))], - 0, - ); - model_inputs.attention_mask = cat( - [model_inputs.attention_mask, full_like(model_inputs.attention_mask, 0n)], - 0, - ); - } - } - - if (has_past_key_values || !model_inputs.pixel_values) { - model_inputs.pixel_values = full([0, 0, 3, 384, 384], 1.0); - } - - if (has_past_key_values) { - const num_img_tokens = 0; - const num_text_tokens = 1; - const has_image = num_img_tokens > 0 ? 1 : 0; - - const batch_size = 1; - model_inputs.images_seq_mask = new Tensor( - 'bool', - new Array(num_img_tokens + num_text_tokens).fill(true).fill(false, 0, num_text_tokens), - [batch_size, num_img_tokens + num_text_tokens], - ); - model_inputs.images_emb_mask = new Tensor('bool', new Array(num_img_tokens).fill(!!has_image), [ - batch_size, - 1, - num_img_tokens, - ]); - } - return model_inputs; -} - -function chatterbox_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { - if (!model_inputs.position_ids && self.sessions['embed_tokens'].inputNames.includes('position_ids')) { - // If position_ids are not provided, we create them on the fly using the position of the START_SPEECH_TOKEN - const START_SPEECH_TOKEN = 6561; - if (model_inputs.input_ids.dims[1] === 1) { - const position_ids = Array.from( - { - length: input_ids.length, - }, - (_, i) => input_ids[i].length - input_ids[i].findLastIndex((x) => x == START_SPEECH_TOKEN) - 1, - ); - model_inputs.position_ids = new Tensor('int64', position_ids, [input_ids.length, 1]); - } else { - const batched_input_ids = model_inputs.input_ids.tolist(); - const position_ids_list = batched_input_ids.map((ids) => { - let position = 0; - return ids.map((id) => (id >= START_SPEECH_TOKEN ? 
0 : position++)); - }); - model_inputs.position_ids = new Tensor('int64', position_ids_list.flat(), model_inputs.input_ids.dims); - } - } - if (model_inputs.input_ids.dims[1] === 1) { - // We are in generation mode and no longer need the audio inputs - delete model_inputs.audio_values; - delete model_inputs.audio_features; - delete model_inputs.audio_tokens; - delete model_inputs.speaker_embeddings; - delete model_inputs.speaker_features; - } - return decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config); -} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -/** - * A base class for pre-trained models that provides the model configuration and an ONNX session. - */ -export class PreTrainedModel extends Callable { - main_input_name = 'input_ids'; - forward_params = ['input_ids', 'attention_mask']; - - _return_dict_in_generate_keys = null; - /** - * Creates a new instance of the `PreTrainedModel` class. - * @param {import('./configs.js').PretrainedConfig} config The model configuration. - * @param {Record} sessions The inference sessions for the model. - * @param {Record} configs Additional configuration files (e.g., generation_config.json). - */ - constructor(config, sessions, configs) { - super(); - - this.config = config; - this.sessions = sessions; - this.configs = configs; - - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); - const modelType = MODEL_TYPE_MAPPING.get(modelName); - - this.can_generate = false; - this._forward = null; - - this._prepare_inputs_for_generation = null; - switch (modelType) { - case MODEL_TYPES.DecoderOnly: - this.can_generate = true; - this._forward = decoderForward; - this._prepare_inputs_for_generation = decoder_prepare_inputs_for_generation; - break; - case MODEL_TYPES.Seq2Seq: - case MODEL_TYPES.Vision2Seq: - case MODEL_TYPES.Musicgen: - this.can_generate = true; - - this._forward = seq2seqForward; - this._prepare_inputs_for_generation = encoder_decoder_prepare_inputs_for_generation; - break; - - case MODEL_TYPES.EncoderDecoder: - this._forward = seq2seqForward; - break; - case MODEL_TYPES.ImageTextToText: - this.can_generate = true; - this._forward = imageTextToTextForward; - this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation; - break; - case MODEL_TYPES.AudioTextToText: - this.can_generate = true; - this._forward = audioTextToTextForward; - this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation; - break; - case MODEL_TYPES.Phi3V: - case MODEL_TYPES.ImageAudioTextToText: - this.can_generate = true; - this._prepare_inputs_for_generation = multimodal_text_to_text_prepare_inputs_for_generation; - break; - case MODEL_TYPES.MultiModality: - this.can_generate = true; - this._prepare_inputs_for_generation = multimodality_prepare_inputs_for_generation; - break; - case MODEL_TYPES.AutoEncoder: - this._forward = autoEncoderForward; - break; - case MODEL_TYPES.Chatterbox: - this.can_generate = true; - this._prepare_inputs_for_generation = chatterbox_prepare_inputs_for_generation; - default: - // should be MODEL_TYPES.EncoderOnly - this._forward = encoderForward; - break; - } - - if (this.can_generate) { - this.forward_params.push('past_key_values'); - } - - /** @type {import('./configs.js').TransformersJSConfig} */ - this.custom_config = this.config['transformers.js_config'] ?? {}; - } - - /** - * Disposes of all the ONNX sessions that were created during inference. 
- * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. - * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry - */ - async dispose() { - const promises = []; - for (const session of Object.values(this.sessions)) { - promises.push(session.release?.()); - } - return await Promise.all(promises); - } - - /** - * Instantiate one of the model classes of the library from a pretrained model. - * - * The model class to instantiate is selected based on the `model_type` property of the config object - * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) - * - * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: - * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a - * user or organization name, like `dbmdz/bert-base-german-cased`. - * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. - * @param {import('./utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. - * - * @returns {Promise} A new instance of the `PreTrainedModel` class. - */ - static async from_pretrained( - pretrained_model_name_or_path, - { - progress_callback = null, - config = null, - cache_dir = null, - local_files_only = false, - revision = 'main', - model_file_name = null, - subfolder = 'onnx', - device = null, - dtype = null, - use_external_data_format = null, - session_options = {}, - } = {}, - ) { - let options = { - progress_callback, - config, - cache_dir, - local_files_only, - revision, - model_file_name, - subfolder, - device, - dtype, - use_external_data_format, - session_options, - }; - - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this); - const modelType = MODEL_TYPE_MAPPING.get(modelName); - - config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options); - - let info; - if (modelType === MODEL_TYPES.DecoderOnly) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: options.model_file_name ?? 
'model', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'encoder_model', - decoder_model_merged: 'decoder_model_merged', - }, - options, - 'decoder_model_merged', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.MaskGeneration) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'vision_encoder', - prompt_encoder_mask_decoder: 'prompt_encoder_mask_decoder', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.EncoderDecoder) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'encoder_model', - decoder_model_merged: 'decoder_model_merged', - }, - options, - 'decoder_model_merged', - ), - ]); - } else if (modelType === MODEL_TYPES.ImageTextToText) { - const sessions = { - embed_tokens: 'embed_tokens', - vision_encoder: 'vision_encoder', - decoder_model_merged: 'decoder_model_merged', - }; - if (config.is_encoder_decoder) { - sessions['model'] = 'encoder_model'; - } - info = await Promise.all([ - constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.AudioTextToText) { - const sessions = { - embed_tokens: 'embed_tokens', - audio_encoder: 'audio_encoder', - decoder_model_merged: 'decoder_model_merged', - }; - info = await Promise.all([ - constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.ImageAudioTextToText) { - const sessions = { - embed_tokens: 'embed_tokens', - audio_encoder: 'audio_encoder', - vision_encoder: 'vision_encoder', - decoder_model_merged: 'decoder_model_merged', - }; - info = await Promise.all([ - constructSessions(pretrained_model_name_or_path, sessions, options), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Musicgen) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: 'text_encoder', - decoder_model_merged: 'decoder_model_merged', - encodec_decode: 'encodec_decode', - }, - options, - 'decoder_model_merged', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.MultiModality) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - prepare_inputs_embeds: 'prepare_inputs_embeds', - model: 'language_model', - lm_head: 'lm_head', - gen_head: 'gen_head', - gen_img_embeds: 'gen_img_embeds', - image_decode: 'image_decode', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === 
MODEL_TYPES.Phi3V) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - prepare_inputs_embeds: 'prepare_inputs_embeds', - model: 'model', - vision_encoder: 'vision_encoder', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Chatterbox) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - embed_tokens: 'embed_tokens', - speech_encoder: 'speech_encoder', - model: 'language_model', - conditional_decoder: 'conditional_decoder', - }, - options, - 'model', - ), - getOptionalConfigs( - pretrained_model_name_or_path, - { - generation_config: 'generation_config.json', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.AutoEncoder) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - encoder_model: 'encoder_model', - decoder_model: 'decoder_model', - }, - options, - ), - ]); - } else if (modelType === MODEL_TYPES.Supertonic) { - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - text_encoder: 'text_encoder', - latent_denoiser: 'latent_denoiser', - voice_decoder: 'voice_decoder', - }, - options, - ), - ]); - } else { - // should be MODEL_TYPES.EncoderOnly - if (modelType !== MODEL_TYPES.EncoderOnly) { - const type = modelName ?? config?.model_type; - if (type !== 'custom') { - console.warn( - `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`, - ); - } - } - info = await Promise.all([ - constructSessions( - pretrained_model_name_or_path, - { - model: options.model_file_name ?? 'model', - }, - options, - ), - ]); - } - - // @ts-ignore - return new this(config, ...info); - } - - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Object containing input tensors - * @returns {Promise} Object containing output tensors - */ - async _call(model_inputs) { - return await this.forward(model_inputs); - } - - /** - * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method - * will be chosen based on the model type. - * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. - * @returns {Promise} The output data from the model in the format specified in the ONNX model. - * @throws {Error} This method must be implemented in subclasses. - */ - async forward(model_inputs) { - return await this._forward(this, model_inputs); - } - - /** - * Get the model's generation config, if it exists. - * @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`. - */ - get generation_config() { - return this.configs?.generation_config ?? null; - } - - /** - * @param {GenerationConfig} generation_config - * @param {number} input_ids_seq_length The starting sequence length for the input ids. 
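// A minimal usage sketch of the loading options parsed above. The model id is a placeholder,
// and the package import and AutoModel entry point are assumptions (they live outside this hunk);
// AutoModel routes to the appropriate PreTrainedModel subclass.
import { AutoModel } from '@huggingface/transformers';

const model = await AutoModel.from_pretrained('your-org/your-onnx-model', {
  device: 'webgpu', // or a per-file map, e.g. { embed_tokens: 'wasm', decoder_model_merged: 'webgpu' }
  dtype: 'fp16',    // or 'auto' to defer to the repo's transformers.js_config
  subfolder: 'onnx',
  session_options: {}, // forwarded to ONNX Runtime session creation
});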
- * @returns {LogitsProcessorList} - * @private - */ - _get_logits_processor( - generation_config, - input_ids_seq_length, - // encoder_input_ids, TODO - // prefix_allowed_tokens_fn, TODO - logits_processor = null, - ) { - const processors = new LogitsProcessorList(); - - // if (generation_config.diversity_penalty !== null && generation_config.diversity_penalty > 0.0) { - // processors.push(new HammingDiversityLogitsProcessor( - // generation_config.diversity_penalty, - // generation_config.num_beams, - // generation_config.num_beam_groups - // )); - // } - - // if (generation_config.encoder_repetition_penalty !== null && generation_config.encoder_repetition_penalty !== 1.0) { - // processors.push(new EncoderRepetitionPenaltyLogitsProcessor( - // generation_config.encoder_repetition_penalty, - // encoder_input_ids - // )); - // } - - if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1.0) { - processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty)); - } - - if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) { - processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)); - } - - // if (generation_config.encoder_no_repeat_ngram_size !== null && generation_config.encoder_no_repeat_ngram_size > 0) { - // if (this.config.is_encoder_decoder) { - // processors.push(new EncoderNoRepeatNGramLogitsProcessor( - // generation_config.encoder_no_repeat_ngram_size, - // encoder_input_ids - // )); - // } else { - // throw new Error("It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture"); - // } - // } - - if (generation_config.bad_words_ids !== null) { - processors.push( - new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id), - ); - } - - if ( - generation_config.min_length !== null && - generation_config.eos_token_id !== null && - generation_config.min_length > 0 - ) { - processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)); - } - - if ( - generation_config.min_new_tokens !== null && - generation_config.eos_token_id !== null && - generation_config.min_new_tokens > 0 - ) { - processors.push( - new MinNewTokensLengthLogitsProcessor( - input_ids_seq_length, - generation_config.min_new_tokens, - generation_config.eos_token_id, - ), - ); - } - - // if (prefix_allowed_tokens_fn !== null) { - // processors.push(new PrefixConstrainedLogitsProcessor( - // prefix_allowed_tokens_fn, - // generation_config.num_beams / generation_config.num_beam_groups - // )); - // } - - if (generation_config.forced_bos_token_id !== null) { - processors.push(new ForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id)); - } - - if (generation_config.forced_eos_token_id !== null) { - processors.push( - new ForcedEOSTokenLogitsProcessor(generation_config.max_length, generation_config.forced_eos_token_id), - ); - } - - // if (generation_config.remove_invalid_values === true) { - // processors.push(new InfNanRemoveLogitsProcessor()); - // } - - // if (generation_config.exponential_decay_length_penalty !== null) { - // processors.push(new ExponentialDecayLengthPenalty( - // generation_config.exponential_decay_length_penalty, - // generation_config.eos_token_id, - // input_ids_seq_length - // )); - // } - - // if (generation_config.suppress_tokens !== null) { - // processors.push(new 
SuppressTokensLogitsProcessor(generation_config.suppress_tokens)); - // } - - if (generation_config.begin_suppress_tokens !== null) { - const begin_index = - input_ids_seq_length > 1 || generation_config.forced_bos_token_id === null - ? input_ids_seq_length - : input_ids_seq_length + 1; - - processors.push( - new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index), - ); - } - - // DEPRECATED: https://github.com/huggingface/transformers/pull/29485 - // if (generation_config.forced_decoder_ids !== null) { - // processors.push(new ForceTokensLogitsProcessor(generation_config.forced_decoder_ids)); - // } - - // 8. prepare batched CFG externally - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - processors.push(new ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)); - } - - if (generation_config.temperature === 0 && generation_config.do_sample) { - console.warn( - '`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`.', - ); - generation_config.do_sample = false; - } - - if (generation_config.do_sample) { - if (generation_config.temperature !== null && generation_config.temperature !== 1.0) { - processors.push(new TemperatureLogitsWarper(generation_config.temperature)); - } - // TODO: Add TopPLogitsWarper and TopKLogitsWarper - // if (generation_config.top_k !== null && generation_config.top_k !== 0) { - // processors.push(new TopKLogitsWarper(generation_config.top_k)); - // } - // if (generation_config.top_p !== null && generation_config.top_p < 1.0) { - // processors.push(new TopPLogitsWarper(generation_config.top_p)); - // } - } - - if (logits_processor !== null) { - processors.extend(logits_processor); - } - - // `LogitNormalization` should always be the last logit processor, when present - // if (generation_config.renormalize_logits === true) { - // processors.push(new LogitNormalization()); - // } - - return processors; - } - - /** - * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. - * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. - * @param {GenerationConfig|null} generation_config A `GenerationConfig` object containing generation parameters. - * @param {Object} kwargs Additional generation parameters to be used in place of those in the `generation_config` object. - * @returns {GenerationConfig} The final generation config object to be used by the model for text generation. - */ - _prepare_generation_config(generation_config, kwargs, cls = GenerationConfig) { - // Create empty generation config (contains defaults) - // We pass `this.config` so that if `eos_token_id` or `bos_token_id` exist in the model's config, we will use them - const config = { ...this.config }; - for (const key of ['decoder', 'generator', 'text_config']) { - // Special case: some models have generation attributes set in the decoder. - // Use them if still unset in the generation config. 
- if (key in config) { - Object.assign(config, config[key]); - } - } - - const gen_config = new cls(config); - - // Apply model's generation config, if it exists - Object.assign(gen_config, this.generation_config ?? {}); - - // Next, use any generation config specified by the user - // when calling `generate` - if (generation_config) { - Object.assign(gen_config, generation_config); - } - - // Finally, if any kwargs were passed, use them to overwrite - if (kwargs) { - Object.assign(gen_config, pick(kwargs, Object.getOwnPropertyNames(gen_config))); - } - - return gen_config; - } - - /** - * - * @param {GenerationConfig} generation_config - * @param {StoppingCriteriaList} [stopping_criteria=null] - */ - _get_stopping_criteria(generation_config, stopping_criteria = null) { - const criteria = new StoppingCriteriaList(); - - if (generation_config.max_length !== null) { - criteria.push( - new MaxLengthCriteria(generation_config.max_length, this.config.max_position_embeddings ?? null), - ); - } - // if (generation_config.max_time !== null) { - // criteria.push(new MaxTimeCriteria(generation_config.max_time)); - // } - if (generation_config.eos_token_id !== null) { - criteria.push(new EosTokenCriteria(generation_config.eos_token_id)); - } - - if (stopping_criteria) { - criteria.extend(stopping_criteria); - } - return criteria; - } - - /** - * Confirms that the model class is compatible with generation. - * If not, raises an exception that points to the right class to use. - */ - _validate_model_class() { - if (!this.can_generate) { - const generate_compatible_mappings = [ - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, - // MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, // TODO - MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, - MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, - ]; - - const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); - - const generate_compatible_classes = new Set(); - const modelType = this.config.model_type; - for (const model_mapping of generate_compatible_mappings) { - const supported_models = model_mapping.get(modelType); - if (supported_models) { - generate_compatible_classes.add(supported_models[0]); - } - } - - let errorMessage = `The current model class (${modelName}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`; - if (generate_compatible_classes.size > 0) { - errorMessage += ` Please use the following class instead: ${[...generate_compatible_classes].join(', ')}`; - } - throw Error(errorMessage); - } - } - - prepare_inputs_for_generation(...args) { - return this._prepare_inputs_for_generation(this, ...args); - } - - /** - * - * @param {Object} inputs - * @param {bigint[][]} inputs.generated_input_ids - * @param {Object} inputs.outputs - * @param {Object} inputs.model_inputs - * @param {boolean} inputs.is_encoder_decoder - * @returns {Object} The updated model inputs for the next generation iteration. 
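// A minimal sketch of the precedence implemented above, using plain objects:
// model config < generation_config.json < user-provided generation_config < call kwargs
// (kwargs are additionally filtered to fields the generation config already defines).
const fromModelConfig = { eos_token_id: 2, max_length: 20 };
const fromGenerationConfigJson = { max_length: 128, do_sample: false };
const fromUserConfig = { do_sample: true };
const fromKwargs = { max_new_tokens: 64 };

const merged = Object.assign({}, fromModelConfig, fromGenerationConfigJson, fromUserConfig, fromKwargs);
console.log(merged);
// -> { eos_token_id: 2, max_length: 128, do_sample: true, max_new_tokens: 64 }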
- */ - _update_model_kwargs_for_generation({ generated_input_ids, outputs, model_inputs, is_encoder_decoder }) { - // update past_key_values - model_inputs['past_key_values'] = this.getPastKeyValues(outputs, model_inputs.past_key_values); - - // update inputs for next run - model_inputs['input_ids'] = new Tensor('int64', generated_input_ids.flat(), [generated_input_ids.length, 1]); - - if (!is_encoder_decoder) { - // update attention mask - model_inputs.attention_mask = cat( - [model_inputs.attention_mask, ones([model_inputs.attention_mask.dims[0], 1])], - 1, - ); - } else if ('decoder_attention_mask' in model_inputs) { - // TODO: update decoder attention mask if the model requires it - } - - // force recreate position_ids in next iteration - model_inputs['position_ids'] = null; - - return model_inputs; - } - - /** - * This function extracts the model-specific `inputs` for generation. - * @param {Object} params - * @param {Tensor} [params.inputs=null] - * @param {number} [params.bos_token_id=null] - * @param {Record} [params.model_kwargs] - * @returns {{inputs_tensor: Tensor, model_inputs: Record, model_input_name: string}} The model-specific inputs for generation. - */ - _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) { - const model_inputs = pick(model_kwargs, this.forward_params); - const input_name = this.main_input_name; - if (input_name in model_inputs) { - if (inputs) { - throw new Error( - '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' + - 'Make sure to either pass {inputs} or {input_name}=...', - ); - } - } else { - model_inputs[input_name] = inputs; - } - - const inputs_tensor = model_inputs[input_name]; - - return { inputs_tensor, model_inputs, model_input_name: input_name }; - } - - async _prepare_encoder_decoder_kwargs_for_generation({ - inputs_tensor, - model_inputs, - model_input_name, - generation_config, - }) { - if ( - this.sessions['model'].inputNames.includes('inputs_embeds') && - !model_inputs.inputs_embeds && - '_prepare_inputs_embeds' in this - ) { - // Encoder expects `inputs_embeds` instead of `input_ids` - const { input_ids, pixel_values, attention_mask, ...kwargs } = model_inputs; - // @ts-ignore - const prepared_inputs = await this._prepare_inputs_embeds(model_inputs); - model_inputs = { - ...kwargs, - ...pick(prepared_inputs, ['inputs_embeds', 'attention_mask']), - }; - } - let { last_hidden_state } = await encoderForward(this, model_inputs); - - // for classifier free guidance we need to add a 'null' input to our encoder hidden states - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - last_hidden_state = cat([last_hidden_state, full_like(last_hidden_state, 0.0)], 0); - - if ('attention_mask' in model_inputs) { - model_inputs['attention_mask'] = cat( - [model_inputs['attention_mask'], zeros_like(model_inputs['attention_mask'])], - 0, - ); - } - } else if (model_inputs.decoder_input_ids) { - // Ensure that the encoder outputs have the same batch size as the decoder inputs, - // allowing for more efficient batched generation for single inputs - const decoder_input_ids_batch_size = toI64Tensor(model_inputs.decoder_input_ids).dims[0]; - if (decoder_input_ids_batch_size !== last_hidden_state.dims[0]) { - if (last_hidden_state.dims[0] !== 1) { - throw new Error( - `The encoder outputs have a different batch size (${last_hidden_state.dims[0]}) than the decoder inputs (${decoder_input_ids_batch_size}).`, - ); - } - last_hidden_state = cat( - Array.from({ length: 
decoder_input_ids_batch_size }, () => last_hidden_state), - 0, - ); - } - } - model_inputs['encoder_outputs'] = last_hidden_state; - - return model_inputs; - } - - /** - * Prepares `decoder_input_ids` for generation with encoder-decoder models - * @param {*} param0 - */ - _prepare_decoder_input_ids_for_generation({ - batch_size, - model_input_name, - model_kwargs, - decoder_start_token_id, - bos_token_id, - generation_config, - }) { - let { decoder_input_ids, ...model_inputs } = model_kwargs; - - // Prepare input ids if the user has not defined `decoder_input_ids` manually. - if (!(decoder_input_ids instanceof Tensor)) { - if (!decoder_input_ids) { - decoder_start_token_id ??= bos_token_id; - - if (this.config.model_type === 'musicgen') { - // Custom logic (TODO: move to Musicgen class) - decoder_input_ids = Array.from( - { - // @ts-expect-error TS2339 - length: batch_size * this.config.decoder.num_codebooks, - }, - () => [decoder_start_token_id], - ); - } else if (Array.isArray(decoder_start_token_id)) { - if (decoder_start_token_id.length !== batch_size) { - throw new Error( - `\`decoder_start_token_id\` expcted to have length ${batch_size} but got ${decoder_start_token_id.length}`, - ); - } - decoder_input_ids = decoder_start_token_id; - } else { - decoder_input_ids = Array.from( - { - length: batch_size, - }, - () => [decoder_start_token_id], - ); - } - } else if (!Array.isArray(decoder_input_ids[0])) { - // Correct batch size - decoder_input_ids = Array.from( - { - length: batch_size, - }, - () => decoder_input_ids, - ); - } - decoder_input_ids = toI64Tensor(decoder_input_ids); - } - - model_kwargs['decoder_attention_mask'] = ones_like(decoder_input_ids); - - return { input_ids: decoder_input_ids, model_inputs }; - } - - /** - * Generates sequences of token ids for models with a language modeling head. - * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. - */ - async generate({ - inputs = null, - generation_config = null, - logits_processor = null, - stopping_criteria = null, - streamer = null, - - // inputs_attention_mask = null, - ...kwargs - }) { - this._validate_model_class(); - - // Update generation config with defaults and kwargs - generation_config = this._prepare_generation_config(generation_config, kwargs); - - // 3. Define model inputs - let { inputs_tensor, model_inputs, model_input_name } = this._prepare_model_inputs({ - inputs, - model_kwargs: kwargs, - }); - - const is_encoder_decoder = this.config.is_encoder_decoder; - - // 4. Define other model kwargs - if (!is_encoder_decoder) { - // decoder-only models should use left-padding for generation - } else if (!('encoder_outputs' in model_inputs)) { - // if model is encoder decoder encoder_outputs are created - // and added to `model_kwargs` - model_inputs = await this._prepare_encoder_decoder_kwargs_for_generation({ - inputs_tensor, - model_inputs, - model_input_name, - generation_config, - }); - } - - // 5. 
Prepare `input_ids` which will be used for auto-regressive generation - // TODO: Update to align with HF transformers' implementation - let input_ids; - if (is_encoder_decoder) { - // Generating from the encoder outputs - ({ input_ids, model_inputs } = this._prepare_decoder_input_ids_for_generation({ - batch_size: model_inputs[model_input_name].dims.at(0), - model_input_name, - model_kwargs: model_inputs, - decoder_start_token_id: generation_config.decoder_start_token_id, - bos_token_id: generation_config.bos_token_id, - generation_config, - })); - } else { - input_ids = model_inputs[model_input_name]; - } - - // 6. Prepare `max_length` depending on other stopping criteria. - let input_ids_length = input_ids.dims.at(-1); - - if (generation_config.max_new_tokens !== null) { - generation_config.max_length = input_ids_length + generation_config.max_new_tokens; - } - - // input_ids_length = model_inputs[model_input_name].dims.at(1); - // // inputs instanceof Tensor ? : inputs.length; - - // // decoder-only - // if (input_ids_length === 0) { - // throw Error("Must supply a non-empty array of input token ids.") - // } - - // let decoder_input_ids = - // generation_config.decoder_input_ids - // ?? generation_config.decoder_start_token_id - // ?? generation_config.bos_token_id - // ?? generation_config.eos_token_id; - - // Update logits processor - // 8. prepare distribution pre_processing samplers - const prepared_logits_processor = this._get_logits_processor( - generation_config, - input_ids_length, - logits_processor, - ); - - // 9. prepare stopping criteria - const prepared_stopping_criteria = this._get_stopping_criteria(generation_config, stopping_criteria); - - // /** @type {number[]} */ - // let eos_token_ids = generation_config.eos_token_id; - // if (eos_token_ids !== null && !Array.isArray(eos_token_ids)) { - // eos_token_ids = [eos_token_ids]; - // } - - const numInputs = model_inputs[model_input_name].dims.at(0); - - // TODO: - // done is a list of booleans to keep track of which inputs are done - // const done = new Array(numInputs).fill(false); - // For efficiency purposes, we remove completed rows from model_inputs - // when the beam is complete, and we keep track of the row index - // const rowIndexToBatchIndex = new Map(); - - const sampler = LogitsSampler.getSampler(generation_config); - - // TODO make > numInputs - const scores = new Array(numInputs).fill(0); - /** @type {bigint[][]} */ - const all_input_ids = input_ids.tolist(); - if (streamer) { - streamer.put(all_input_ids); - } - // const all_generated_input_ids = Array.from({ length: numInputs }, () => []); - - // NOTE: For now, we don't support spawning new beams - // TODO: when we do, we simply copy past key values and accumulate into single large tensor - - //////////////////////////////////////////////////// - // Generic search which handles 4 generation modes: - // - GenerationMode.GREEDY_SEARCH - // - GenerationMode.SAMPLE - // - GenerationMode.BEAM_SEARCH - // - GenerationMode.BEAM_SAMPLE - //////////////////////////////////////////////////// - let outputs; - let attentions = {}; - let return_dict_items = {}; - while (true) { - // prepare model inputs - model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config); - outputs = await this.forward(model_inputs); - - if (generation_config.return_dict_in_generate) { - if (generation_config.output_attentions) { - // Get attentions if they are present - const token_attentions = this.getAttentions(outputs); - for (const key in 
token_attentions) { - if (!(key in attentions)) { - attentions[key] = []; - } - attentions[key].push(token_attentions[key]); - } - } else if (this._return_dict_in_generate_keys) { - Object.assign(return_dict_items, pick(outputs, this._return_dict_in_generate_keys)); - } - } - - // Logits are of the form [batch_size, out_seq_length, vocab_size] - // In most cases, this will be [batch_size, 1, vocab_size] - // So, we select the last token's logits: - // (equivalent to `logits = outputs.logits[:, -1, :]`) - // The `.to('float32')` is necessary for models with float16 logits, - // and is a no-op for float32 logits. - // TODO: Support float16 sampling in the sampler directly - const logits = outputs.logits.slice(null, -1, null).to('float32'); - - const next_tokens_scores = prepared_logits_processor(all_input_ids, logits); - - /** @type {[bigint][]} */ - const generated_input_ids = []; - // const new_kv_cache = [];// NOTE: Only used for beam search when concatenating new kv - // Loop over each batch - for (let batch_idx = 0; batch_idx < next_tokens_scores.dims.at(0); ++batch_idx) { - const logs = next_tokens_scores[batch_idx]; - - const sampledTokens = await sampler(logs); - for (const [newTokenId, logProb] of sampledTokens) { - const bigint = BigInt(newTokenId); - // TODO: If branching, use previous beam as a starting point - // update generated ids, model inputs, and length for next step - scores[batch_idx] += logProb; - all_input_ids[batch_idx].push(bigint); - generated_input_ids.push([bigint]); - - // TODO: Support beam search - break; - } - } - if (streamer) { - streamer.put(generated_input_ids); - } - - const stop = prepared_stopping_criteria(all_input_ids); - if (stop.every((x) => x)) { - break; - } - - model_inputs = this._update_model_kwargs_for_generation({ - generated_input_ids, - outputs, - model_inputs, - is_encoder_decoder, - }); - } - - if (streamer) { - streamer.end(); - } - - // Retrieve and dispose all final past key values (including encoder attentions) - const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true); - - // TODO: ensure all_input_ids is padded correctly... - const sequences = new Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]); - - if (generation_config.return_dict_in_generate) { - return { - sequences, - past_key_values, - ...attentions, - ...return_dict_items, - // TODO: - // scores, - // logits, - }; - } else { - // Dispose all remaining tensors - for (const tensor of Object.values(outputs)) { - if (tensor.location === 'gpu-buffer') { - tensor.dispose(); - } - } - return sequences; - } - } - - /** - * Returns an object containing past key values from the given decoder results object. - * - * @param {Object} decoderResults The decoder results object. - * @param {Object} pastKeyValues The previous past key values. - * @returns {Object} An object containing past key values. - */ - getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) { - const pkvs = Object.create(null); - - for (const name in decoderResults) { - if (name.startsWith('present')) { - const newName = name - // Hybrid cache architecture - .replace('present_ssm', 'past_ssm') // Mamba - .replace('present_conv', 'past_conv') // LFM2 - - // Standard cache architecture - .replace('present', 'past_key_values'); - const is_encoder_pkv = name.includes('encoder'); - if (is_encoder_pkv && pastKeyValues) { - // Optimization introduced by optimum to reuse past key values. 
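The generic generation loop shown in this hunk is normally driven through the public `generate()` API rather than called step by step. A minimal sketch, assuming the library is consumed as `@huggingface/transformers` and using an illustrative Hub model id ('Xenova/flan-t5-small'); this snippet is not part of the diff itself:

import { AutoTokenizer, AutoModelForSeq2SeqLM, TextStreamer } from '@huggingface/transformers';

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/flan-t5-small');
const model = await AutoModelForSeq2SeqLM.from_pretrained('Xenova/flan-t5-small');

// Tokenize the prompt; for encoder-decoder models, `generate()` builds
// `encoder_outputs` and `decoder_input_ids` itself (steps 4-5 above).
const inputs = tokenizer('Translate English to German: How old are you?');

// The streamer receives each batch of newly sampled tokens via `streamer.put(...)`
// and is closed with `streamer.end()` once the stopping criteria are met.
const streamer = new TextStreamer(tokenizer);

const output_ids = await model.generate({ ...inputs, max_new_tokens: 40, streamer });
console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true }));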
- // So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values. - // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704 - pkvs[newName] = pastKeyValues[newName]; - } else { - // decoder or using first encoder PKVs - pkvs[newName] = decoderResults[name]; - } - - if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) { - // - Always dispose decoder PKVs - // - Only dispose encoder past key values when requested (after generation) - const t = pastKeyValues[newName]; - if (t.location === 'gpu-buffer') { - t.dispose(); - } - } - } - } - return pkvs; - } - - /** - * Returns an object containing attentions from the given model output object. - * - * @param {Object} model_output The output of the model. - * @returns {{cross_attentions?: Tensor[]}} An object containing attentions. - */ - getAttentions(model_output) { - const attentions = {}; - - for (const attnName of ['cross_attentions', 'encoder_attentions', 'decoder_attentions']) { - for (const name in model_output) { - if (name.startsWith(attnName)) { - if (!(attnName in attentions)) { - attentions[attnName] = []; - } - attentions[attnName].push(model_output[name]); - } - } - } - return attentions; - } - - /** - * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. - * - * @param {Object} decoderFeeds The decoder feeds object to add past key values to. - * @param {Object} pastKeyValues An object containing past key values. - */ - addPastKeyValues(decoderFeeds, pastKeyValues) { - if (pastKeyValues) { - Object.assign(decoderFeeds, pastKeyValues); - } else { - const session = this.sessions['decoder_model_merged'] ?? this.sessions['model']; - const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1; - - const dtype = session?.config?.kv_cache_dtype ?? 'float32'; - const cls = dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32; - const shapes = getCacheShapes(this.config, { batch_size }); - for (const name in shapes) { - const size = shapes[name].reduce((a, b) => a * b, 1); - decoderFeeds[name] = new Tensor(dtype, new cls(size), shapes[name]); - } - } - } - - async encode_image({ pixel_values }) { - // image_inputs === { pixel_values } - return (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features; - } - - async encode_text({ input_ids }) { - // text_inputs === { input_ids, attention_mask } - return (await sessionRun(this.sessions['embed_tokens'], { input_ids })).inputs_embeds; - } - - async encode_audio({ audio_values }) { - // audio_inputs === { audio_values } - return (await sessionRun(this.sessions['audio_encoder'], { audio_values })).audio_features; - } -} - -////////////////////////////////////////////////// -// Base model output class -export class ModelOutput {} - -/** - * Base class for model's outputs, with potential hidden states and attentions. - */ -export class BaseModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. - * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
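The KV-cache helpers in this hunk (`getPastKeyValues`, `addPastKeyValues`) are what make incremental decoding work: the first forward pass is fed empty past tensors, and later passes reuse the `present_*` outputs. A hedged usage sketch for a decoder-only model; the package name and model id are illustrative and not taken from this diff:

import { AutoTokenizer, AutoModelForCausalLM } from '@huggingface/transformers';

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');
const model = await AutoModelForCausalLM.from_pretrained('Xenova/gpt2');

const inputs = tokenizer('Once upon a time,');
const output_ids = await model.generate({
  ...inputs,
  max_new_tokens: 30, // turned into `max_length = input_length + max_new_tokens` above
  do_sample: true,    // selects the multinomial sampler instead of greedy search
  top_k: 50,
  temperature: 0.8,
});
console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true }));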
- */ - constructor({ last_hidden_state, hidden_states = null, attentions = null }) { - super(); - this.last_hidden_state = last_hidden_state; - this.hidden_states = hidden_states; - this.attentions = attentions; - } -} -////////////////////////////////////////////////// -// Bert models -export class BertPreTrainedModel extends PreTrainedModel {} -export class BertModel extends BertPreTrainedModel {} - -/** - * BertForMaskedLM is a class representing a BERT model for masked language modeling. - */ -export class BertForMaskedLM extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * BertForSequenceClassification is a class representing a BERT model for sequence classification. - */ -export class BertForSequenceClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * BertForTokenClassification is a class representing a BERT model for token classification. - */ -export class BertForTokenClassification extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * BertForQuestionAnswering is a class representing a BERT model for question answering. - */ -export class BertForQuestionAnswering extends BertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// NeoBert models -export class NeoBertPreTrainedModel extends PreTrainedModel {} -export class NeoBertModel extends NeoBertPreTrainedModel {} - -export class NeoBertForMaskedLM extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -export class NeoBertForSequenceClassification extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class NeoBertForTokenClassification extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
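The Bert*/NeoBert* task heads defined in this hunk are usually reached through the `pipeline` factory rather than constructed directly. An illustrative sketch; the model ids are examples from the Hub and are not referenced by this diff:

import { pipeline } from '@huggingface/transformers';

// Routed to BertForMaskedLM -> MaskedLMOutput
const unmasker = await pipeline('fill-mask', 'Xenova/bert-base-uncased');
console.log(await unmasker('The goal of life is [MASK].'));

// Routed to a *ForQuestionAnswering head -> QuestionAnsweringModelOutput
const answerer = await pipeline('question-answering', 'Xenova/distilbert-base-cased-distilled-squad');
console.log(await answerer('Who wrote the play?', 'The play was written by Shakespeare.'));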
- * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -export class NeoBertForQuestionAnswering extends NeoBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ModernBert models -export class ModernBertPreTrainedModel extends PreTrainedModel {} -export class ModernBertModel extends ModernBertPreTrainedModel {} - -export class ModernBertForMaskedLM extends ModernBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -export class ModernBertForSequenceClassification extends ModernBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class ModernBertForTokenClassification extends ModernBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ModernBERT Decoder models -export class ModernBertDecoderPreTrainedModel extends PreTrainedModel {} -export class ModernBertDecoderModel extends ModernBertDecoderPreTrainedModel {} -export class ModernBertDecoderForCausalLM extends ModernBertDecoderPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// NomicBert models -export class NomicBertPreTrainedModel extends PreTrainedModel {} -export class NomicBertModel extends NomicBertPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// RoFormer models -export class RoFormerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. - */ -export class RoFormerModel extends RoFormerPreTrainedModel {} - -/** - * RoFormer Model with a `language modeling` head on top. - */ -export class RoFormerForMaskedLM extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. 
- */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class RoFormerForSequenceClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) - * e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class RoFormerForTokenClassification extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD - * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class RoFormerForQuestionAnswering extends RoFormerPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -// TODO: Add RoFormerForCausalLM and RoFormerForMultipleChoice -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ConvBert models -export class ConvBertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top. - */ -export class ConvBertModel extends ConvBertPreTrainedModel {} - -/** - * ConvBERT Model with a language modeling head on top. - */ -export class ConvBertForMaskedLM extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class ConvBertForSequenceClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) - * e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class ConvBertForTokenClassification extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD - * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`) - */ -export class ConvBertForQuestionAnswering extends ConvBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Electra models -export class ElectraPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Electra Model transformer outputting raw hidden-states without any specific head on top. - * Identical to the BERT model except that it uses an additional linear layer between the embedding - * layer and the encoder if the hidden size and embedding size are different. - */ -export class ElectraModel extends ElectraPreTrainedModel {} -// TODO add ElectraForPreTraining -/** - * Electra model with a language modeling head on top. - */ -export class ElectraForMaskedLM extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class ElectraForSequenceClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * Electra model with a token classification head on top. - */ -export class ElectraForTokenClassification extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * LECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD - * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class ElectraForQuestionAnswering extends ElectraPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. 
- */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CamemBERT models -export class CamembertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top. - */ -export class CamembertModel extends CamembertPreTrainedModel {} - -/** - * CamemBERT Model with a `language modeling` head on top. - */ -export class CamembertForMaskedLM extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. - */ -export class CamembertForSequenceClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class CamembertForTokenClassification extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * CamemBERT Model with a span classification head on top for extractive question-answering tasks - */ -export class CamembertForQuestionAnswering extends CamembertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// DeBERTa models -export class DebertaPreTrainedModel extends PreTrainedModel {} - -/** - * The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top. - */ -export class DebertaModel extends DebertaPreTrainedModel {} - -/** - * DeBERTa Model with a `language modeling` head on top. - */ -export class DebertaForMaskedLM extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. 
- */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class DebertaForSequenceClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class DebertaForTokenClassification extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class DebertaForQuestionAnswering extends DebertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// DeBERTa-v2 models -export class DebertaV2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare DeBERTa-V2 Model transformer outputting raw hidden-states without any specific head on top. - */ -export class DebertaV2Model extends DebertaV2PreTrainedModel {} - -/** - * DeBERTa-V2 Model with a `language modeling` head on top. - */ -export class DebertaV2ForMaskedLM extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa-V2 Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class DebertaV2ForSequenceClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa-V2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class DebertaV2ForTokenClassification extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DeBERTa-V2 Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - * layers on top of the hidden-states output to compute `span start logits` and `span end logits`). - */ -export class DebertaV2ForQuestionAnswering extends DebertaV2PreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// DistilBert models -export class DistilBertPreTrainedModel extends PreTrainedModel {} -export class DistilBertModel extends DistilBertPreTrainedModel {} - -/** - * DistilBertForSequenceClassification is a class representing a DistilBERT model for sequence classification. - */ -export class DistilBertForSequenceClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DistilBertForTokenClassification is a class representing a DistilBERT model for token classification. - */ -export class DistilBertForTokenClassification extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * DistilBertForQuestionAnswering is a class representing a DistilBERT model for question answering. - */ -export class DistilBertForQuestionAnswering extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} - -/** - * DistilBertForMaskedLM is a class representing a DistilBERT model for masking task. - */ -export class DistilBertForMaskedLM extends DistilBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ESM models -export class EsmPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ESM Model transformer outputting raw hidden-states without any specific head on top. - */ -export class EsmModel extends EsmPreTrainedModel {} - -/** - * ESM Model with a `language modeling` head on top. 
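For the sequence- and token-classification heads in this stretch (DeBERTa, DistilBERT, and friends), the typical entry points are the text-classification and token-classification pipelines. A hedged sketch with example model ids:

import { pipeline } from '@huggingface/transformers';

// SequenceClassifierOutput: one label per input sequence
const classifier = await pipeline('text-classification', 'Xenova/distilbert-base-uncased-finetuned-sst-2-english');
console.log(await classifier('Running transformers in the browser is great!'));

// TokenClassifierOutput: one label per token (e.g. NER)
const ner = await pipeline('token-classification', 'Xenova/bert-base-NER');
console.log(await ner('My name is Sarah and I live in London.'));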
- */ -export class EsmForMaskedLM extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class EsmForSequenceClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * ESM Model with a token classification head on top (a linear layer on top of the hidden-states output) - * e.g. for Named-Entity-Recognition (NER) tasks. - */ -export class EsmForTokenClassification extends EsmPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileBert models -export class MobileBertPreTrainedModel extends PreTrainedModel {} -export class MobileBertModel extends MobileBertPreTrainedModel {} - -/** - * MobileBertForMaskedLM is a class representing a MobileBERT model for masking task. - */ -export class MobileBertForMaskedLM extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class MobileBertForSequenceClassification extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * MobileBert Model with a span classification head on top for extractive question-answering tasks - */ -export class MobileBertForQuestionAnswering extends MobileBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MPNet models -export class MPNetPreTrainedModel extends PreTrainedModel {} - -/** - * The bare MPNet Model transformer outputting raw hidden-states without any specific head on top. - */ -export class MPNetModel extends MPNetPreTrainedModel {} - -/** - * MPNetForMaskedLM is a class representing a MPNet model for masked language modeling. 
- */ -export class MPNetForMaskedLM extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for masked language modeling. - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * MPNetForSequenceClassification is a class representing a MPNet model for sequence classification. - */ -export class MPNetForSequenceClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * MPNetForTokenClassification is a class representing a MPNet model for token classification. - */ -export class MPNetForTokenClassification extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * MPNetForQuestionAnswering is a class representing a MPNet model for question answering. - */ -export class MPNetForQuestionAnswering extends MPNetPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for question answering. - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// SqueezeBert models -export class SqueezeBertPreTrainedModel extends PreTrainedModel {} -export class SqueezeBertModel extends SqueezeBertPreTrainedModel {} -export class SqueezeBertForMaskedLM extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} -export class SqueezeBertForSequenceClassification extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class SqueezeBertForQuestionAnswering extends SqueezeBertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Albert models -export class AlbertPreTrainedModel extends PreTrainedModel {} -export class AlbertModel extends AlbertPreTrainedModel {} -export class AlbertForSequenceClassification extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. 
- * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class AlbertForQuestionAnswering extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -export class AlbertForMaskedLM extends AlbertPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// T5 models -export class T5PreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'attention_mask', - 'encoder_outputs', - 'decoder_input_ids', - 'decoder_attention_mask', - 'past_key_values', - ]; -} - -export class T5Model extends T5PreTrainedModel {} - -/** - * T5Model is a class representing a T5 model for conditional generation. - */ -export class T5ForConditionalGeneration extends T5PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// LONGT5 models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class LongT5PreTrainedModel extends PreTrainedModel {} - -/** - * The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top. - */ -export class LongT5Model extends LongT5PreTrainedModel {} - -/** - * LONGT5 Model with a `language modeling` head on top. - */ -export class LongT5ForConditionalGeneration extends LongT5PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MT5 models -export class MT5PreTrainedModel extends PreTrainedModel {} - -export class MT5Model extends MT5PreTrainedModel {} - -/** - * A class representing a conditional sequence-to-sequence model based on the MT5 architecture. - */ -export class MT5ForConditionalGeneration extends MT5PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Bart models -export class BartPretrainedModel extends PreTrainedModel {} - -/** - * The bare BART Model outputting raw hidden-states without any specific head on top. - */ -export class BartModel extends BartPretrainedModel {} - -/** - * The BART Model with a language modeling head. Can be used for summarization. - */ -export class BartForConditionalGeneration extends BartPretrainedModel {} - -/** - * Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) - */ -export class BartForSequenceClassification extends BartPretrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. 
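The T5/Bart `*ForConditionalGeneration` classes above back the seq2seq pipelines (summarization, translation, text2text-generation). A minimal sketch with an illustrative model id; generation options are forwarded to `generate()` as shown earlier in this file:

import { pipeline } from '@huggingface/transformers';

const summarizer = await pipeline('summarization', 'Xenova/distilbart-cnn-6-6');
const article = 'Paste a long news article here...';
console.log(await summarizer(article, { max_new_tokens: 60 }));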
- */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MBart models -export class MBartPreTrainedModel extends PreTrainedModel {} - -/** - * The bare MBART Model outputting raw hidden-states without any specific head on top. - */ -export class MBartModel extends MBartPreTrainedModel {} - -/** - * The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models. - */ -export class MBartForConditionalGeneration extends MBartPreTrainedModel {} - -/** - * MBart model with a sequence classification/head on top (a linear layer on top of the pooled output). - */ -export class MBartForSequenceClassification extends MBartPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class MBartForCausalLM extends MBartPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Blenderbot models -export class BlenderbotPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Blenderbot Model outputting raw hidden-states without any specific head on top. - */ -export class BlenderbotModel extends BlenderbotPreTrainedModel {} - -/** - * The Blenderbot Model with a language modeling head. Can be used for summarization. - */ -export class BlenderbotForConditionalGeneration extends BlenderbotPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Blenderbot models -export class BlenderbotSmallPreTrainedModel extends PreTrainedModel {} - -/** - * The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top. - */ -export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel {} - -/** - * The BlenderbotSmall Model with a language modeling head. Can be used for summarization. - */ -export class BlenderbotSmallForConditionalGeneration extends BlenderbotSmallPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Roberta models -export class RobertaPreTrainedModel extends PreTrainedModel {} -export class RobertaModel extends RobertaPreTrainedModel {} - -/** - * RobertaForMaskedLM class for performing masked language modeling on Roberta models. - */ -export class RobertaForMaskedLM extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * RobertaForSequenceClassification class for performing sequence classification on Roberta models. - */ -export class RobertaForSequenceClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RobertaForTokenClassification class for performing token classification on Roberta models. - */ -export class RobertaForTokenClassification extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * RobertaForQuestionAnswering class for performing question answering on Roberta models. - */ -export class RobertaForQuestionAnswering extends RobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// XLM models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class XLMPreTrainedModel extends PreTrainedModel {} - -/** - * The bare XLM Model transformer outputting raw hidden-states without any specific head on top. - */ -export class XLMModel extends XLMPreTrainedModel {} - -/** - * The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class XLMWithLMHeadModel extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) - */ -export class XLMForSequenceClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) - */ -export class XLMForTokenClassification extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLM Model with a span classification head on top for extractive question-answering tasks - */ -export class XLMForQuestionAnswering extends XLMPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. 
- * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// XLMRoberta models -export class XLMRobertaPreTrainedModel extends PreTrainedModel {} -export class XLMRobertaModel extends XLMRobertaPreTrainedModel {} - -/** - * XLMRobertaForMaskedLM class for performing masked language modeling on XLMRoberta models. - */ -export class XLMRobertaForMaskedLM extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new MaskedLMOutput(await super._call(model_inputs)); - } -} - -/** - * XLMRobertaForSequenceClassification class for performing sequence classification on XLMRoberta models. - */ -export class XLMRobertaForSequenceClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLMRobertaForTokenClassification class for performing token classification on XLMRoberta models. - */ -export class XLMRobertaForTokenClassification extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for token classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * XLMRobertaForQuestionAnswering class for performing question answering on XLMRoberta models. - */ -export class XLMRobertaForQuestionAnswering extends XLMRobertaPreTrainedModel { - /** - * Calls the model on new inputs. - * - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} returned object - */ - async _call(model_inputs) { - return new QuestionAnsweringModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Audio Spectrogram Transformer (AST) models -export class ASTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare AST Model transformer outputting raw hidden-states without any specific head on top. - */ -export class ASTModel extends ASTPreTrainedModel {} - -/** - * Audio Spectrogram Transformer model with an audio classification head on top - * (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2. - */ -export class ASTForAudioClassification extends ASTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Whisper models -export class WhisperPreTrainedModel extends PreTrainedModel { - requires_attention_mask = false; - main_input_name = 'input_features'; - forward_params = [ - 'input_features', - 'attention_mask', - 'decoder_input_ids', - 'decoder_attention_mask', - 'past_key_values', - ]; -} - -/** - * WhisperModel class for training Whisper models without a language model head. 
- */ -export class WhisperModel extends WhisperPreTrainedModel {} - -/** - * WhisperForConditionalGeneration class for generating conditional outputs from Whisper models. - */ -export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { - _prepare_generation_config(generation_config, kwargs) { - return /** @type {WhisperGenerationConfig} */ ( - super._prepare_generation_config(generation_config, kwargs, WhisperGenerationConfig) - ); - } - - /** - * - * @param {WhisperGenerationConfig} generation_config - */ - _retrieve_init_tokens(generation_config) { - // prefix tokens are of the form: - // - Multilingual: <|startoftranscript|> <|lang_id|> <|task|> [<|notimestamps|>] - // - English-only: <|startoftranscript|> [<|notimestamps|>] - - // 1. Handle <|startoftranscript|> token - const init_tokens = [generation_config.decoder_start_token_id]; - - // 2. Handle <|lang_id|> and <|task> tokens - let language = generation_config.language; - const task = generation_config.task; - if (generation_config.is_multilingual) { - if (!language) { - // TODO: Implement language detection - console.warn('No language specified - defaulting to English (en).'); - language = 'en'; - } - - // Add language token - const language_code = whisper_language_to_code(language); - const language_token = `<|${language_code}|>`; - init_tokens.push(generation_config.lang_to_id[language_token]); - - // Add task token - // NOTE: Defaults to 'transcribe' if no task is specified - init_tokens.push(generation_config.task_to_id[task ?? 'transcribe']); - } else if (language || task) { - throw new Error( - 'Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.', - ); - } - - // 3. Handle <|notimestamps|> token - if ( - !generation_config.return_timestamps && - generation_config.no_timestamps_token_id && - init_tokens.at(-1) !== generation_config.no_timestamps_token_id - ) { - init_tokens.push(generation_config.no_timestamps_token_id); - } else if ( - generation_config.return_timestamps && - init_tokens.at(-1) === generation_config.no_timestamps_token_id - ) { - console.warn( - '<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`.', - ); - init_tokens.pop(); - } - - // let's make sure we don't pass `null` tokens as prompt tokens - return init_tokens.filter((token) => token != null); - } - - /** - * Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids. - * @param {import('./models/whisper/generation_whisper.js').WhisperGenerationFunctionParameters} options - * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. - */ - async generate({ - inputs = null, - generation_config = null, - logits_processor = null, - stopping_criteria = null, - - // Whisper-specific options (passed to kwargs) - // prompt_ids = null, - // language = null, - // task = null, - - ...kwargs - }) { - generation_config = this._prepare_generation_config(generation_config, kwargs); - - const init_tokens = kwargs.decoder_input_ids ?? 
this._retrieve_init_tokens(generation_config); - - if (generation_config.return_timestamps) { - logits_processor ??= new LogitsProcessorList(); - logits_processor.push(new WhisperTimeStampLogitsProcessor(generation_config, init_tokens)); - } - - if (generation_config.begin_suppress_tokens) { - logits_processor ??= new LogitsProcessorList(); - logits_processor.push( - new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, init_tokens.length), - ); - } - - if (generation_config.return_token_timestamps) { - if (!generation_config.alignment_heads) { - throw new Error( - 'Model generation config has no `alignment_heads`, token-level timestamps not available. ' + - 'See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.', - ); - } - - if (generation_config.task === 'translate') { - console.warn("Token-level timestamps may not be reliable for task 'translate'."); - } - - generation_config.output_attentions = true; - generation_config.return_dict_in_generate = true; - } - - const outputs = await super.generate({ - inputs, - generation_config, - logits_processor, - decoder_input_ids: init_tokens, - ...kwargs, - }); - - if (generation_config.return_token_timestamps) { - outputs['token_timestamps'] = this._extract_token_timestamps( - // @ts-expect-error TS2345 - outputs, - generation_config.alignment_heads, - generation_config.num_frames, - ); - } - - return outputs; - } - - /** - * Calculates token-level timestamps using the encoder-decoder cross-attentions and - * dynamic time-warping (DTW) to map each output token to a position in the input audio. - * If `num_frames` is specified, the encoder-decoder cross-attentions will be cropped before applying DTW. - * @param {Object} generate_outputs Outputs generated by the model - * @param {Tensor[][]} generate_outputs.cross_attentions The cross attentions output by the model - * @param {Tensor} generate_outputs.sequences The sequences output by the model - * @param {number[][]} alignment_heads Alignment heads of the model - * @param {number} [num_frames=null] Number of frames in the input audio. - * @param {number} [time_precision=0.02] Precision of the timestamps in seconds - * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token - */ - _extract_token_timestamps(generate_outputs, alignment_heads, num_frames = null, time_precision = 0.02) { - if (!generate_outputs.cross_attentions) { - throw new Error( - 'Model outputs must contain cross attentions to extract timestamps. ' + - 'This is most likely because the model was not exported with `output_attentions=True`.', - ); - } - if (num_frames == null) { - console.warn( - '`num_frames` has not been set, meaning the entire audio will be analyzed. ' + - 'This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).', - ); - } - - // @ts-expect-error TS2339 - let median_filter_width = this.config.median_filter_width; - if (median_filter_width === undefined) { - console.warn('Model config has no `median_filter_width`, using default value of 7.'); - median_filter_width = 7; - } - - // TODO: Improve batch processing - const batch = generate_outputs.cross_attentions; - // Create a list with `decoder_layers` elements, each a tensor of shape - // (batch size, attention_heads, output length, input length). 
- const cross_attentions = Array.from( - // @ts-expect-error TS2339 - { length: this.config.decoder_layers }, - // Concatenate the cross attentions for each layer across sequence length dimension. - (_, i) => - cat( - batch.map((x) => x[i]), - 2, - ), - ); - - const weights = stack( - alignment_heads.map(([l, h]) => { - if (l >= cross_attentions.length) { - throw new Error( - `Layer index ${l} is out of bounds for cross attentions (length ${cross_attentions.length}).`, - ); - } - return num_frames - ? cross_attentions[l].slice(null, h, null, [0, num_frames]) - : cross_attentions[l].slice(null, h); - }), - ).transpose(1, 0, 2, 3); - - const [std, calculatedMean] = std_mean(weights, -2, 0, true); - - // Normalize and smoothen the weights. - const smoothedWeights = weights.clone(); // [1, 8, seqLength, 1500] - - for (let a = 0; a < smoothedWeights.dims[0]; ++a) { - const aTensor = smoothedWeights[a]; // [8, seqLength, 1500] - - for (let b = 0; b < aTensor.dims[0]; ++b) { - const bTensor = aTensor[b]; // [seqLength, 1500] - - const stdTensorData = std[a][b][0].data; // [1500] - const meanTensorData = calculatedMean[a][b][0].data; // [1500] - - for (let c = 0; c < bTensor.dims[0]; ++c) { - let cTensorData = bTensor[c].data; // [1500] - for (let d = 0; d < cTensorData.length; ++d) { - cTensorData[d] = (cTensorData[d] - meanTensorData[d]) / stdTensorData[d]; - } - - // Apply median filter. - cTensorData.set(medianFilter(cTensorData, median_filter_width)); - } - } - } - - // Average the different cross-attention heads. - const batchedMatrices = [mean(smoothedWeights, 1)]; - - const timestampsShape = generate_outputs.sequences.dims; - - const timestamps = new Tensor( - 'float32', - new Float32Array(timestampsShape[0] * timestampsShape[1]), - timestampsShape, - ); - - // Perform dynamic time warping on each element of the batch. - for (let batch_idx = 0; batch_idx < timestampsShape[0]; ++batch_idx) { - // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions - // as the python implementation - const matrix = batchedMatrices[batch_idx].neg().squeeze_(0); - const [text_indices, time_indices] = dynamic_time_warping(matrix.tolist()); - - const diffs = Array.from( - { length: text_indices.length - 1 }, - (v, i) => text_indices[i + 1] - text_indices[i], - ); - const jumps = mergeArrays([1], diffs).map((x) => !!x); // convert to boolean - - const jump_times = []; - for (let i = 0; i < jumps.length; ++i) { - if (jumps[i]) { - // NOTE: No point in rounding here, since we set to Float32Array later - jump_times.push(time_indices[i] * time_precision); - } - } - timestamps[batch_idx].data.set(jump_times, 1); - } - - return timestamps; - } -} -////////////////////////////////////////////////// - -export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration {} - -////////////////////////////////////////////////// -// Moonshine models -export class MoonshinePreTrainedModel extends PreTrainedModel { - requires_attention_mask = false; - main_input_name = 'input_values'; - forward_params = ['input_values', 'decoder_input_ids', 'past_key_values']; -} - -/** - * MoonshineModel class for training Moonshine models without a language model head. 
- */
-export class MoonshineModel extends MoonshinePreTrainedModel {}
-
-export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-/**
- * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
- */
-export class VisionEncoderDecoderModel extends PreTrainedModel {
-    main_input_name = 'pixel_values';
-    forward_params = [
-        // Encoder inputs
-        'pixel_values',
-
-        // Decoder inputs
-        'decoder_input_ids',
-        'encoder_hidden_states',
-        'past_key_values',
-    ];
-}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// LLaVa Models
-export class LlavaPreTrainedModel extends PreTrainedModel {
-    forward_params = ['input_ids', 'attention_mask', 'pixel_values', 'position_ids', 'past_key_values'];
-}
-
-/**
- * The LLAVA model which consists of a vision backbone and a language model.
- */
-export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
-    _merge_input_ids_with_image_features(kwargs) {
-        const vision_hidden_size = kwargs.image_features.dims.at(-1);
-        const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
-
-        return default_merge_input_ids_with_image_features({
-            // @ts-ignore
-            image_token_id: this.config.image_token_index,
-            ...kwargs,
-            image_features: reshaped_image_hidden_states,
-        });
-    }
-}
-//////////////////////////////////////////////////
-
-export class LlavaOnevisionForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration
-export class Moondream1ForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration
-
-export class Florence2PreTrainedModel extends PreTrainedModel {
-    forward_params = [
-        // Encoder inputs
-        'input_ids',
-        'inputs_embeds',
-        'attention_mask',
-        'pixel_values',
-
-        // Decoder inputs
-        'encoder_outputs',
-        'decoder_input_ids',
-        'decoder_inputs_embeds',
-        'decoder_attention_mask',
-        'past_key_values',
-    ];
-    main_input_name = 'inputs_embeds';
-}
-
-export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel {
-    _merge_input_ids_with_image_features({ inputs_embeds, image_features, input_ids, attention_mask }) {
-        return {
-            inputs_embeds: cat(
-                [
-                    image_features, // image embeds
-                    inputs_embeds, // task prefix embeds
-                ],
-                1,
-            ),
-            attention_mask: cat(
-                [
-                    ones(image_features.dims.slice(0, 2)), // image attention mask
-                    attention_mask, // task prefix attention mask
-                ],
-                1,
-            ),
-        };
-    }
-
-    async _prepare_inputs_embeds({ input_ids, pixel_values, inputs_embeds, attention_mask }) {
-        if (!input_ids && !pixel_values) {
-            throw new Error('Either `input_ids` or `pixel_values` should be provided.');
-        }
-
-        // 1. Possibly, extract the input embeddings
-        let text_features, image_features;
-        if (input_ids) {
-            text_features = await this.encode_text({ input_ids });
-        }
-        if (pixel_values) {
-            image_features = await this.encode_image({ pixel_values });
-        }
-
-        // 2.
Possibly, merge text and images - if (text_features && image_features) { - ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ - inputs_embeds: text_features, - image_features, - input_ids, - attention_mask, - })); - } else { - inputs_embeds = text_features || image_features; - } - - return { inputs_embeds, attention_mask }; - } - - async forward({ - input_ids, - pixel_values, - attention_mask, - decoder_input_ids, - decoder_attention_mask, - encoder_outputs, - past_key_values, - - inputs_embeds, - decoder_inputs_embeds, - }) { - if (!inputs_embeds) { - ({ inputs_embeds, attention_mask } = await this._prepare_inputs_embeds({ - input_ids, - pixel_values, - inputs_embeds, - attention_mask, - })); - } - - if (!encoder_outputs) { - // Must compute encoder outputs - let { last_hidden_state } = await encoderForward(this, { inputs_embeds, attention_mask }); - encoder_outputs = last_hidden_state; - } - - if (!decoder_inputs_embeds) { - if (!decoder_input_ids) { - throw new Error('Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.'); - } - decoder_inputs_embeds = await this.encode_text({ input_ids: decoder_input_ids }); - } - - const decoderFeeds = { - inputs_embeds: decoder_inputs_embeds, - attention_mask: decoder_attention_mask, - encoder_attention_mask: attention_mask, - encoder_hidden_states: encoder_outputs, - past_key_values, - }; - const decoder_outputs = await decoderForward(this, decoderFeeds, true); - return decoder_outputs; - } -} - -export class PaliGemmaPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - // 'inputs_embeds', - 'attention_mask', - 'pixel_values', - 'position_ids', - 'past_key_values', - ]; -} - -export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel { - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_index, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } -} - -export class LlavaQwen2ForCausalLM extends LlavaPreTrainedModel { - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_index, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } -} - -export class Gemma3nPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'attention_mask', - 'inputs_embeds', - 'per_layer_inputs', - - 'position_ids', - 'pixel_values', - 'input_features', - 'input_features_mask', - 'past_key_values', - ]; -} -export class Gemma3nForConditionalGeneration extends Gemma3nPreTrainedModel { - async forward({ - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - pixel_values = null, - input_features = null, - input_features_mask = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - per_layer_inputs = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // TODO: needed? - ...kwargs - }) { - if (!inputs_embeds || !per_layer_inputs) { - // 1. Extract the text embeddings. 
- ({ inputs_embeds, per_layer_inputs } = await sessionRun(this.sessions['embed_tokens'], { - input_ids, - })); - if (input_ids.dims[1] !== 1) { - if (pixel_values) { - // Encode the image - const { image_features } = await sessionRun(this.sessions['vision_encoder'], { - pixel_values, - }); - ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ - image_features, - inputs_embeds, - input_ids, - attention_mask, - })); - } - - if (input_features) { - // Encode the audio - const { audio_features } = await sessionRun(this.sessions['audio_encoder'], { - input_features, - input_features_mask, - }); - ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_audio_features({ - audio_features, - inputs_embeds, - input_ids, - attention_mask, - })); - } - } - } - - const outputs = await decoderForward( - this, - { - inputs_embeds, - per_layer_inputs, - past_key_values, - attention_mask, - position_ids, - generation_config, - logits_processor, - }, - true, - ); - return outputs; - } - - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_id, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } - _merge_input_ids_with_audio_features(kwargs) { - const audio_hidden_size = kwargs.audio_features.dims.at(-1); - const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); - - return default_merge_input_ids_with_audio_features({ - // @ts-ignore - audio_token_id: this.config.audio_token_id, - ...kwargs, - audio_features: reshaped_audio_features, - }); - } -} - -////////////////////////////////////////////////// -// Idefics3 Models -export class Idefics3PreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'attention_mask', - 'pixel_values', - 'pixel_attention_mask', - 'position_ids', - 'past_key_values', - ]; -} - -/** - * The Idefics3 model which consists of a vision backbone and a language model. - */ -export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel { - async encode_image({ pixel_values, pixel_attention_mask }) { - const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask })) - .image_features; - return features; - } - - _merge_input_ids_with_image_features(kwargs) { - const vision_hidden_size = kwargs.image_features.dims.at(-1); - const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); - - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_id, - ...kwargs, - image_features: reshaped_image_hidden_states, - }); - } -} -////////////////////////////////////////////////// - -/** - * The SmolVLM Model with a language modeling head. - * It is made up a SigLIP vision encoder, with a language modeling head on top. 
- */ -export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {} - -////////////////////////////////////////////////// -export class Phi3VPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'inputs_embeds', - 'attention_mask', - 'position_ids', - 'pixel_values', - 'image_sizes', - 'past_key_values', - ]; -} -export class Phi3VForCausalLM extends Phi3VPreTrainedModel { - async forward({ - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - pixel_values = null, - image_sizes = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // TODO: needed? - ...kwargs - }) { - if (!inputs_embeds) { - let image_features; - if (pixel_values && input_ids.dims[1] !== 1) { - if (!image_sizes) { - throw new Error('`image_sizes` must be provided when `pixel_values` is provided.'); - } - - // Encode the image - ({ image_features } = await sessionRun(this.sessions['vision_encoder'], { - pixel_values, - image_sizes, - })); - } else { - const hidden_size = this.config.normalized_config.hidden_size; - image_features = new Tensor('float32', [], [0, hidden_size]); - } - - ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], { - input_ids, - image_features, - })); - } - - const outputs = await decoderForward( - this, - { - inputs_embeds, - past_key_values, - attention_mask, - position_ids, - generation_config, - logits_processor, - }, - false, - ); - return outputs; - } -} - -////////////////////////////////////////////////// -export class CLIPPreTrainedModel extends PreTrainedModel {} - -/** - * CLIP Text and Vision Model with a projection layers on top - * - * **Example:** Perform zero-shot image classification with a `CLIPModel`. - * - * ```javascript - * import { AutoTokenizer, AutoProcessor, CLIPModel, RawImage } from '@huggingface/transformers'; - * - * // Load tokenizer, processor, and model - * let tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); - * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); - * let model = await CLIPModel.from_pretrained('Xenova/clip-vit-base-patch16'); - * - * // Run tokenization - * let texts = ['a photo of a car', 'a photo of a football match'] - * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Read image and run processor - * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * let image_inputs = await processor(image); - * - * // Run model with both text and pixel inputs - * let output = await model({ ...text_inputs, ...image_inputs }); - * // { - * // logits_per_image: Tensor { - * // dims: [ 1, 2 ], - * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], - * // }, - * // logits_per_text: Tensor { - * // dims: [ 2, 1 ], - * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], - * // }, - * // text_embeds: Tensor { - * // dims: [ 2, 512 ], - * // data: Float32Array(1024) [ ... ], - * // }, - * // image_embeds: Tensor { - * // dims: [ 1, 512 ], - * // data: Float32Array(512) [ ... ], - * // } - * // } - * ``` - */ -export class CLIPModel extends CLIPPreTrainedModel {} - -/** - * The text model from CLIP without any head or projection on top. 
- */ -export class CLIPTextModel extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output) - * - * **Example:** Compute text embeddings with `CLIPTextModelWithProjection`. - * - * ```javascript - * import { AutoTokenizer, CLIPTextModelWithProjection } from '@huggingface/transformers'; - * - * // Load tokenizer and text model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); - * const text_model = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); - * - * // Run tokenization - * let texts = ['a photo of a car', 'a photo of a football match']; - * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Compute embeddings - * const { text_embeds } = await text_model(text_inputs); - * // Tensor { - * // dims: [ 2, 512 ], - * // type: 'float32', - * // data: Float32Array(1024) [ ... ], - * // size: 1024 - * // } - * ``` - */ -export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * The vision model from CLIP without any head or projection on top. - */ -export class CLIPVisionModel extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} - -/** - * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output) - * - * **Example:** Compute vision embeddings with `CLIPVisionModelWithProjection`. - * - * ```javascript - * import { AutoProcessor, CLIPVisionModelWithProjection, RawImage} from '@huggingface/transformers'; - * - * // Load processor and vision model - * const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); - * const vision_model = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); - * - * // Read image and run processor - * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * let image_inputs = await processor(image); - * - * // Compute embeddings - * const { image_embeds } = await vision_model(image_inputs); - * // Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [ ... 
], - * // size: 512 - * // } - * ``` - */ -export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// SigLIP models -export class SiglipPreTrainedModel extends PreTrainedModel {} - -/** - * SigLIP Text and Vision Model with a projection layers on top - * - * **Example:** Perform zero-shot image classification with a `SiglipModel`. - * - * ```javascript - * import { AutoTokenizer, AutoProcessor, SiglipModel, RawImage } from '@huggingface/transformers'; - * - * // Load tokenizer, processor, and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); - * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); - * const model = await SiglipModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * - * // Run tokenization - * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; - * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); - * - * // Read image and run processor - * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg'); - * const image_inputs = await processor(image); - * - * // Run model with both text and pixel inputs - * const output = await model({ ...text_inputs, ...image_inputs }); - * // { - * // logits_per_image: Tensor { - * // dims: [ 1, 2 ], - * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], - * // }, - * // logits_per_text: Tensor { - * // dims: [ 2, 1 ], - * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], - * // }, - * // text_embeds: Tensor { - * // dims: [ 2, 768 ], - * // data: Float32Array(1536) [ ... ], - * // }, - * // image_embeds: Tensor { - * // dims: [ 1, 768 ], - * // data: Float32Array(768) [ ... ], - * // } - * // } - * ``` - */ -export class SiglipModel extends SiglipPreTrainedModel {} - -/** - * The text model from SigLIP without any head or projection on top. - * - * **Example:** Compute text embeddings with `SiglipTextModel`. - * - * ```javascript - * import { AutoTokenizer, SiglipTextModel } from '@huggingface/transformers'; - * - * // Load tokenizer and text model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); - * const text_model = await SiglipTextModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * - * // Run tokenization - * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; - * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); - * - * // Compute embeddings - * const { pooler_output } = await text_model(text_inputs); - * // Tensor { - * // dims: [ 2, 768 ], - * // type: 'float32', - * // data: Float32Array(1536) [ ... 
], - * // size: 1536 - * // } - * ``` - */ -export class SiglipTextModel extends SiglipPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * The vision model from SigLIP without any head or projection on top. - * - * **Example:** Compute vision embeddings with `SiglipVisionModel`. - * - * ```javascript - * import { AutoProcessor, SiglipVisionModel, RawImage} from '@huggingface/transformers'; - * - * // Load processor and vision model - * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); - * const vision_model = await SiglipVisionModel.from_pretrained('Xenova/siglip-base-patch16-224'); - * - * // Read image and run processor - * const image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); - * const image_inputs = await processor(image); - * - * // Compute embeddings - * const { pooler_output } = await vision_model(image_inputs); - * // Tensor { - * // dims: [ 1, 768 ], - * // type: 'float32', - * // data: Float32Array(768) [ ... ], - * // size: 768 - * // } - * ``` - */ -export class SiglipVisionModel extends CLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} -////////////////////////////////////////////////// -// ChineseCLIP models -export class ChineseCLIPPreTrainedModel extends PreTrainedModel {} - -export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// JinaCLIP models -export class JinaCLIPPreTrainedModel extends PreTrainedModel {} - -export class JinaCLIPModel extends JinaCLIPPreTrainedModel { - async forward(model_inputs) { - const missing_text_inputs = !model_inputs.input_ids; - const missing_image_inputs = !model_inputs.pixel_values; - - if (missing_text_inputs && missing_image_inputs) { - throw new Error('Either `input_ids` or `pixel_values` should be provided.'); - } - - // If either `input_ids` or `pixel_values` aren't passed, we need to create dummy input since the model requires a value to be specified. - if (missing_text_inputs) { - // NOTE: We cannot pass zero-dimension tensor as input for input_ids. - // Fortunately, the majority of time is spent in the vision encoder, so this shouldn't significantly impact performance. - model_inputs.input_ids = ones([model_inputs.pixel_values.dims[0], 1]); - } - - if (missing_image_inputs) { - // NOTE: Since we create a zero-sized tensor, this does not increase computation time. 
- // @ts-ignore - const { image_size } = this.config.vision_config; - model_inputs.pixel_values = full([0, 3, image_size, image_size], 0.0); // (pass zero-dimension tensor) - } - - const { text_embeddings, image_embeddings, l2norm_text_embeddings, l2norm_image_embeddings } = - await super.forward(model_inputs); - - const result = {}; - if (!missing_text_inputs) { - result.text_embeddings = text_embeddings; - result.l2norm_text_embeddings = l2norm_text_embeddings; - } - if (!missing_image_inputs) { - result.image_embeddings = image_embeddings; - result.l2norm_image_embeddings = l2norm_image_embeddings; - } - return result; - } -} - -export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'vision_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CLIPSeg models -export class CLIPSegPreTrainedModel extends PreTrainedModel {} - -export class CLIPSegModel extends CLIPSegPreTrainedModel {} - -/** - * CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation. - * - * **Example:** Perform zero-shot image segmentation with a `CLIPSegForImageSegmentation` model. - * - * ```javascript - * import { AutoTokenizer, AutoProcessor, CLIPSegForImageSegmentation, RawImage } from '@huggingface/transformers'; - * - * // Load tokenizer, processor, and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clipseg-rd64-refined'); - * const processor = await AutoProcessor.from_pretrained('Xenova/clipseg-rd64-refined'); - * const model = await CLIPSegForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined'); - * - * // Run tokenization - * const texts = ['a glass', 'something to fill', 'wood', 'a jar']; - * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Read image and run processor - * const image = await RawImage.read('https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true'); - * const image_inputs = await processor(image); - * - * // Run model with both text and pixel inputs - * const { logits } = await model({ ...text_inputs, ...image_inputs }); - * // logits: Tensor { - * // dims: [4, 352, 352], - * // type: 'float32', - * // data: Float32Array(495616) [ ... 
], - * // size: 495616 - * // } - * ``` - * - * You can visualize the predictions as follows: - * ```javascript - * const preds = logits - * .unsqueeze_(1) - * .sigmoid_() - * .mul_(255) - * .round_() - * .to('uint8'); - * - * for (let i = 0; i < preds.dims[0]; ++i) { - * const img = RawImage.fromTensor(preds[i]); - * img.save(`prediction_${i}.png`); - * } - * ``` - */ -export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPT2 models -export class GPT2PreTrainedModel extends PreTrainedModel {} - -export class GPT2Model extends GPT2PreTrainedModel {} - -/** - * GPT-2 language model head on top of the GPT-2 base model. This model is suitable for text generation tasks. - */ -export class GPT2LMHeadModel extends GPT2PreTrainedModel {} -// export class GPT2ForSequenceClassification extends GPT2PreTrainedModel { -// TODO -// } -////////////////////////////////////////////////// - - -////////////////////////////////////////////////// -// GPT OSS models -export class GptOssPreTrainedModel extends PreTrainedModel {} -export class GptOssModel extends GptOssPreTrainedModel {} -export class GptOssForCausalLM extends GptOssPreTrainedModel {} -////////////////////////////////////////////////// - - -////////////////////////////////////////////////// -// JAIS models -export class JAISPreTrainedModel extends PreTrainedModel {} - -/** - * The bare JAIS Model transformer outputting raw hidden-states without any specific head on top. - */ -export class JAISModel extends JAISPreTrainedModel {} - -/** - * The JAIS Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class JAISLMHeadModel extends JAISPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPTNeo models -export class GPTNeoPreTrainedModel extends PreTrainedModel {} -export class GPTNeoModel extends GPTNeoPreTrainedModel {} - -export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPTNeoX models -export class GPTNeoXPreTrainedModel extends PreTrainedModel {} -export class GPTNeoXModel extends GPTNeoXPreTrainedModel {} - -export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPT-J models -export class GPTJPreTrainedModel extends PreTrainedModel {} - -export class GPTJModel extends GPTJPreTrainedModel {} - -export class GPTJForCausalLM extends GPTJPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GPTBigCode models -export class GPTBigCodePreTrainedModel extends PreTrainedModel {} - -export class GPTBigCodeModel extends GPTBigCodePreTrainedModel {} - -export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CodeGen models -export class CodeGenPreTrainedModel extends PreTrainedModel {} -/** - * CodeGenModel is a class representing a code generation model without a language model head. 
- */
-export class CodeGenModel extends CodeGenPreTrainedModel {}
-
-/**
- * CodeGenForCausalLM is a class that represents a code generation model based on the GPT-2 architecture. It extends the `CodeGenPreTrainedModel` class.
- */
-export class CodeGenForCausalLM extends CodeGenPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// LLama models
-
-/**
- * The bare LLama Model outputting raw hidden-states without any specific head on top.
- */
-export class LlamaPreTrainedModel extends PreTrainedModel {}
-/**
- * The bare LLaMA Model outputting raw hidden-states without any specific head on top.
- */
-export class LlamaModel extends LlamaPreTrainedModel {}
-
-export class LlamaForCausalLM extends LlamaPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-export class Llama4PreTrainedModel extends PreTrainedModel {}
-export class Llama4ForCausalLM extends Llama4PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// NanoChat models
-export class NanoChatPreTrainedModel extends PreTrainedModel {}
-export class NanoChatModel extends NanoChatPreTrainedModel {}
-export class NanoChatForCausalLM extends NanoChatPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Apertus models
-export class ApertusPreTrainedModel extends PreTrainedModel {}
-export class ApertusModel extends ApertusPreTrainedModel {}
-export class ApertusForCausalLM extends ApertusPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Arcee models
-export class ArceePreTrainedModel extends PreTrainedModel {}
-export class ArceeModel extends ArceePreTrainedModel {}
-export class ArceeForCausalLM extends ArceePreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// LFM2 models
-export class Lfm2PreTrainedModel extends PreTrainedModel {}
-export class Lfm2Model extends Lfm2PreTrainedModel {}
-export class Lfm2ForCausalLM extends Lfm2PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// SmolLM3 models
-export class SmolLM3PreTrainedModel extends PreTrainedModel {}
-export class SmolLM3Model extends SmolLM3PreTrainedModel {}
-export class SmolLM3ForCausalLM extends SmolLM3PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Helium models
-export class HeliumPreTrainedModel extends PreTrainedModel {}
-export class HeliumModel extends HeliumPreTrainedModel {}
-export class HeliumForCausalLM extends HeliumPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Glm models
-export class GlmPreTrainedModel extends PreTrainedModel {}
-export class GlmModel extends GlmPreTrainedModel {}
-export class GlmForCausalLM extends GlmPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// EXAONE models
-export class ExaonePreTrainedModel extends PreTrainedModel {}
-export class ExaoneModel extends ExaonePreTrainedModel {}
-export class ExaoneForCausalLM extends ExaonePreTrainedModel {}
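The causal-LM wrappers in this hunk (CodeGen, Llama, Llama4, NanoChat, Apertus, Arcee, LFM2, SmolLM3, Helium, Glm, EXAONE) expose no extra public methods of their own; in practice they are reached through the `pipeline` helper or `AutoModelForCausalLM` rather than instantiated directly. A minimal usage sketch, assuming an ONNX-exported checkpoint is available on the Hub; the model id below is only illustrative:

```javascript
import { pipeline } from '@huggingface/transformers';

// Load any supported decoder-only architecture as a text-generation pipeline.
// Swap in any ONNX-exported checkpoint whose architecture maps to one of the classes above.
const generator = await pipeline('text-generation', 'Xenova/llama2.c-stories15M');

// Generate a continuation for a plain-text prompt.
const output = await generator('Once upon a time,', { max_new_tokens: 40 });
console.log(output[0].generated_text);
```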
-////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileLLM models -export class MobileLLMPreTrainedModel extends PreTrainedModel {} -export class MobileLLMModel extends MobileLLMPreTrainedModel {} -export class MobileLLMForCausalLM extends MobileLLMPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Olmo models -export class OlmoPreTrainedModel extends PreTrainedModel {} -export class OlmoModel extends OlmoPreTrainedModel {} -export class OlmoForCausalLM extends OlmoPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Olmo2 models -export class Olmo2PreTrainedModel extends PreTrainedModel {} -export class Olmo2Model extends Olmo2PreTrainedModel {} -export class Olmo2ForCausalLM extends Olmo2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Olmo3 models -export class Olmo3PreTrainedModel extends PreTrainedModel {} -export class Olmo3Model extends Olmo3PreTrainedModel {} -export class Olmo3ForCausalLM extends Olmo3PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Granite models -export class GranitePreTrainedModel extends PreTrainedModel {} -export class GraniteModel extends GranitePreTrainedModel {} -export class GraniteForCausalLM extends GranitePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// GraniteMoeHybrid models -export class GraniteMoeHybridPreTrainedModel extends PreTrainedModel {} -export class GraniteMoeHybridModel extends GraniteMoeHybridPreTrainedModel {} -export class GraniteMoeHybridForCausalLM extends GraniteMoeHybridPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Cohere models - -/** - * The bare Cohere Model outputting raw hidden-states without any specific head on top. - */ -export class CoherePreTrainedModel extends PreTrainedModel {} -export class CohereModel extends CoherePreTrainedModel {} - -export class CohereForCausalLM extends CoherePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Gemma models - -/** - * The bare Gemma Model outputting raw hidden-states without any specific head on top. - */ -export class GemmaPreTrainedModel extends PreTrainedModel {} -/** - * The bare Gemma Model outputting raw hidden-states without any specific head on top. - */ -export class GemmaModel extends GemmaPreTrainedModel {} - -export class GemmaForCausalLM extends GemmaPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Gemma2 models - -/** - * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. - */ -export class Gemma2PreTrainedModel extends PreTrainedModel {} -/** - * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. 
- */
-export class Gemma2Model extends Gemma2PreTrainedModel {}
-
-export class Gemma2ForCausalLM extends Gemma2PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// VaultGemma models
-export class VaultGemmaPreTrainedModel extends PreTrainedModel {}
-export class VaultGemmaModel extends VaultGemmaPreTrainedModel {}
-export class VaultGemmaForCausalLM extends VaultGemmaPreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Gemma3 models
-
-/**
- * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Gemma3PreTrainedModel extends PreTrainedModel {}
-/**
- * The bare Gemma3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Gemma3Model extends Gemma3PreTrainedModel {}
-
-export class Gemma3ForCausalLM extends Gemma3PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-export class OpenELMPreTrainedModel extends PreTrainedModel {}
-export class OpenELMModel extends OpenELMPreTrainedModel {}
-
-export class OpenELMForCausalLM extends OpenELMPreTrainedModel {}
-
-//////////////////////////////////////////////////
-// Qwen2 models
-
-/**
- * The bare Qwen2 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen2PreTrainedModel extends PreTrainedModel {}
-/**
- * The bare Qwen2 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen2Model extends Qwen2PreTrainedModel {}
-
-export class Qwen2ForCausalLM extends Qwen2PreTrainedModel {}
-//////////////////////////////////////////////////
-
-//////////////////////////////////////////////////
-// Qwen3 models
-
-/**
- * The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen3PreTrainedModel extends PreTrainedModel {}
-/**
- * The bare Qwen3 Model outputting raw hidden-states without any specific head on top.
- */
-export class Qwen3Model extends Qwen3PreTrainedModel {}
-
-export class Qwen3ForCausalLM extends Qwen3PreTrainedModel {}
-//////////////////////////////////////////////////
-
-export class Qwen2VLPreTrainedModel extends PreTrainedModel {
-    forward_params = [
-        // Text inputs
-        'input_ids',
-        'attention_mask',
-        'position_ids',
-        'past_key_values',
-
-        // Vision inputs
-        'pixel_values',
-        'image_grid_thw',
-    ];
-}
-export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
-    /**
-     * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
-     *
-     * Explanation:
-     * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
-     *
-     * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
-     * Examples:
-     * input_ids: [T T T T T], here T is for text.
-     * temporal position_ids: [0, 1, 2, 3, 4]
-     * height position_ids: [0, 1, 2, 3, 4]
-     * width position_ids: [0, 1, 2, 3, 4]
-     *
-     * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
-     * and 1D rotary position embedding for text part.
-     * Examples:
-     * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
-     * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] - * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] - * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] - * text temporal position_ids: [3, 4, 5, 6, 7] - * text height position_ids: [3, 4, 5, 6, 7] - * text width position_ids: [3, 4, 5, 6, 7] - * Here we calculate the text start position_ids as the max vision position_ids plus 1. - * - * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`. - * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`. - * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`. - * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`: - * - 1 for tokens that are **not masked**, - * - 0 for tokens that are **masked**. - * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with: - * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`. - * - mrope_position_deltas: Tensor of shape `(batch_size)`. - */ - get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) { - // @ts-ignore - const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config; - const spatial_merge_size = vision_config.spatial_merge_size ?? 2; - - const mrope_position_deltas = []; - if (image_grid_thw || video_grid_thw) { - let total_input_ids = input_ids.tolist(); - if (!attention_mask) { - attention_mask = ones_like(input_ids); - } - - const attention_mask_list = attention_mask.tolist(); - const position_ids_list = Array.from({ length: 3 }, (_) => - Array.from({ length: input_ids.dims[0] }, (_) => Array.from({ length: input_ids.dims[1] }, (_) => 1)), - ); - - const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : []; - const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : []; - - let image_index = 0; - let video_index = 0; - for (let i = 0; i < total_input_ids.length; ++i) { - const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1); - - const vision_start_indices = ids.reduce((acc, x, idx) => { - if (x == vision_start_token_id) acc.push(idx); - return acc; - }, []); - - const vision_tokens = vision_start_indices.map((x) => ids[x + 1]); - const image_nums = vision_tokens.filter((x) => x == image_token_id).length; - const video_nums = vision_tokens.filter((x) => x == video_token_id).length; - - /** @type {number[][]} */ - let llm_pos_ids_list = []; - let st = 0; - let remain_images = image_nums; - let remain_videos = video_nums; - for (let j = 0; j < vision_tokens.length; ++j) { - const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id); - const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id); - - const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1; - - const ed_video = remain_videos > 0 && next_video_token !== -1 ? 
next_video_token : ids.length + 1; - - let ed; - let t, h, w; - if (ed_image < ed_video) { - [t, h, w] = image_grid_thw_list[image_index]; - ++image_index; - --remain_images; - ed = ed_image; - } else { - [t, h, w] = video_grid_thw_list[video_index]; - ++video_index; - --remain_videos; - ed = ed_video; - } - - const [llm_grid_t, llm_grid_h, llm_grid_w] = [ - Number(t), - Math.floor(Number(h) / spatial_merge_size), - Math.floor(Number(w) / spatial_merge_size), - ]; - const text_len = ed - st; - const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0; - - llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); - - const offset = text_len + st_idx; - const grid_size = llm_grid_t * llm_grid_h * llm_grid_w; - const t_index = Array.from( - { length: grid_size }, - (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)), - ); - const h_index = Array.from( - { length: grid_size }, - (_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h), - ); - const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w)); - - llm_pos_ids_list.push([t_index, h_index, w_index].flat()); - - st = ed + grid_size; - } - - if (st < ids.length) { - const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0; - const text_len = ids.length - st; - - llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); - } - - // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len), - // meaning to perform concatenation along dim=1, we can do the following: - const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0); - /** @type {number[]} */ - const llm_positions = new Array(num_items); - let index = 0; - for (let x = 0; x < 3; ++x) { - for (let y = 0; y < llm_pos_ids_list.length; ++y) { - const val = llm_pos_ids_list[y]; - const text_len = val.length / 3; - for (let z = x * text_len; z < (x + 1) * text_len; ++z) { - llm_positions[index++] = val[z]; - } - } - } - - let count = 0; - const attn_mask = attention_mask_list[i]; - for (let y = 0; y < attn_mask.length; ++y) { - if (attn_mask[y] == 1) { - for (let x = 0; x < 3; ++x) { - position_ids_list[x][i][y] = llm_positions[(x * num_items) / 3 + count]; - } - ++count; - } - } - - const max_llm_positions = max(llm_positions)[0]; - mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length); - } - - return [ - new Tensor('int64', position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]), - new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), - ]; - } else { - // Text-only - if (attention_mask) { - const { data, dims } = cumsum_masked_fill(attention_mask); - - const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]); - /** @type {bigint[]} */ - const mrope_position_deltas = Array.from( - { length: dims[0] }, - (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]), - ); - - return [ - new Tensor('int64', position_ids, [3, ...dims]), - new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), - ]; - } else { - const [batch_size, seq_length] = input_ids.dims; - const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) => - BigInt(Math.floor((i % seq_length) / batch_size)), - ); - - return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])]; - } - } - } - - async encode_image({ 
pixel_values, image_grid_thw }) { - const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, grid_thw: image_grid_thw })) - .image_features; - return features; - } - - _merge_input_ids_with_image_features(kwargs) { - return default_merge_input_ids_with_image_features({ - // @ts-ignore - image_token_id: this.config.image_token_id, - ...kwargs, - }); - } - - prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { - // Overwritten -- in specific circumstances we don't want to forward image inputs to the model - if (model_inputs.attention_mask && !model_inputs.position_ids) { - // Calculate position_ids and rope_deltas - if (!model_inputs.past_key_values) { - [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index( - model_inputs.input_ids, - model_inputs.image_grid_thw, - model_inputs.video_grid_thw, - model_inputs.attention_mask, - ); - } else { - model_inputs.pixel_values = null; - // model_inputs.pixel_values_videos = null; - - const delta = BigInt(Object.values(model_inputs.past_key_values)[0].dims.at(-2)); - const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x); - model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0); - } - } - - return model_inputs; - } -} - -////////////////////////////////////////////////// -// Phi models -export class PhiPreTrainedModel extends PreTrainedModel {} -/** - * The bare Phi Model outputting raw hidden-states without any specific head on top. - */ -export class PhiModel extends PhiPreTrainedModel {} - -export class PhiForCausalLM extends PhiPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Phi3 models -export class Phi3PreTrainedModel extends PreTrainedModel {} - -/** - * The bare Phi3 Model outputting raw hidden-states without any specific head on top. - */ -export class Phi3Model extends Phi3PreTrainedModel {} - -export class Phi3ForCausalLM extends Phi3PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Bloom models -/** - * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class BloomPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Bloom Model transformer outputting raw hidden-states without any specific head on top. - */ -export class BloomModel extends BloomPreTrainedModel {} - -/** - * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class BloomForCausalLM extends BloomPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MPT models -export class MptPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Mpt Model transformer outputting raw hidden-states without any specific head on top. - */ -export class MptModel extends MptPreTrainedModel {} - -/** - * The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class MptForCausalLM extends MptPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// OPT models -export class OPTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare OPT Model outputting raw hidden-states without any specific head on top. 
- */ -export class OPTModel extends OPTPreTrainedModel {} - -/** - * The OPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). - */ -export class OPTForCausalLM extends OPTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ViTPreTrainedModel extends PreTrainedModel {} -export class ViTModel extends ViTPreTrainedModel {} -export class ViTForImageClassification extends ViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class IJepaPreTrainedModel extends PreTrainedModel {} -export class IJepaModel extends IJepaPreTrainedModel {} -export class IJepaForImageClassification extends IJepaPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class VitPosePreTrainedModel extends PreTrainedModel {} - -/** - * The VitPose model with a pose estimation head on top. - */ -export class VitPoseForPoseEstimation extends VitPosePreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class PvtPreTrainedModel extends PreTrainedModel {} -export class PvtModel extends PvtPreTrainedModel {} -export class PvtForImageClassification extends PvtPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ViTMAEPreTrainedModel extends PreTrainedModel {} -export class ViTMAEModel extends ViTMAEPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ViTMSNPreTrainedModel extends PreTrainedModel {} -export class ViTMSNModel extends ViTMSNPreTrainedModel {} -export class ViTMSNForImageClassification extends ViTMSNPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class GroupViTPreTrainedModel extends PreTrainedModel {} -export class GroupViTModel extends GroupViTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class FastViTPreTrainedModel extends PreTrainedModel {} -export class FastViTModel extends FastViTPreTrainedModel {} -export class FastViTForImageClassification extends FastViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class VitMattePreTrainedModel extends PreTrainedModel {} - -/** - * ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes. 
- * - * **Example:** Perform image matting with a `VitMatteForImageMatting` model. - * ```javascript - * import { AutoProcessor, VitMatteForImageMatting, RawImage } from '@huggingface/transformers'; - * - * // Load processor and model - * const processor = await AutoProcessor.from_pretrained('Xenova/vitmatte-small-distinctions-646'); - * const model = await VitMatteForImageMatting.from_pretrained('Xenova/vitmatte-small-distinctions-646'); - * - * // Load image and trimap - * const image = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_image.png'); - * const trimap = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_trimap.png'); - * - * // Prepare image + trimap for the model - * const inputs = await processor(image, trimap); - * - * // Predict alpha matte - * const { alphas } = await model(inputs); - * // Tensor { - * // dims: [ 1, 1, 640, 960 ], - * // type: 'float32', - * // size: 614400, - * // data: Float32Array(614400) [ 0.9894027709960938, 0.9970508813858032, ... ] - * // } - * ``` - * - * You can visualize the alpha matte as follows: - * ```javascript - * import { Tensor, cat } from '@huggingface/transformers'; - * - * // Visualize predicted alpha matte - * const imageTensor = image.toTensor(); - * - * // Convert float (0-1) alpha matte to uint8 (0-255) - * const alphaChannel = alphas - * .squeeze(0) - * .mul_(255) - * .clamp_(0, 255) - * .round_() - * .to('uint8'); - * - * // Concatenate original image with predicted alpha - * const imageData = cat([imageTensor, alphaChannel], 0); - * - * // Save output image - * const outputImage = RawImage.fromTensor(imageData); - * outputImage.save('output.png'); - * ``` - */ -export class VitMatteForImageMatting extends VitMattePreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new ImageMattingOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class MobileViTPreTrainedModel extends PreTrainedModel {} -export class MobileViTModel extends MobileViTPreTrainedModel {} -export class MobileViTForImageClassification extends MobileViTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -// TODO: MobileViTForSemanticSegmentation - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class MobileViTV2PreTrainedModel extends PreTrainedModel {} -export class MobileViTV2Model extends MobileViTV2PreTrainedModel {} -export class MobileViTV2ForImageClassification extends MobileViTV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -// TODO: MobileViTV2ForSemanticSegmentation - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class OwlViTPreTrainedModel extends PreTrainedModel {} -export class OwlViTModel extends OwlViTPreTrainedModel {} -export class OwlViTForObjectDetection extends OwlViTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Owlv2PreTrainedModel extends PreTrainedModel {} -export class Owlv2Model extends Owlv2PreTrainedModel {} -export 
class Owlv2ForObjectDetection extends Owlv2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Beit Models -export class BeitPreTrainedModel extends PreTrainedModel {} -export class BeitModel extends BeitPreTrainedModel {} -export class BeitForImageClassification extends BeitPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DetrPreTrainedModel extends PreTrainedModel {} -export class DetrModel extends DetrPreTrainedModel {} -export class DetrForObjectDetection extends DetrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new DetrObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class DetrForSegmentation extends DetrPreTrainedModel { - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new DetrSegmentationOutput(await super._call(model_inputs)); - } -} - -export class DetrObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } -} - -export class DetrSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. - * @param {Tensor} output.pred_boxes Predicted boxes. - * @param {Tensor} output.pred_masks Predicted masks. - */ - constructor({ logits, pred_boxes, pred_masks }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - this.pred_masks = pred_masks; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class RTDetrPreTrainedModel extends PreTrainedModel {} -export class RTDetrModel extends RTDetrPreTrainedModel {} -export class RTDetrForObjectDetection extends RTDetrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class RTDetrObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). 
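The raw `logits` and `pred_boxes` documented here still need thresholding, label mapping, and rescaling to pixel coordinates; the object-detection pipeline wraps that post-processing. A minimal sketch, assuming the `Xenova/detr-resnet-50` checkpoint:

```javascript
import { pipeline } from '@huggingface/transformers';

const detector = await pipeline('object-detection', 'Xenova/detr-resnet-50');

const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
const output = await detector(url, { threshold: 0.9 });
// e.g. [{ label: 'cat', score: 0.99, box: { xmin, ymin, xmax, ymax } }, ...] (illustrative values)
```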
- */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class RTDetrV2PreTrainedModel extends PreTrainedModel {} -export class RTDetrV2Model extends RTDetrV2PreTrainedModel {} -export class RTDetrV2ForObjectDetection extends RTDetrV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RTDetrV2ObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class RTDetrV2ObjectDetectionOutput extends RTDetrObjectDetectionOutput {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class RFDetrPreTrainedModel extends PreTrainedModel {} -export class RFDetrModel extends RFDetrPreTrainedModel {} -export class RFDetrForObjectDetection extends RFDetrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RFDetrObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class RFDetrObjectDetectionOutput extends RTDetrObjectDetectionOutput {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DFinePreTrainedModel extends PreTrainedModel {} -export class DFineModel extends DFinePreTrainedModel {} -export class DFineForObjectDetection extends DFinePreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class TableTransformerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) - * outputting raw hidden-states without any specific head on top. - */ -export class TableTransformerModel extends TableTransformerPreTrainedModel {} - -/** - * Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) - * with object detection heads on top, for tasks such as COCO detection. 
- */ -export class TableTransformerForObjectDetection extends TableTransformerPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new TableTransformerObjectDetectionOutput(await super._call(model_inputs)); - } -} -export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DeiTPreTrainedModel extends PreTrainedModel {} -export class DeiTModel extends DeiTPreTrainedModel {} -export class DeiTForImageClassification extends DeiTPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class HieraPreTrainedModel extends PreTrainedModel {} -export class HieraModel extends HieraPreTrainedModel {} -export class HieraForImageClassification extends HieraPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class ResNetPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ResNet model outputting raw features without any specific head on top. - */ -export class ResNetModel extends ResNetPreTrainedModel {} - -/** - * ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. - */ -export class ResNetForImageClassification extends ResNetPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class SwinPreTrainedModel extends PreTrainedModel {} -export class SwinModel extends SwinPreTrainedModel {} -export class SwinForImageClassification extends SwinPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class SwinForSemanticSegmentation extends SwinPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Swin2SRPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top. - */ -export class Swin2SRModel extends Swin2SRPreTrainedModel {} - -/** - * Swin2SR Model transformer with an upsampler head on top for image super resolution and restoration. - * - * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64`. 
- * - * ```javascript - * import { AutoProcessor, Swin2SRForImageSuperResolution, RawImage } from '@huggingface/transformers'; - * - * // Load processor and model - * const model_id = 'Xenova/swin2SR-classical-sr-x2-64'; - * const processor = await AutoProcessor.from_pretrained(model_id); - * const model = await Swin2SRForImageSuperResolution.from_pretrained(model_id); - * - * // Prepare model inputs - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/butterfly.jpg'; - * const image = await RawImage.fromURL(url); - * const inputs = await processor(image); - * - * // Run model - * const outputs = await model(inputs); - * - * // Convert Tensor to RawImage - * const output = outputs.reconstruction.squeeze().clamp_(0, 1).mul_(255).round_().to('uint8'); - * const outputImage = RawImage.fromTensor(output); - * // RawImage { - * // data: Uint8Array(786432) [ 41, 31, 24, ... ], - * // width: 512, - * // height: 512, - * // channels: 3 - * // } - * ``` - */ -export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DPTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare DPT Model transformer outputting raw hidden-states without any specific head on top. - */ -export class DPTModel extends DPTPreTrainedModel {} - -/** - * DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. - * - * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`. - * ```javascript - * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; - * - * // Load model and processor - * const model_id = 'Xenova/dpt-hybrid-midas'; - * const model = await DPTForDepthEstimation.from_pretrained(model_id); - * const processor = await AutoProcessor.from_pretrained(model_id); - * - * // Load image from URL - * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; - * const image = await RawImage.read(url); - * - * // Prepare image for the model - * const inputs = await processor(image); - * - * // Run model - * const { predicted_depth } = await model(inputs); - * - * // Interpolate to original size - * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { - * size: image.size.reverse(), - * mode: 'bilinear', - * })).squeeze(1); - * - * // Visualize the prediction - * const min = prediction.min().item(); - * const max = prediction.max().item(); - * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); - * const depth = RawImage.fromTensor(formatted); - * // RawImage { - * // data: Uint8Array(307200) [ 85, 85, 84, ... ], - * // width: 640, - * // height: 480, - * // channels: 1 - * // } - * ``` - */ -export class DPTForDepthEstimation extends DPTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DepthAnythingPreTrainedModel extends PreTrainedModel {} - -/** - * Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. 
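`DepthAnythingForDepthEstimation` follows the same calling convention as `DPTForDepthEstimation` above; a minimal sketch using the depth-estimation pipeline, which performs the interpolation and visualization steps from the DPT example internally. The `Xenova/depth-anything-small-hf` checkpoint id is an assumption.

```javascript
import { pipeline } from '@huggingface/transformers';

const depth_estimator = await pipeline('depth-estimation', 'Xenova/depth-anything-small-hf');

const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
const { predicted_depth, depth } = await depth_estimator(url);
// `predicted_depth` is the raw Tensor; `depth` is a RawImage visualization of it
```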
- */ -export class DepthAnythingForDepthEstimation extends DepthAnythingPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class SapiensPreTrainedModel extends PreTrainedModel {} -export class SapiensForSemanticSegmentation extends SapiensPreTrainedModel {} -export class SapiensForDepthEstimation extends SapiensPreTrainedModel {} -export class SapiensForNormalEstimation extends SapiensPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DepthProPreTrainedModel extends PreTrainedModel {} -export class DepthProForDepthEstimation extends DepthProPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Metric3DPreTrainedModel extends PreTrainedModel {} -export class Metric3DForDepthEstimation extends Metric3DPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Metric3Dv2PreTrainedModel extends PreTrainedModel {} -export class Metric3Dv2ForDepthEstimation extends Metric3Dv2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class MaskFormerPreTrainedModel extends PreTrainedModel {} -export class MaskFormerModel extends MaskFormerPreTrainedModel {} -export class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class GLPNPreTrainedModel extends PreTrainedModel {} - -/** - * The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. - */ -export class GLPNModel extends GLPNPreTrainedModel {} - -/** - * GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2. - * - * **Example:** Depth estimation w/ `Xenova/glpn-kitti`. - * ```javascript - * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; - * - * // Load model and processor - * const model_id = 'Xenova/glpn-kitti'; - * const model = await GLPNForDepthEstimation.from_pretrained(model_id); - * const processor = await AutoProcessor.from_pretrained(model_id); - * - * // Load image from URL - * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; - * const image = await RawImage.read(url); - * - * // Prepare image for the model - * const inputs = await processor(image); - * - * // Run model - * const { predicted_depth } = await model(inputs); - * - * // Interpolate to original size - * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { - * size: image.size.reverse(), - * mode: 'bilinear', - * })).squeeze(1); - * - * // Visualize the prediction - * const min = prediction.min().item(); - * const max = prediction.max().item(); - * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); - * const depth = RawImage.fromTensor(formatted); - * // RawImage { - * // data: Uint8Array(307200) [ 85, 85, 84, ... ], - * // width: 640, - * // height: 480, - * // channels: 1 - * // } - * ``` - */ -export class GLPNForDepthEstimation extends GLPNPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DonutSwinPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top. - * - * **Example:** Step-by-step Document Parsing.
- * - * ```javascript - * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; - * - * // Choose model to use - * const model_id = 'Xenova/donut-base-finetuned-cord-v2'; - * - * // Prepare image inputs - * const processor = await AutoProcessor.from_pretrained(model_id); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/receipt.png'; - * const image = await RawImage.read(url); - * const image_inputs = await processor(image); - * - * // Prepare decoder inputs - * const tokenizer = await AutoTokenizer.from_pretrained(model_id); - * const task_prompt = '<s_cord-v2>'; - * const decoder_input_ids = tokenizer(task_prompt, { - * add_special_tokens: false, - * }).input_ids; - * - * // Create the model - * const model = await AutoModelForVision2Seq.from_pretrained(model_id); - * - * // Run inference - * const output = await model.generate(image_inputs.pixel_values, { - * decoder_input_ids, - * max_length: model.config.decoder.max_position_embeddings, - * }); - * - * // Decode output - * const decoded = tokenizer.batch_decode(output)[0]; - * // CINNAMON SUGAR 17,000 1 x 17,000 17,000 17,000 20,000 3,000 - * ``` - * - * **Example:** Step-by-step Document Visual Question Answering (DocVQA) - * - * ```javascript - * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; - * - * // Choose model to use - * const model_id = 'Xenova/donut-base-finetuned-docvqa'; - * - * // Prepare image inputs - * const processor = await AutoProcessor.from_pretrained(model_id); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/invoice.png'; - * const image = await RawImage.read(url); - * const image_inputs = await processor(image); - * - * // Prepare decoder inputs - * const tokenizer = await AutoTokenizer.from_pretrained(model_id); - * const question = 'What is the invoice number?'; - * const task_prompt = `<s_docvqa><s_question>${question}</s_question><s_answer>`; - * const decoder_input_ids = tokenizer(task_prompt, { - * add_special_tokens: false, - * }).input_ids; - * - * // Create the model - * const model = await AutoModelForVision2Seq.from_pretrained(model_id); - * - * // Run inference - * const output = await model.generate(image_inputs.pixel_values, { - * decoder_input_ids, - * max_length: model.config.decoder.max_position_embeddings, - * }); - * - * // Decode output - * const decoded = tokenizer.batch_decode(output)[0]; - * // What is the invoice number? us-001 - * ``` - */ -export class DonutSwinModel extends DonutSwinPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ConvNextPreTrainedModel extends PreTrainedModel {} - -/** - * The bare ConvNext model outputting raw features without any specific head on top. - */ -export class ConvNextModel extends ConvNextPreTrainedModel {} - -/** - * ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet.
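All of the `*ForImageClassification` classes in this file share the same low-level calling convention: they return a `SequenceClassifierOutput` whose `logits` can be mapped to labels via the model config. A minimal sketch of that path (the checkpoint id is the same assumption as in the pipeline sketch above):

```javascript
import { AutoProcessor, AutoModelForImageClassification, RawImage } from '@huggingface/transformers';

const model_id = 'Xenova/vit-base-patch16-224'; // assumption: any image-classification checkpoint works the same way
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModelForImageClassification.from_pretrained(model_id);

const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
const inputs = await processor(image);

// SequenceClassifierOutput: logits of shape [batch_size, num_labels]
const { logits } = await model(inputs);

// Pick the highest-scoring class for the single image in the batch
const scores = logits.data;
let best = 0;
for (let i = 1; i < scores.length; ++i) if (scores[i] > scores[best]) best = i;
console.log(model.config.id2label[best]);
```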
- */ -export class ConvNextForImageClassification extends ConvNextPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class ConvNextV2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare ConvNextV2 model outputting raw features without any specific head on top. - */ -export class ConvNextV2Model extends ConvNextV2PreTrainedModel {} - -/** - * ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. - */ -export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Dinov2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top. - */ -export class Dinov2Model extends Dinov2PreTrainedModel {} - -/** - * Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. - */ -export class Dinov2ForImageClassification extends Dinov2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Dinov2WithRegistersPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top. - */ -export class Dinov2WithRegistersModel extends Dinov2WithRegistersPreTrainedModel {} - -/** - * Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. 
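Beyond classification, the bare DINOv2-family backbones defined above are commonly used as plain feature extractors. A sketch of pulling patch-level features out of the bare model; the `Xenova/dinov2-small` checkpoint id is an assumption (substitute any ONNX conversion of a DINOv2 checkpoint).

```javascript
import { AutoProcessor, AutoModel, RawImage } from '@huggingface/transformers';

const model_id = 'Xenova/dinov2-small'; // assumption: any DINOv2 ONNX conversion
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModel.from_pretrained(model_id);

const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
const inputs = await processor(image);

// Patch-level features: [batch_size, num_patches + 1, hidden_size], CLS token first
const { last_hidden_state } = await model(inputs);
console.log(last_hidden_state.dims);
```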
- */ -export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DINOv3ViTPreTrainedModel extends PreTrainedModel {} -export class DINOv3ViTModel extends DINOv3ViTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class DINOv3ConvNextPreTrainedModel extends PreTrainedModel {} -export class DINOv3ConvNextModel extends DINOv3ConvNextPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class GroundingDinoPreTrainedModel extends PreTrainedModel {} -export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel {} - -////////////////////////////////////////////////// -export class YolosPreTrainedModel extends PreTrainedModel {} -export class YolosModel extends YolosPreTrainedModel {} -export class YolosForObjectDetection extends YolosPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new YolosObjectDetectionOutput(await super._call(model_inputs)); - } -} - -export class YolosObjectDetectionOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification logits (including no-object) for all queries. - * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). - * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). - */ - constructor({ logits, pred_boxes }) { - super(); - this.logits = logits; - this.pred_boxes = pred_boxes; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class SamPreTrainedModel extends PreTrainedModel {} - -/** - * Segment Anything Model (SAM) for generating segmentation masks, given an input image - * and optional 2D location and bounding boxes. - * - * **Example:** Perform mask generation w/ `Xenova/sam-vit-base`. - * ```javascript - * import { SamModel, AutoProcessor, RawImage } from '@huggingface/transformers'; - * - * const model = await SamModel.from_pretrained('Xenova/sam-vit-base'); - * const processor = await AutoProcessor.from_pretrained('Xenova/sam-vit-base'); - * - * const img_url = 'https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png'; - * const raw_image = await RawImage.read(img_url); - * const input_points = [[[450, 600]]] // 2D localization of a window - * - * const inputs = await processor(raw_image, { input_points }); - * const outputs = await model(inputs); - * - * const masks = await processor.post_process_masks(outputs.pred_masks, inputs.original_sizes, inputs.reshaped_input_sizes); - * // [ - * // Tensor { - * // dims: [ 1, 3, 1764, 2646 ], - * // type: 'bool', - * // data: Uint8Array(14002632) [ ... 
], - * // size: 14002632 - * // } - * // ] - * const scores = outputs.iou_scores; - * // Tensor { - * // dims: [ 1, 1, 3 ], - * // type: 'float32', - * // data: Float32Array(3) [ - * // 0.8892380595207214, - * // 0.9311248064041138, - * // 0.983696699142456 - * // ], - * // size: 3 - * // } - * ``` - */ -export class SamModel extends SamPreTrainedModel { - /** - * Compute image embeddings and positional image embeddings, given the pixel values of an image. - * @param {Object} model_inputs Object containing the model inputs. - * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. - * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. - */ - async get_image_embeddings({ pixel_values }) { - // in: - // - pixel_values: tensor.float32[batch_size,3,1024,1024] - // - // out: - // - image_embeddings: tensor.float32[batch_size,256,64,64] - // - image_positional_embeddings: tensor.float32[batch_size,256,64,64] - return await encoderForward(this, { pixel_values }); - } - - /** - * @typedef {Object} SamModelInputs Object containing the model inputs. - * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. - * These can be obtained using a `SamProcessor`. - * @property {Tensor} [input_points] Input 2D spatial points with shape `(batch_size, num_points, 2)`. - * This is used by the prompt encoder to encode the prompt. - * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. - * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: - * - `1`: the point is a point that contains the object of interest - * - `0`: the point is a point that does not contain the object of interest - * - `-1`: the point corresponds to the background - * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder - * @property {Tensor} [input_boxes] Input bounding boxes with shape `(batch_size, num_boxes, 4)`. - * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. - * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. - */ - - /** - * @param {SamModelInputs} model_inputs Object containing the model inputs. - * @returns {Promise} The output of the model. 
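Because the image encoder is by far the most expensive part of SAM, `get_image_embeddings` (documented above) can be run once and its outputs passed back in through these inputs to decode any number of prompts. A sketch reusing the `Xenova/sam-vit-base` checkpoint from the example above; the second click coordinate is arbitrary and purely illustrative.

```javascript
import { SamModel, AutoProcessor, RawImage } from '@huggingface/transformers';

const model = await SamModel.from_pretrained('Xenova/sam-vit-base');
const processor = await AutoProcessor.from_pretrained('Xenova/sam-vit-base');

const image = await RawImage.read('https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png');

// Encode the image once (the expensive part)...
const image_inputs = await processor(image);
const embeddings = await model.get_image_embeddings(image_inputs);

// ...then decode masks for several point prompts without re-running the encoder
for (const input_points of [[[[450, 600]]], [[[800, 500]]]]) {
  const prompt_inputs = await processor(image, { input_points });
  const outputs = await model({ ...prompt_inputs, ...embeddings });
  // outputs.pred_masks: [1, 1, 3, 256, 256], outputs.iou_scores: [1, 1, 3]
}
```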
- */ - async forward(model_inputs) { - if (!model_inputs.image_embeddings || !model_inputs.image_positional_embeddings) { - // Compute the image embeddings if they are missing - model_inputs = { - ...model_inputs, - ...(await this.get_image_embeddings(model_inputs)), - }; - } else { - model_inputs = { ...model_inputs }; - } - - // Set default input labels if they are missing - model_inputs.input_labels ??= ones(model_inputs.input_points.dims.slice(0, -1)); - - const decoder_inputs = { - image_embeddings: model_inputs.image_embeddings, - image_positional_embeddings: model_inputs.image_positional_embeddings, - }; - if (model_inputs.input_points) { - decoder_inputs.input_points = model_inputs.input_points; - } - if (model_inputs.input_labels) { - decoder_inputs.input_labels = model_inputs.input_labels; - } - if (model_inputs.input_boxes) { - decoder_inputs.input_boxes = model_inputs.input_boxes; - } - - // Returns: - // - iou_scores: tensor.float32[batch_size,point_batch_size,3] - // - pred_masks: tensor.float32[batch_size,point_batch_size,3,256,256] - return await sessionRun(this.sessions['prompt_encoder_mask_decoder'], decoder_inputs); - } - - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new SamImageSegmentationOutput(await super._call(model_inputs)); - } -} - -/** - * Base class for Segment-Anything model's output. - */ -export class SamImageSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.iou_scores The output logits of the model. - * @param {Tensor} output.pred_masks Predicted boxes. - */ - constructor({ iou_scores, pred_masks }) { - super(); - this.iou_scores = iou_scores; - this.pred_masks = pred_masks; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Sam2ImageSegmentationOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.iou_scores The output logits of the model. - * @param {Tensor} output.pred_masks Predicted boxes. - * @param {Tensor} output.object_score_logits Logits for the object score, indicating if an object is present. - */ - constructor({ iou_scores, pred_masks, object_score_logits }) { - super(); - this.iou_scores = iou_scores; - this.pred_masks = pred_masks; - this.object_score_logits = object_score_logits; - } -} - -export class Sam2PreTrainedModel extends PreTrainedModel {} -export class Sam2Model extends Sam2PreTrainedModel { - /** - * Compute image embeddings and positional image embeddings, given the pixel values of an image. - * @param {Object} model_inputs Object containing the model inputs. - * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `Sam2Processor`. - * @returns {Promise>} The image embeddings. 
- */ - async get_image_embeddings({ pixel_values }) { - // in: - // - pixel_values: tensor.float32[batch_size,3,1024,1024] - // - // out: - // - image_embeddings.0: tensor.float32[batch_size,32,256,256] - // - image_embeddings.1: tensor.float32[batch_size,64,128,128] - // - image_embeddings.2: tensor.float32[batch_size,256,64,64] - return await encoderForward(this, { pixel_values }); - } - - async forward(model_inputs) { - // @ts-expect-error ts(2339) - const { num_feature_levels } = this.config.vision_config; - const image_embeddings_name = Array.from({ length: num_feature_levels }, (_, i) => `image_embeddings.${i}`); - - if (image_embeddings_name.some((name) => !model_inputs[name])) { - // Compute the image embeddings if they are missing - model_inputs = { - ...model_inputs, - ...(await this.get_image_embeddings(model_inputs)), - }; - } else { - model_inputs = { ...model_inputs }; - } - - if (model_inputs.input_points) { - if (model_inputs.input_boxes && model_inputs.input_boxes.dims[1] !== 1) { - throw new Error( - 'When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.', - ); - } - const shape = model_inputs.input_points.dims; - model_inputs.input_labels ??= ones(shape.slice(0, -1)); - model_inputs.input_boxes ??= full([shape[0], 0, 4], 0.0); - } else if (model_inputs.input_boxes) { - // only boxes - const shape = model_inputs.input_boxes.dims; - model_inputs.input_labels = full([shape[0], shape[1], 0], -1n); - model_inputs.input_points = full([shape[0], 1, 0, 2], 0.0); - } else { - throw new Error('At least one of `input_points` or `input_boxes` must be provided.'); - } - - const prompt_encoder_mask_decoder_session = this.sessions['prompt_encoder_mask_decoder']; - const decoder_inputs = pick(model_inputs, prompt_encoder_mask_decoder_session.inputNames); - - // Returns: - // - iou_scores: tensor.float32[batch_size,num_boxes_or_points,3] - // - pred_masks: tensor.float32[batch_size,num_boxes_or_points,3,256,256] - // - object_score_logits: tensor.float32[batch_size,num_boxes_or_points,1] - return await sessionRun(prompt_encoder_mask_decoder_session, decoder_inputs); - } - - /** - * Runs the model with the provided inputs - * @param {Object} model_inputs Model inputs - * @returns {Promise} Object containing segmentation outputs - */ - async _call(model_inputs) { - return new Sam2ImageSegmentationOutput(await super._call(model_inputs)); - } -} -export class EdgeTamModel extends Sam2Model {} // NOTE: extends Sam2Model -export class Sam3TrackerModel extends Sam2Model {} // NOTE: extends Sam2Model -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MarianMT models -export class MarianPreTrainedModel extends PreTrainedModel {} - -export class MarianModel extends MarianPreTrainedModel {} - -export class MarianMTModel extends MarianPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// M2M100 models -export class M2M100PreTrainedModel extends PreTrainedModel {} - -export class M2M100Model extends M2M100PreTrainedModel {} - -export class M2M100ForConditionalGeneration extends M2M100PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Wav2Vec2 models -export class Wav2Vec2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top. 
- * - * **Example:** Load and run a `Wav2Vec2Model` for feature extraction. - * - * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/mms-300m'); - * const audio = await read_audio('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac', 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/mms-300m'); - * const output = await model(inputs); - * // { - * // last_hidden_state: Tensor { - * // dims: [ 1, 1144, 1024 ], - * // type: 'float32', - * // data: Float32Array(1171456) [ ... ], - * // size: 1171456 - * // } - * // } - * ``` - */ -export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel {} - -export class Wav2Vec2ForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization. - */ -export class Wav2Vec2ForAudioFrameClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Parakeet models -export class ParakeetPreTrainedModel extends PreTrainedModel {} -export class ParakeetForCTC extends ParakeetPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// PyAnnote models -export class PyAnnotePreTrainedModel extends PreTrainedModel {} - -/** - * The bare PyAnnote Model transformer outputting raw hidden-states without any specific head on top. - */ -export class PyAnnoteModel extends PyAnnotePreTrainedModel {} - -/** - * PyAnnote Model with a frame classification head on top for tasks like Speaker Diarization. - * - * **Example:** Load and run a `PyAnnoteForAudioFrameClassification` for speaker diarization. 
- * - * ```javascript - * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; - * - * // Load model and processor - * const model_id = 'onnx-community/pyannote-segmentation-3.0'; - * const model = await AutoModelForAudioFrameClassification.from_pretrained(model_id); - * const processor = await AutoProcessor.from_pretrained(model_id); - * - * // Read and preprocess audio - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.wav'; - * const audio = await read_audio(url, processor.feature_extractor.config.sampling_rate); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const { logits } = await model(inputs); - * // { - * // logits: Tensor { - * // dims: [ 1, 767, 7 ], // [batch_size, num_frames, num_classes] - * // type: 'float32', - * // data: Float32Array(5369) [ ... ], - * // size: 5369 - * // } - * // } - * - * const result = processor.post_process_speaker_diarization(logits, audio.length); - * // [ - * // [ - * // { id: 0, start: 0, end: 1.0512535626298245, confidence: 0.8220156481664611 }, - * // { id: 2, start: 1.0512535626298245, end: 2.3398869619825127, confidence: 0.9008811707860472 }, - * // ... - * // ] - * // ] - * - * // Display result - * console.table(result[0], ['start', 'end', 'id', 'confidence']); - * // ┌─────────┬────────────────────┬────────────────────┬────┬─────────────────────┐ - * // │ (index) │ start │ end │ id │ confidence │ - * // ├─────────┼────────────────────┼────────────────────┼────┼─────────────────────┤ - * // │ 0 │ 0 │ 1.0512535626298245 │ 0 │ 0.8220156481664611 │ - * // │ 1 │ 1.0512535626298245 │ 2.3398869619825127 │ 2 │ 0.9008811707860472 │ - * // │ 2 │ 2.3398869619825127 │ 3.5946089560890773 │ 0 │ 0.7521651315796233 │ - * // │ 3 │ 3.5946089560890773 │ 4.578039708226655 │ 2 │ 0.8491978128022479 │ - * // │ 4 │ 4.578039708226655 │ 4.594995410849717 │ 0 │ 0.2935352600416393 │ - * // │ 5 │ 4.594995410849717 │ 6.121008646925269 │ 3 │ 0.6788051309866024 │ - * // │ 6 │ 6.121008646925269 │ 6.256654267909762 │ 0 │ 0.37125512393851134 │ - * // │ 7 │ 6.256654267909762 │ 8.630452635138397 │ 2 │ 0.7467035186353542 │ - * // │ 8 │ 8.630452635138397 │ 10.088643060721703 │ 0 │ 0.7689364814666032 │ - * // │ 9 │ 10.088643060721703 │ 12.58113134631177 │ 2 │ 0.9123324509131324 │ - * // │ 10 │ 12.58113134631177 │ 13.005023911888312 │ 0 │ 0.4828358177572041 │ - * // └─────────┴────────────────────┴────────────────────┴────┴─────────────────────┘ - * ``` - */ -export class PyAnnoteForAudioFrameClassification extends PyAnnotePreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// WeSpeakerResNet models -export class WeSpeakerResNetPreTrainedModel extends PreTrainedModel {} -export class WeSpeakerResNetModel extends WeSpeakerResNetPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// UniSpeech models -export class UniSpeechPreTrainedModel extends PreTrainedModel {} - -/** - * The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top. 
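CTC heads such as `UniSpeechForCTC` below (and `Wav2Vec2ForCTC` / `HubertForCTC` elsewhere in this file) are normally driven through the automatic-speech-recognition pipeline, which handles feature extraction and greedy CTC decoding of the per-frame logits. A minimal sketch; the `Xenova/wav2vec2-base-960h` checkpoint id is an assumption.

```javascript
import { pipeline } from '@huggingface/transformers';

const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/wav2vec2-base-960h');

const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
const output = await transcriber(url);
// e.g. { text: 'AND SO MY FELLOW AMERICANS ...' } (illustrative)
```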
- */ -export class UniSpeechModel extends UniSpeechPreTrainedModel {} - -/** - * UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class UniSpeechForCTC extends UniSpeechPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class UniSpeechForSequenceClassification extends UniSpeechPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// UniSpeechSat models -export class UniSpeechSatPreTrainedModel extends PreTrainedModel {} - -/** - * The bare UniSpeechSat Model transformer outputting raw hidden-states without any specific head on top. - */ -export class UniSpeechSatModel extends UniSpeechSatPreTrainedModel {} - -/** - * UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class UniSpeechSatForCTC extends UniSpeechSatPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * UniSpeechSat Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class UniSpeechSatForSequenceClassification extends UniSpeechSatPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * UniSpeechSat Model with a frame classification head on top for tasks like Speaker Diarization. - */ -export class UniSpeechSatForAudioFrameClassification extends UniSpeechSatPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Wav2Vec2Bert models -export class Wav2Vec2BertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Wav2Vec2Bert Model transformer outputting raw hidden-states without any specific head on top. 
- */ -export class Wav2Vec2BertModel extends Wav2Vec2BertPreTrainedModel {} - -/** - * Wav2Vec2Bert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class Wav2Vec2BertForCTC extends Wav2Vec2BertPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_features Float values of input mel-spectrogram. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * Wav2Vec2Bert Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class Wav2Vec2BertForSequenceClassification extends Wav2Vec2BertPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Hubert models -export class HubertPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Hubert Model transformer outputting raw hidden-states without any specific head on top. - * - * **Example:** Load and run a `HubertModel` for feature extraction. - * - * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/hubert-base-ls960'); - * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); - * const inputs = await processor(audio); - * - * // Load and run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/hubert-base-ls960'); - * const output = await model(inputs); - * // { - * // last_hidden_state: Tensor { - * // dims: [ 1, 549, 768 ], - * // type: 'float32', - * // data: Float32Array(421632) [0.0682469978928566, 0.08104046434164047, -0.4975186586380005, ...], - * // size: 421632 - * // } - * // } - * ``` - */ -export class HubertModel extends Wav2Vec2PreTrainedModel {} - -/** - * Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class HubertForCTC extends Wav2Vec2PreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB Keyword Spotting. - */ -export class HubertForSequenceClassification extends Wav2Vec2PreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. 
- */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// WavLM models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. - */ -export class WavLMPreTrainedModel extends PreTrainedModel {} - -/** - * The bare WavLM Model transformer outputting raw hidden-states without any specific head on top. - * - * **Example:** Load and run a `WavLMModel` for feature extraction. - * - * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base'); - * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/wavlm-base'); - * const output = await model(inputs); - * // { - * // last_hidden_state: Tensor { - * // dims: [ 1, 549, 768 ], - * // type: 'float32', - * // data: Float32Array(421632) [-0.349443256855011, -0.39341306686401367, 0.022836603224277496, ...], - * // size: 421632 - * // } - * // } - * ``` - */ -export class WavLMModel extends WavLMPreTrainedModel {} - -/** - * WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). - */ -export class WavLMForCTC extends WavLMPreTrainedModel { - /** - * @param {Object} model_inputs - * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. - * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] - */ - async _call(model_inputs) { - return new CausalLMOutput(await super._call(model_inputs)); - } -} - -/** - * WavLM Model with a sequence classification head on top (a linear layer over the pooled output). - */ -export class WavLMForSequenceClassification extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -/** - * WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification. - * - * **Example:** Extract speaker embeddings with `WavLMForXVector`. 
- * ```javascript - * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sv'); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; - * const audio = await read_audio(url, 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModel.from_pretrained('Xenova/wavlm-base-plus-sv'); - * const outputs = await model(inputs); - * // { - * // logits: Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [0.5847219228744507, ...], - * // size: 512 - * // }, - * // embeddings: Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [-0.09079201519489288, ...], - * // size: 512 - * // } - * // } - * ``` - */ -export class WavLMForXVector extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits and speaker embeddings. - */ - async _call(model_inputs) { - return new XVectorOutput(await super._call(model_inputs)); - } -} - -/** - * WavLM Model with a frame classification head on top for tasks like Speaker Diarization. - * - * **Example:** Perform speaker diarization with `WavLMForAudioFrameClassification`. - * ```javascript - * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; - * - * // Read and preprocess audio - * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sd'); - * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; - * const audio = await read_audio(url, 16000); - * const inputs = await processor(audio); - * - * // Run model with inputs - * const model = await AutoModelForAudioFrameClassification.from_pretrained('Xenova/wavlm-base-plus-sd'); - * const { logits } = await model(inputs); - * // { - * // logits: Tensor { - * // dims: [ 1, 549, 2 ], // [batch_size, num_frames, num_speakers] - * // type: 'float32', - * // data: Float32Array(1098) [-3.5301010608673096, ...], - * // size: 1098 - * // } - * // } - * - * const labels = logits[0].sigmoid().tolist().map( - * frames => frames.map(speaker => speaker > 0.5 ? 1 : 0) - * ); - * console.log(labels); // labels is a one-hot array of shape (num_frames, num_speakers) - * // [ - * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], - * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], - * // [0, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], - * // ... - * // ] - * ``` - */ -export class WavLMForAudioFrameClassification extends WavLMPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} An object containing the model's output logits for sequence classification. - */ - async _call(model_inputs) { - return new TokenClassifierOutput(await super._call(model_inputs)); - } -} - -export class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel {} -export class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel {} - -////////////////////////////////////////////////// -// SpeechT5 models -/** - * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
- */ -export class SpeechT5PreTrainedModel extends PreTrainedModel {} - -/** - * The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets. - */ -export class SpeechT5Model extends SpeechT5PreTrainedModel {} - -/** - * SpeechT5 Model with a speech encoder and a text decoder. - * - * **Example:** Generate speech from text with `SpeechT5ForTextToSpeech`. - * ```javascript - * import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@huggingface/transformers'; - * - * // Load the tokenizer and processor - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts'); - * const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts'); - * - * // Load the models - * // NOTE: We use the full-precision versions as they are more accurate - * const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { dtype: 'fp32' }); - * const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { dtype: 'fp32' }); - * - * // Load speaker embeddings from URL - * const speaker_embeddings_data = new Float32Array( - * await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer() - * ); - * const speaker_embeddings = new Tensor( - * 'float32', - * speaker_embeddings_data, - * [1, speaker_embeddings_data.length] - * ) - * - * // Run tokenization - * const { input_ids } = tokenizer('Hello, my dog is cute'); - * - * // Generate waveform - * const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder }); - * console.log(waveform) - * // Tensor { - * // dims: [ 26112 ], - * // type: 'float32', - * // size: 26112, - * // data: Float32Array(26112) [ -0.00043630177970044315, -0.00018082228780258447, ... ], - * // } - * ``` - */ -export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel {} - -/** - * SpeechT5 Model with a text encoder and a speech decoder. - */ -export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { - /** - * @typedef {Object} SpeechOutput - * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape - * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided. - * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. - * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape - * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. Returned when `output_cross_attentions` is `true`. - */ - - /** - * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. - * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. - * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. - * @param {Object} options Optional parameters for generating speech. - * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. - * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. - * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence.
- * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. - * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. - * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. - */ - async generate_speech( - input_values, - speaker_embeddings, - { - threshold = 0.5, - minlenratio = 0.0, - maxlenratio = 20.0, - vocoder = null, - // output_cross_attentions = false, // TODO add - } = {}, - ) { - const model_inputs = { - input_ids: input_values, - }; - - const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs); - - // @ts-expect-error TS2339 - const r = encoder_outputs.dims[1] / this.config.reduction_factor; - const maxlen = Math.floor(r * maxlenratio); - const minlen = Math.floor(r * minlenratio); - - // @ts-expect-error TS2339 - const num_mel_bins = this.config.num_mel_bins; - - let spectrogramParts = []; - let past_key_values = null; - let decoder_outputs = null; - let idx = 0; - - while (true) { - ++idx; - - const use_cache_branch = boolTensor(!!decoder_outputs); - let output_sequence; - if (decoder_outputs) { - output_sequence = decoder_outputs.output_sequence_out; - } else { - output_sequence = new Tensor('float32', new Float32Array(num_mel_bins), [1, 1, num_mel_bins]); - } - let decoderFeeds = { - use_cache_branch, - output_sequence, - encoder_attention_mask: encoder_attention_mask, - speaker_embeddings: speaker_embeddings, - encoder_hidden_states: encoder_outputs, - }; - - this.addPastKeyValues(decoderFeeds, past_key_values); - decoder_outputs = await sessionRun(this.sessions['decoder_model_merged'], decoderFeeds); - past_key_values = this.getPastKeyValues(decoder_outputs, past_key_values); - - const { prob, spectrum } = decoder_outputs; - spectrogramParts.push(spectrum); - - if ( - idx >= minlen && - // Finished when stop token or maximum length is reached. - (Array.from(prob.data).filter((p) => p >= threshold).length > 0 || idx >= maxlen) - ) { - break; - } - } - - const spectrogram = cat(spectrogramParts); - const { waveform } = await sessionRun(vocoder.sessions['model'], { spectrogram }); - - return { - spectrogram, - waveform, - // cross_attentions: null, // TODO add - }; - } -} - -/** - * HiFi-GAN vocoder. - * - * See [SpeechT5ForSpeechToText](./models#module_models.SpeechT5ForSpeechToText) for example usage. - */ -export class SpeechT5HifiGan extends PreTrainedModel { - main_input_name = 'spectrogram'; -} -////////////////////////////////////////////////// - -export class SupertonicPreTrainedModel extends PreTrainedModel {} -export class SupertonicForConditionalGeneration extends SupertonicPreTrainedModel { - async generate_speech({ - // Required inputs - input_ids, - attention_mask, - style, - - // Optional inputs - num_inference_steps = 5, - speed = 1.05, - }) { - // @ts-expect-error TS2339 - const { sampling_rate, chunk_compress_factor, base_chunk_size, latent_dim } = this.config; - - // 1. Text Encoder - const { last_hidden_state, durations } = await sessionRun(this.sessions['text_encoder'], { - input_ids, - attention_mask, - style, - }); - durations.div_(speed); // Apply speed factor to duration - - // 2. 
Latent Denoiser - const wav_len_max = durations.max().item() * sampling_rate; - const chunk_size = base_chunk_size * chunk_compress_factor; - const latent_len = Math.floor((wav_len_max + chunk_size - 1) / chunk_size); - const batch_size = input_ids.dims[0]; - const latent_mask = ones([batch_size, latent_len]); - const num_steps = full([batch_size], num_inference_steps); - - let noisy_latents = randn([batch_size, latent_dim * chunk_compress_factor, latent_len]); - for (let step = 0; step < num_inference_steps; ++step) { - const timestep = full([batch_size], step); - ({ denoised_latents: noisy_latents } = await sessionRun(this.sessions['latent_denoiser'], { - style, - noisy_latents, - latent_mask, - encoder_outputs: last_hidden_state, - attention_mask, - timestep, - num_inference_steps: num_steps, - })); - } - - // 3. Voice Decoder - const { waveform } = await sessionRun(this.sessions['voice_decoder'], { - latents: noisy_latents, - }); - return { - waveform, - durations, - }; - } -} - -////////////////////////////////////////////////// -// TrOCR models -export class TrOCRPreTrainedModel extends PreTrainedModel {} - -/** - * The TrOCR Decoder with a language modeling head. - */ -export class TrOCRForCausalLM extends TrOCRPreTrainedModel {} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Mistral models -/** - * The bare Mistral Model outputting raw hidden-states without any specific head on top. - */ -export class MistralPreTrainedModel extends PreTrainedModel {} - -export class MistralModel extends MistralPreTrainedModel {} - -export class MistralForCausalLM extends MistralPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// ERNIE-4.5 models -export class Ernie4_5_PretrainedModel extends PreTrainedModel {} - -export class Ernie4_5_Model extends Ernie4_5_PretrainedModel {} - -export class Ernie4_5_ForCausalLM extends Ernie4_5_PretrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Starcoder2 models -/** - * The bare Starcoder2 Model outputting raw hidden-states without any specific head on top. - */ -export class Starcoder2PreTrainedModel extends PreTrainedModel {} - -export class Starcoder2Model extends Starcoder2PreTrainedModel {} - -export class Starcoder2ForCausalLM extends Starcoder2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Falcon models -/** - * The bare Falcon Model outputting raw hidden-states without any specific head on top. - */ -export class FalconPreTrainedModel extends PreTrainedModel {} - -export class FalconModel extends FalconPreTrainedModel {} - -export class FalconForCausalLM extends FalconPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// CLAP models -export class ClapPreTrainedModel extends PreTrainedModel {} - -export class ClapModel extends ClapPreTrainedModel {} - -/** - * CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). - * - * **Example:** Compute text embeddings with `ClapTextModelWithProjection`. 
- * - * ```javascript - * import { AutoTokenizer, ClapTextModelWithProjection } from '@huggingface/transformers'; - * - * // Load tokenizer and text model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused'); - * const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); - * - * // Run tokenization - * const texts = ['a sound of a cat', 'a sound of a dog']; - * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); - * - * // Compute embeddings - * const { text_embeds } = await text_model(text_inputs); - * // Tensor { - * // dims: [ 2, 512 ], - * // type: 'float32', - * // data: Float32Array(1024) [ ... ], - * // size: 1024 - * // } - * ``` - */ -export class ClapTextModelWithProjection extends ClapPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'text_model', - }); - } -} - -/** - * CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). - * - * **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`. - * - * ```javascript - * import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@huggingface/transformers'; - * - * // Load processor and audio model - * const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused'); - * const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); - * - * // Read audio and run processor - * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav'); - * const audio_inputs = await processor(audio); - * - * // Compute embeddings - * const { audio_embeds } = await audio_model(audio_inputs); - * // Tensor { - * // dims: [ 1, 512 ], - * // type: 'float32', - * // data: Float32Array(512) [ ... ], - * // size: 512 - * // } - * ``` - */ -export class ClapAudioModelWithProjection extends ClapPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'audio_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// VITS models -export class VitsPreTrainedModel extends PreTrainedModel {} - -/** - * The complete VITS model, for text-to-speech synthesis. - * - * **Example:** Generate speech from text with `VitsModel`. - * ```javascript - * import { AutoTokenizer, VitsModel } from '@huggingface/transformers'; - * - * // Load the tokenizer and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/mms-tts-eng'); - * const model = await VitsModel.from_pretrained('Xenova/mms-tts-eng'); - * - * // Run tokenization - * const inputs = tokenizer('I love transformers'); - * - * // Generate waveform - * const { waveform } = await model(inputs); - * // Tensor { - * // dims: [ 1, 35328 ], - * // type: 'float32', - * // data: Float32Array(35328) [ ... 
], - * // size: 35328, - * // } - * ``` - */ -export class VitsModel extends VitsPreTrainedModel { - /** - * Calls the model on new inputs. - * @param {Object} model_inputs The inputs to the model. - * @returns {Promise} The outputs for the VITS model. - */ - async _call(model_inputs) { - return new VitsModelOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Segformer models -export class SegformerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. - */ -export class SegformerModel extends SegformerPreTrainedModel {} - -/** - * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet. - */ -export class SegformerForImageClassification extends SegformerPreTrainedModel {} - -/** - * SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes. - */ -export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel {} - -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// StableLm models -export class StableLmPreTrainedModel extends PreTrainedModel {} - -/** - * The bare StableLm Model transformer outputting raw hidden-states without any specific head on top. - */ -export class StableLmModel extends StableLmPreTrainedModel {} - -/** - * StableLm Model with a `language modeling` head on top for Causal Language Modeling (with past). - */ -export class StableLmForCausalLM extends StableLmPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class EfficientNetPreTrainedModel extends PreTrainedModel {} - -/** - * The bare EfficientNet model outputting raw features without any specific head on top. - */ -export class EfficientNetModel extends EfficientNetPreTrainedModel {} - -/** - * EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features). - */ -export class EfficientNetForImageClassification extends EfficientNetPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Musicgen models -export class MusicgenPreTrainedModel extends PreTrainedModel {} - -/** - * The bare Musicgen decoder model outputting raw hidden-states without any specific head on top. - */ -export class MusicgenModel extends MusicgenPreTrainedModel {} - -/** - * The MusicGen decoder model with a language modelling head on top. - */ -export class MusicgenForCausalLM extends MusicgenPreTrainedModel {} - -/** - * The composite MusicGen model with a text encoder, audio encoder and Musicgen decoder, - * for music generation tasks with one or both of text and audio prompts. - * - * **Example:** Generate music from text with `Xenova/musicgen-small`. 
- * ```javascript - * import { AutoTokenizer, MusicgenForConditionalGeneration } from '@huggingface/transformers'; - * - * // Load tokenizer and model - * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/musicgen-small'); - * const model = await MusicgenForConditionalGeneration.from_pretrained( - * 'Xenova/musicgen-small', { dtype: 'fp32' } - * ); - * - * // Prepare text input - * const prompt = '80s pop track with bassy drums and synth'; - * const inputs = tokenizer(prompt); - * - * // Generate audio - * const audio_values = await model.generate({ - * ...inputs, - * max_new_tokens: 512, - * do_sample: true, - * guidance_scale: 3, - * }); - * - * // (Optional) Write the output to a WAV file - * import wavefile from 'wavefile'; - * import fs from 'fs'; - * - * const wav = new wavefile.WaveFile(); - * wav.fromScratch(1, model.config.audio_encoder.sampling_rate, '32f', audio_values.data); - * fs.writeFileSync('musicgen_out.wav', wav.toBuffer()); - * ``` - */ -export class MusicgenForConditionalGeneration extends PreTrainedModel { - // NOTE: not MusicgenPreTrainedModel - forward_params = [ - 'input_ids', - 'attention_mask', - 'encoder_outputs', - 'decoder_input_ids', - 'decoder_attention_mask', - 'past_key_values', - ]; - - /** - * Apply the pattern mask to the final ids, - * then revert the pattern delay mask by filtering the pad token id in a single step. - * @param {Tensor} outputs The output tensor from the model. - * @returns {Tensor} The filtered output tensor. - */ - _apply_and_filter_by_delay_pattern_mask(outputs) { - const [bs_x_codebooks, seqLength] = outputs.dims; - // @ts-expect-error TS2339 - const num_codebooks = this.config.decoder.num_codebooks; - const upperBound = seqLength - num_codebooks; - - let newDataSize = 0; - for (let i = 0; i < outputs.size; ++i) { - // @ts-expect-error TS2339 - if (outputs.data[i] === this.config.decoder.pad_token_id) { - continue; - } - - const row = i % seqLength; - const col = Math.floor(i / seqLength) % num_codebooks; - - const diff = row - col; - if (diff > 0 && diff <= upperBound) { - outputs.data[newDataSize++] = outputs.data[i]; - } - } - - const batch_size = Math.floor(bs_x_codebooks / num_codebooks); - const inferred = newDataSize / (batch_size * num_codebooks); - // TODO: assert `inferred` is an integer - return new Tensor(outputs.type, outputs.data.slice(0, newDataSize), [batch_size, num_codebooks, inferred]); - } - - prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { - // apply the delay pattern mask - let clonedInputIds = structuredClone(input_ids); - for (let i = 0; i < clonedInputIds.length; ++i) { - for (let j = 0; j < clonedInputIds[i].length; ++j) { - // @ts-expect-error TS2339 - if (i % this.config.decoder.num_codebooks >= j) { - // @ts-expect-error TS2339 - clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id); - } - } - } - // for classifier free guidance we need to replicate the decoder args across the batch dim - // (we'll split these before sampling) - if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { - // [batch, seqLength] -> [2 * batch, seqLength] - clonedInputIds = clonedInputIds.concat(clonedInputIds); - } - - const prepped = super.prepare_inputs_for_generation(clonedInputIds, model_inputs, generation_config); - return prepped; - } - - /** - * Generates sequences of token ids for models with a language modeling head. 
- * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. - */ - async generate(options) { - const output_ids = await super.generate(options); - - // apply the pattern mask to the final ids - // tensor: int64[1,batch_size,4,chunk_length] - const audio_codes = this._apply_and_filter_by_delay_pattern_mask(/** @type {Tensor} */ (output_ids)).unsqueeze_( - 0, - ); // append the frame dimension back to the audio codes - - const { audio_values } = await sessionRun(this.sessions['encodec_decode'], { audio_codes }); - - return audio_values; - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV1 models -export class MobileNetV1PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV1 model outputting raw hidden-states without any specific head on top. - */ -export class MobileNetV1Model extends MobileNetV1PreTrainedModel {} - -/** - * MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} - -export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV2 models -export class MobileNetV2PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV2 model outputting raw hidden-states without any specific head on top. - */ -export class MobileNetV2Model extends MobileNetV2PreTrainedModel {} - -/** - * MobileNetV2 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV3 models -export class MobileNetV3PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV3 model outputting raw hidden-states without any specific head on top. - */ -export class MobileNetV3Model extends MobileNetV3PreTrainedModel {} - -/** - * MobileNetV3 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// MobileNetV4 models -export class MobileNetV4PreTrainedModel extends PreTrainedModel {} - -/** - * The bare MobileNetV4 model outputting raw hidden-states without any specific head on top. 
- */ -export class MobileNetV4Model extends MobileNetV4PreTrainedModel {} - -/** - * MobileNetV4 model with an image classification head on top (a linear layer on top of the pooled features), - * e.g. for ImageNet. - */ -export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new SequenceClassifierOutput(await super._call(model_inputs)); - } -} -export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Decision Transformer models -export class DecisionTransformerPreTrainedModel extends PreTrainedModel {} - -/** - * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting. - * Refer to the paper for more details: https://huggingface.co/papers/2106.01345 - */ -export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel {} - -////////////////////////////////////////////////// - -export class MultiModalityPreTrainedModel extends PreTrainedModel {} -export class MultiModalityCausalLM extends MultiModalityPreTrainedModel { - forward_params = [ - // prepare_inputs_embeds - 'input_ids', - 'pixel_values', - 'images_seq_mask', - 'images_emb_mask', - - // language_model - 'attention_mask', - 'position_ids', - 'past_key_values', - ]; - - /** - * @param {ConstructorParameters} args - */ - constructor(...args) { - super(...args); - - // State-based approach to switch out which heads to use during generation - this._generation_mode = 'text'; - } - - async forward(model_inputs) { - const mode = this._generation_mode ?? 'text'; - - // TODO support re-using PKVs for input_ids.dims[1] !== 1 - // if (model_inputs.past_key_values) { - // // && model_inputs.input_ids.dims[1] === 1 - // } - - let output_1; - if (mode === 'text' || !model_inputs.past_key_values) { - const session = this.sessions['prepare_inputs_embeds']; - const prep_inputs = pick(model_inputs, session.inputNames); - output_1 = await sessionRun(session, prep_inputs); - } else { - const session = this.sessions['gen_img_embeds']; - const prep_inputs = pick( - { - image_ids: model_inputs.input_ids, - }, - session.inputNames, - ); - output_1 = await sessionRun(session, prep_inputs); - } - - const input_2 = { ...model_inputs, ...output_1 }; - const output_2 = await decoderForward(this, input_2); - - const head = this.sessions[mode === 'text' ? 'lm_head' : 'gen_head']; - if (!head) { - throw new Error(`Unable to find "${head}" generation head`); - } - - const output_3 = await sessionRun(head, pick(output_2, head.inputNames)); - - return { - ...output_1, - ...output_2, - ...output_3, - }; - } - - /** - * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - */ - async generate(options) { - this._generation_mode = 'text'; - return super.generate(options); - } - - /** - * @param {import('./generation/parameters.js').GenerationFunctionParameters} options - */ - async generate_images(options) { - this._generation_mode = 'image'; - - const start_num_tokens = (options.inputs ?? 
options[this.main_input_name]).dims[1]; - const all_tokens = await super.generate(options); - - const generated_tokens = /** @type {Tensor} */ (all_tokens).slice(null, [start_num_tokens, null]); - - const image_decode = this.sessions['image_decode']; - const { decoded_image } = await sessionRun(image_decode, { - generated_tokens, - }); - - // Equivalent to `np.clip((dec + 1) / 2 * 255, 0, 255)` - const clamped = decoded_image - .add_(1) - .mul_(255 / 2) - .clamp_(0, 255) - .to('uint8'); - - // Return as a list of images - const images = []; - for (const tensor of clamped) { - const img = RawImage.fromTensor(tensor); - images.push(img); - } - return images; - } -} - -export class MgpstrModelOutput extends ModelOutput { - constructor({ char_logits, bpe_logits, wp_logits }) { - super(); - this.char_logits = char_logits; - this.bpe_logits = bpe_logits; - this.wp_logits = wp_logits; - } - - get logits() { - return [this.char_logits, this.bpe_logits, this.wp_logits]; - } -} - -export class MgpstrPreTrainedModel extends PreTrainedModel {} - -/** - * MGP-STR Model transformer with three classification heads on top - * (three A^3 modules and three linear layer on top of the transformer encoder output) for scene text recognition (STR). - */ -export class MgpstrForSceneTextRecognition extends MgpstrPreTrainedModel { - /** - * @param {any} model_inputs - */ - async _call(model_inputs) { - return new MgpstrModelOutput(await super._call(model_inputs)); - } -} - -////////////////////////////////////////////////// -// PatchTST Transformer models -export class PatchTSTPreTrainedModel extends PreTrainedModel {} - -/** - * The bare PatchTST Model outputting raw hidden-states without any specific head. - */ -export class PatchTSTModel extends PatchTSTPreTrainedModel {} - -/** - * The PatchTST for prediction model. - */ -export class PatchTSTForPrediction extends PatchTSTPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// PatchTSMixer Transformer models -export class PatchTSMixerPreTrainedModel extends PreTrainedModel {} - -/** - * The bare PatchTSMixer Model outputting raw hidden-states without any specific head. - */ -export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel {} - -/** - * The PatchTSMixer for prediction model. - */ -export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel {} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class UltravoxPreTrainedModel extends PreTrainedModel { - forward_params = ['input_ids', 'attention_mask', 'position_ids', 'audio_values', 'past_key_values']; -} - -export class UltravoxModel extends UltravoxPreTrainedModel { - _merge_input_ids_with_audio_features(kwargs) { - const audio_hidden_size = kwargs.audio_features.dims.at(-1); - const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); - - return default_merge_input_ids_with_audio_features({ - // @ts-ignore - audio_token_id: this.config.ignore_index ?? 
this.config.audio_token_id, - ...kwargs, - audio_features: reshaped_audio_features, - }); - } -} -////////////////////////////////////////////////// - -export class VoxtralForConditionalGeneration extends UltravoxModel {} - -////////////////////////////////////////////////// -// Mimi models -export class MimiPreTrainedModel extends PreTrainedModel { - main_input_name = 'input_values'; - forward_params = ['input_values']; -} - -export class MimiEncoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. - */ - constructor({ audio_codes }) { - super(); - this.audio_codes = audio_codes; - } -} - -export class MimiDecoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. - */ - constructor({ audio_values }) { - super(); - this.audio_values = audio_values; - } -} - -/** - * The Mimi neural audio codec model. - */ -export class MimiModel extends MimiPreTrainedModel { - /** - * Encodes the input audio waveform into discrete codes. - * @param {Object} inputs Model inputs - * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). - * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. - */ - async encode(inputs) { - return new MimiEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); - } - - /** - * Decodes the given frames into an output audio waveform. - * @param {MimiEncoderOutput} inputs The encoded audio codes. - * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. - */ - async decode(inputs) { - return new MimiDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); - } -} - -export class MimiEncoderModel extends MimiPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'encoder_model', - }); - } -} -export class MimiDecoderModel extends MimiPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'decoder_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Dac models -export class DacPreTrainedModel extends PreTrainedModel { - main_input_name = 'input_values'; - forward_params = ['input_values']; -} - -export class DacEncoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. - */ - constructor({ audio_codes }) { - super(); - this.audio_codes = audio_codes; - } -} - -export class DacDecoderOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. 
- * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. - */ - constructor({ audio_values }) { - super(); - this.audio_values = audio_values; - } -} - -/** - * The DAC (Descript Audio Codec) model. - */ -export class DacModel extends DacPreTrainedModel { - /** - * Encodes the input audio waveform into discrete codes. - * @param {Object} inputs Model inputs - * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). - * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. - */ - async encode(inputs) { - return new DacEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); - } - - /** - * Decodes the given frames into an output audio waveform. - * @param {DacEncoderOutput} inputs The encoded audio codes. - * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. - */ - async decode(inputs) { - return new DacDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); - } -} - -export class DacEncoderModel extends DacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'encoder_model', - }); - } -} -export class DacDecoderModel extends DacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'decoder_model', - }); - } -} -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -// Snac models -export class SnacPreTrainedModel extends PreTrainedModel { - main_input_name = 'input_values'; - forward_params = ['input_values']; -} - -/** - * The SNAC (Multi-Scale Neural Audio Codec) model. - */ -export class SnacModel extends SnacPreTrainedModel { - /** - * Encodes the input audio waveform into discrete codes. - * @param {Object} inputs Model inputs - * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). - * @returns {Promise>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`. - */ - async encode(inputs) { - return await sessionRun(this.sessions['encoder_model'], inputs); - } - - /** - * Decodes the given frames into an output audio waveform. - * @param {Record} inputs The encoded audio codes. - * @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`. - */ - async decode(inputs) { - return await sessionRun(this.sessions['decoder_model'], inputs); - } -} - -export class SnacEncoderModel extends SnacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 
'encoder_model', - }); - } -} -export class SnacDecoderModel extends SnacPreTrainedModel { - /** @type {typeof PreTrainedModel.from_pretrained} */ - static async from_pretrained(pretrained_model_name_or_path, options = {}) { - return super.from_pretrained(pretrained_model_name_or_path, { - ...options, - // Update default model file name if not provided - model_file_name: options.model_file_name ?? 'decoder_model', - }); - } -} -////////////////////////////////////////////////// - -export class ChatterboxPreTrainedModel extends PreTrainedModel { - forward_params = [ - 'input_ids', - 'inputs_embeds', - 'attention_mask', - 'position_ids', - 'audio_values', - 'exaggeration', - 'audio_features', - 'audio_tokens', - 'speaker_embeddings', - 'speaker_features', - 'past_key_values', - ]; - main_input_name = 'input_ids'; - - _return_dict_in_generate_keys = ['audio_tokens', 'speaker_embeddings', 'speaker_features']; -} -export class ChatterboxModel extends ChatterboxPreTrainedModel { - /** - * @param {Tensor} audio_values - * @returns {Promise<{audio_features: Tensor, audio_tokens: Tensor, speaker_embeddings: Tensor, speaker_features: Tensor}>} - */ - async encode_speech(audio_values) { - return sessionRun(this.sessions['speech_encoder'], { - audio_values, - }); - } - - async forward({ - // Produced by the tokenizer/processor: - input_ids = null, - attention_mask = null, - audio_values = null, - exaggeration = null, - - // Used during generation: - position_ids = null, - inputs_embeds = null, - past_key_values = null, - - // Generic generation parameters - generation_config = null, - logits_processor = null, - - // Speaker embeddings/features (useful for re-using pre-computed speaker data) - audio_features = null, // float32[batch_size,sequence_length,1024] - audio_tokens = null, // int64[batch_size,audio_sequence_length] - speaker_embeddings = null, // float32[batch_size,192] - speaker_features = null, // float32[batch_size,feature_dim,80] - - // TODO: needed? - ...kwargs - }) { - let speech_encoder_outputs; - if (!inputs_embeds) { - const expected_inputs = this.sessions['embed_tokens'].inputNames; - const embed_model_inputs = { input_ids }; - if (expected_inputs.includes('exaggeration')) { - // Support the following types for exaggeration: - // 1. null/undefined (no exaggeration): use the default of 0.5 - // 2. number: broadcast to (batch_size,) - // 3. number[]: convert to Tensor of shape (batch_size,) - // 4. 
Tensor of shape (batch_size, 1) - if (!(exaggeration instanceof Tensor)) { - const batch_size = input_ids.dims[0]; - if (exaggeration == null) { - exaggeration = full([batch_size], 0.5); - } else if (typeof exaggeration === 'number') { - exaggeration = full([batch_size], exaggeration); - } else if (Array.isArray(exaggeration)) { - exaggeration = new Tensor('float32', exaggeration, [batch_size]); - } else { - throw new Error('Unsupported type for `exaggeration` input'); - } - } - embed_model_inputs.exaggeration = exaggeration; - } - if (expected_inputs.includes('position_ids')) { - embed_model_inputs.position_ids = position_ids; - } - - ({ inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], embed_model_inputs)); - - if (audio_features && audio_tokens && speaker_embeddings && speaker_features) { - // Use pre-computed speech encoder outputs - speech_encoder_outputs = { audio_features, audio_tokens, speaker_embeddings, speaker_features }; - } - - if (speech_encoder_outputs || audio_values) { - speech_encoder_outputs ??= await this.encode_speech(audio_values); - - // Update LLM inputs - inputs_embeds = cat([speech_encoder_outputs.audio_features, inputs_embeds], 1); - attention_mask = ones([inputs_embeds.dims[0], inputs_embeds.dims[1]]); - } else { - const target_length = inputs_embeds.dims[1]; - if (!past_key_values || target_length !== 1) { - throw new Error('Incorrect state encountered during generation.'); - } - const past_length = Object.values(past_key_values)[0].dims.at(-2); - attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]); - } - } - - const outputs = await decoderForward( - this, - { - inputs_embeds, - past_key_values, - attention_mask, - generation_config, - logits_processor, - }, - false, - ); - return { - ...outputs, - ...speech_encoder_outputs, - }; - } - - /** @type {PreTrainedModel['generate']} */ - async generate(params) { - const { sequences, audio_tokens, speaker_embeddings, speaker_features } = /** @type {any} */ ( - await super.generate({ - ...params, - return_dict_in_generate: true, - }) - ); - - const new_tokens = sequences.slice(null, [ - params.input_ids.dims[1], // Exclude start of speech token - -1, // Exclude end of speech token - ]); - - const SILENCE_TOKEN = 4299n; - const silence_tokens = full([new_tokens.dims[0], 3], SILENCE_TOKEN); // Add 3 silence tokens - const speech_tokens = cat([audio_tokens, new_tokens, silence_tokens], 1); - - const { waveform } = await sessionRun(this.sessions['conditional_decoder'], { - speech_tokens, - speaker_features, - speaker_embeddings, - }); - return waveform; - } -} +// Re-export all model classes from registry +export * from './models/registry.js'; -////////////////////////////////////////////////// -// AutoModels, used to simplify construction of PreTrainedModels -// (uses config to instantiate correct class) +import { + CUSTOM_ARCHITECTURES, + MODEL_CLASS_TYPE_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_MASKED_LM_MAPPING_NAMES, + MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, + 
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, + MODEL_FOR_CTC_MAPPING_NAMES, + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, + MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, + MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, + MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, +} from './models/registry.js'; /** * Base class of all AutoModels. Contains the `from_pretrained` function @@ -8174,583 +157,6 @@ export class PretrainedMixin { } } -const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ - ['bert', ['BertModel', BertModel]], - ['neobert', ['NeoBertModel', NeoBertModel]], - ['modernbert', ['ModernBertModel', ModernBertModel]], - ['nomic_bert', ['NomicBertModel', NomicBertModel]], - ['roformer', ['RoFormerModel', RoFormerModel]], - ['electra', ['ElectraModel', ElectraModel]], - ['esm', ['EsmModel', EsmModel]], - ['convbert', ['ConvBertModel', ConvBertModel]], - ['camembert', ['CamembertModel', CamembertModel]], - ['deberta', ['DebertaModel', DebertaModel]], - ['deberta-v2', ['DebertaV2Model', DebertaV2Model]], - ['mpnet', ['MPNetModel', MPNetModel]], - ['albert', ['AlbertModel', AlbertModel]], - ['distilbert', ['DistilBertModel', DistilBertModel]], - ['roberta', ['RobertaModel', RobertaModel]], - ['xlm', ['XLMModel', XLMModel]], - ['xlm-roberta', ['XLMRobertaModel', XLMRobertaModel]], - ['clap', ['ClapModel', ClapModel]], - ['clip', ['CLIPModel', CLIPModel]], - ['clipseg', ['CLIPSegModel', CLIPSegModel]], - ['chinese_clip', ['ChineseCLIPModel', ChineseCLIPModel]], - ['siglip', ['SiglipModel', SiglipModel]], - ['jina_clip', ['JinaCLIPModel', JinaCLIPModel]], - ['mobilebert', ['MobileBertModel', MobileBertModel]], - ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]], - ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]], - ['wav2vec2-bert', ['Wav2Vec2BertModel', Wav2Vec2BertModel]], - ['unispeech', ['UniSpeechModel', UniSpeechModel]], - ['unispeech-sat', ['UniSpeechSatModel', UniSpeechSatModel]], - ['hubert', ['HubertModel', HubertModel]], - ['wavlm', ['WavLMModel', WavLMModel]], - ['audio-spectrogram-transformer', ['ASTModel', ASTModel]], - ['vits', ['VitsModel', VitsModel]], - ['pyannote', ['PyAnnoteModel', PyAnnoteModel]], - ['wespeaker-resnet', ['WeSpeakerResNetModel', WeSpeakerResNetModel]], - - ['detr', ['DetrModel', DetrModel]], - ['rt_detr', ['RTDetrModel', RTDetrModel]], - ['rt_detr_v2', ['RTDetrV2Model', RTDetrV2Model]], - ['rf_detr', ['RFDetrModel', RFDetrModel]], - ['d_fine', ['DFineModel', DFineModel]], - ['table-transformer', ['TableTransformerModel', TableTransformerModel]], - ['vit', ['ViTModel', ViTModel]], - ['ijepa', ['IJepaModel', IJepaModel]], - ['pvt', ['PvtModel', PvtModel]], - ['vit_msn', ['ViTMSNModel', ViTMSNModel]], - ['vit_mae', ['ViTMAEModel', ViTMAEModel]], - ['groupvit', ['GroupViTModel', GroupViTModel]], - ['fastvit', ['FastViTModel', FastViTModel]], - ['mobilevit', ['MobileViTModel', MobileViTModel]], - ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]], - ['owlvit', ['OwlViTModel', OwlViTModel]], - ['owlv2', ['Owlv2Model', Owlv2Model]], - ['beit', ['BeitModel', 
BeitModel]], - ['deit', ['DeiTModel', DeiTModel]], - ['hiera', ['HieraModel', HieraModel]], - ['convnext', ['ConvNextModel', ConvNextModel]], - ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]], - ['dinov2', ['Dinov2Model', Dinov2Model]], - ['dinov2_with_registers', ['Dinov2WithRegistersModel', Dinov2WithRegistersModel]], - ['dinov3_vit', ['DINOv3ViTModel', DINOv3ViTModel]], - ['dinov3_convnext', ['DINOv3ConvNextModel', DINOv3ConvNextModel]], - ['resnet', ['ResNetModel', ResNetModel]], - ['swin', ['SwinModel', SwinModel]], - ['swin2sr', ['Swin2SRModel', Swin2SRModel]], - ['donut-swin', ['DonutSwinModel', DonutSwinModel]], - ['yolos', ['YolosModel', YolosModel]], - ['dpt', ['DPTModel', DPTModel]], - ['glpn', ['GLPNModel', GLPNModel]], - - ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]], - ['efficientnet', ['EfficientNetModel', EfficientNetModel]], - - ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]], - ['patchtst', ['PatchTSTForPrediction', PatchTSTModel]], - ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerModel]], - - ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]], - ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]], - ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]], - ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]], - - ['maskformer', ['MaskFormerModel', MaskFormerModel]], - ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]], - - ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]], -]); - -const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([ - ['t5', ['T5Model', T5Model]], - ['longt5', ['LongT5Model', LongT5Model]], - ['mt5', ['MT5Model', MT5Model]], - ['bart', ['BartModel', BartModel]], - ['mbart', ['MBartModel', MBartModel]], - ['marian', ['MarianModel', MarianModel]], - ['whisper', ['WhisperModel', WhisperModel]], - ['m2m_100', ['M2M100Model', M2M100Model]], - ['blenderbot', ['BlenderbotModel', BlenderbotModel]], - ['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]], -]); - -const MODEL_MAPPING_NAMES_AUTO_ENCODER = new Map([ - ['mimi', ['MimiModel', MimiModel]], - ['dac', ['DacModel', DacModel]], - ['snac', ['SnacModel', SnacModel]], -]); - -const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([ - ['bloom', ['BloomModel', BloomModel]], - ['jais', ['JAISModel', JAISModel]], - ['gpt2', ['GPT2Model', GPT2Model]], - ['gpt_oss', ['GptOssModel', GptOssModel]], - ['gptj', ['GPTJModel', GPTJModel]], - ['gpt_bigcode', ['GPTBigCodeModel', GPTBigCodeModel]], - ['gpt_neo', ['GPTNeoModel', GPTNeoModel]], - ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]], - ['codegen', ['CodeGenModel', CodeGenModel]], - ['llama', ['LlamaModel', LlamaModel]], - ['apertus', ['ApertusModel', ApertusModel]], - ['nanochat', ['NanoChatModel', NanoChatModel]], - ['arcee', ['ArceeModel', ArceeModel]], - ['lfm2', ['Lfm2Model', Lfm2Model]], - ['smollm3', ['SmolLM3Model', SmolLM3Model]], - ['exaone', ['ExaoneModel', ExaoneModel]], - ['olmo', ['OlmoModel', OlmoModel]], - ['olmo2', ['Olmo2Model', Olmo2Model]], - ['olmo3', ['Olmo3Model', Olmo3Model]], - ['mobilellm', ['MobileLLMModel', MobileLLMModel]], - ['granite', ['GraniteModel', GraniteModel]], - ['granitemoehybrid', ['GraniteMoeHybridModel', GraniteMoeHybridModel]], - ['cohere', ['CohereModel', CohereModel]], - ['gemma', ['GemmaModel', GemmaModel]], - ['gemma2', ['Gemma2Model', Gemma2Model]], - ['vaultgemma', ['VaultGemmaModel', VaultGemmaModel]], - ['gemma3_text', ['Gemma3Model', Gemma3Model]], - ['helium', ['HeliumModel', 
HeliumModel]], - ['glm', ['GlmModel', GlmModel]], - ['openelm', ['OpenELMModel', OpenELMModel]], - ['qwen2', ['Qwen2Model', Qwen2Model]], - ['qwen3', ['Qwen3Model', Qwen3Model]], - ['phi', ['PhiModel', PhiModel]], - ['phi3', ['Phi3Model', Phi3Model]], - ['mpt', ['MptModel', MptModel]], - ['opt', ['OPTModel', OPTModel]], - ['mistral', ['MistralModel', MistralModel]], - ['ernie4_5', ['Ernie4_5_Model', Ernie4_5_Model]], - ['starcoder2', ['Starcoder2Model', Starcoder2Model]], - ['falcon', ['FalconModel', FalconModel]], - ['stablelm', ['StableLmModel', StableLmModel]], - ['modernbert-decoder', ['ModernBertDecoderModel', ModernBertDecoderModel]], -]); - -const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([ - ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]], - ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], - ['lite-whisper', ['LiteWhisperForConditionalGeneration', LiteWhisperForConditionalGeneration]], - ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]], -]); - -const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([ - ['speecht5', ['SpeechT5ForTextToSpeech', SpeechT5ForTextToSpeech]], -]); - -const MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = new Map([ - ['vits', ['VitsModel', VitsModel]], - ['musicgen', ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration]], - ['supertonic', ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration]], -]); - -const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['bert', ['BertForSequenceClassification', BertForSequenceClassification]], - ['neobert', ['NeoBertForSequenceClassification', NeoBertForSequenceClassification]], - ['modernbert', ['ModernBertForSequenceClassification', ModernBertForSequenceClassification]], - ['roformer', ['RoFormerForSequenceClassification', RoFormerForSequenceClassification]], - ['electra', ['ElectraForSequenceClassification', ElectraForSequenceClassification]], - ['esm', ['EsmForSequenceClassification', EsmForSequenceClassification]], - ['convbert', ['ConvBertForSequenceClassification', ConvBertForSequenceClassification]], - ['camembert', ['CamembertForSequenceClassification', CamembertForSequenceClassification]], - ['deberta', ['DebertaForSequenceClassification', DebertaForSequenceClassification]], - ['deberta-v2', ['DebertaV2ForSequenceClassification', DebertaV2ForSequenceClassification]], - ['mpnet', ['MPNetForSequenceClassification', MPNetForSequenceClassification]], - ['albert', ['AlbertForSequenceClassification', AlbertForSequenceClassification]], - ['distilbert', ['DistilBertForSequenceClassification', DistilBertForSequenceClassification]], - ['roberta', ['RobertaForSequenceClassification', RobertaForSequenceClassification]], - ['xlm', ['XLMForSequenceClassification', XLMForSequenceClassification]], - ['xlm-roberta', ['XLMRobertaForSequenceClassification', XLMRobertaForSequenceClassification]], - ['bart', ['BartForSequenceClassification', BartForSequenceClassification]], - ['mbart', ['MBartForSequenceClassification', MBartForSequenceClassification]], - ['mobilebert', ['MobileBertForSequenceClassification', MobileBertForSequenceClassification]], - ['squeezebert', ['SqueezeBertForSequenceClassification', SqueezeBertForSequenceClassification]], -]); - -const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['bert', ['BertForTokenClassification', BertForTokenClassification]], - ['neobert', ['NeoBertForTokenClassification', NeoBertForTokenClassification]], - ['modernbert', 
['ModernBertForTokenClassification', ModernBertForTokenClassification]], - ['roformer', ['RoFormerForTokenClassification', RoFormerForTokenClassification]], - ['electra', ['ElectraForTokenClassification', ElectraForTokenClassification]], - ['esm', ['EsmForTokenClassification', EsmForTokenClassification]], - ['convbert', ['ConvBertForTokenClassification', ConvBertForTokenClassification]], - ['camembert', ['CamembertForTokenClassification', CamembertForTokenClassification]], - ['deberta', ['DebertaForTokenClassification', DebertaForTokenClassification]], - ['deberta-v2', ['DebertaV2ForTokenClassification', DebertaV2ForTokenClassification]], - ['mpnet', ['MPNetForTokenClassification', MPNetForTokenClassification]], - ['distilbert', ['DistilBertForTokenClassification', DistilBertForTokenClassification]], - ['roberta', ['RobertaForTokenClassification', RobertaForTokenClassification]], - ['xlm', ['XLMForTokenClassification', XLMForTokenClassification]], - ['xlm-roberta', ['XLMRobertaForTokenClassification', XLMRobertaForTokenClassification]], -]); - -const MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = new Map([ - ['t5', ['T5ForConditionalGeneration', T5ForConditionalGeneration]], - ['longt5', ['LongT5ForConditionalGeneration', LongT5ForConditionalGeneration]], - ['mt5', ['MT5ForConditionalGeneration', MT5ForConditionalGeneration]], - ['bart', ['BartForConditionalGeneration', BartForConditionalGeneration]], - ['mbart', ['MBartForConditionalGeneration', MBartForConditionalGeneration]], - ['marian', ['MarianMTModel', MarianMTModel]], - ['m2m_100', ['M2M100ForConditionalGeneration', M2M100ForConditionalGeneration]], - ['blenderbot', ['BlenderbotForConditionalGeneration', BlenderbotForConditionalGeneration]], - ['blenderbot-small', ['BlenderbotSmallForConditionalGeneration', BlenderbotSmallForConditionalGeneration]], -]); - -const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([ - ['bloom', ['BloomForCausalLM', BloomForCausalLM]], - ['gpt2', ['GPT2LMHeadModel', GPT2LMHeadModel]], - ['gpt_oss', ['GptOssForCausalLM', GptOssForCausalLM]], - ['jais', ['JAISLMHeadModel', JAISLMHeadModel]], - ['gptj', ['GPTJForCausalLM', GPTJForCausalLM]], - ['gpt_bigcode', ['GPTBigCodeForCausalLM', GPTBigCodeForCausalLM]], - ['gpt_neo', ['GPTNeoForCausalLM', GPTNeoForCausalLM]], - ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]], - ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]], - ['llama', ['LlamaForCausalLM', LlamaForCausalLM]], - ['nanochat', ['NanoChatForCausalLM', NanoChatForCausalLM]], - ['apertus', ['ApertusForCausalLM', ApertusForCausalLM]], - ['llama4_text', ['Llama4ForCausalLM', Llama4ForCausalLM]], - ['arcee', ['ArceeForCausalLM', ArceeForCausalLM]], - ['lfm2', ['Lfm2ForCausalLM', Lfm2ForCausalLM]], - ['smollm3', ['SmolLM3ForCausalLM', SmolLM3ForCausalLM]], - ['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]], - ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]], - ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]], - ['olmo3', ['Olmo3ForCausalLM', Olmo3ForCausalLM]], - ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]], - ['granite', ['GraniteForCausalLM', GraniteForCausalLM]], - ['granitemoehybrid', ['GraniteMoeHybridForCausalLM', GraniteMoeHybridForCausalLM]], - ['cohere', ['CohereForCausalLM', CohereForCausalLM]], - ['gemma', ['GemmaForCausalLM', GemmaForCausalLM]], - ['gemma2', ['Gemma2ForCausalLM', Gemma2ForCausalLM]], - ['vaultgemma', ['VaultGemmaForCausalLM', VaultGemmaForCausalLM]], - ['gemma3_text', ['Gemma3ForCausalLM', Gemma3ForCausalLM]], - ['helium', ['HeliumForCausalLM', 
HeliumForCausalLM]], - ['glm', ['GlmForCausalLM', GlmForCausalLM]], - ['openelm', ['OpenELMForCausalLM', OpenELMForCausalLM]], - ['qwen2', ['Qwen2ForCausalLM', Qwen2ForCausalLM]], - ['qwen3', ['Qwen3ForCausalLM', Qwen3ForCausalLM]], - ['phi', ['PhiForCausalLM', PhiForCausalLM]], - ['phi3', ['Phi3ForCausalLM', Phi3ForCausalLM]], - ['mpt', ['MptForCausalLM', MptForCausalLM]], - ['opt', ['OPTForCausalLM', OPTForCausalLM]], - ['mbart', ['MBartForCausalLM', MBartForCausalLM]], - ['mistral', ['MistralForCausalLM', MistralForCausalLM]], - ['ernie4_5', ['Ernie4_5_ForCausalLM', Ernie4_5_ForCausalLM]], - ['starcoder2', ['Starcoder2ForCausalLM', Starcoder2ForCausalLM]], - ['falcon', ['FalconForCausalLM', FalconForCausalLM]], - ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]], - ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]], - ['modernbert-decoder', ['ModernBertDecoderForCausalLM', ModernBertDecoderForCausalLM]], - - // Also image-text-to-text - ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]], -]); - -const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([ - ['multi_modality', ['MultiModalityCausalLM', MultiModalityCausalLM]], -]); - -const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([ - ['bert', ['BertForMaskedLM', BertForMaskedLM]], - ['neobert', ['NeoBertForMaskedLM', NeoBertForMaskedLM]], - ['modernbert', ['ModernBertForMaskedLM', ModernBertForMaskedLM]], - ['roformer', ['RoFormerForMaskedLM', RoFormerForMaskedLM]], - ['electra', ['ElectraForMaskedLM', ElectraForMaskedLM]], - ['esm', ['EsmForMaskedLM', EsmForMaskedLM]], - ['convbert', ['ConvBertForMaskedLM', ConvBertForMaskedLM]], - ['camembert', ['CamembertForMaskedLM', CamembertForMaskedLM]], - ['deberta', ['DebertaForMaskedLM', DebertaForMaskedLM]], - ['deberta-v2', ['DebertaV2ForMaskedLM', DebertaV2ForMaskedLM]], - ['mpnet', ['MPNetForMaskedLM', MPNetForMaskedLM]], - ['albert', ['AlbertForMaskedLM', AlbertForMaskedLM]], - ['distilbert', ['DistilBertForMaskedLM', DistilBertForMaskedLM]], - ['roberta', ['RobertaForMaskedLM', RobertaForMaskedLM]], - ['xlm', ['XLMWithLMHeadModel', XLMWithLMHeadModel]], - ['xlm-roberta', ['XLMRobertaForMaskedLM', XLMRobertaForMaskedLM]], - ['mobilebert', ['MobileBertForMaskedLM', MobileBertForMaskedLM]], - ['squeezebert', ['SqueezeBertForMaskedLM', SqueezeBertForMaskedLM]], -]); - -const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ - ['bert', ['BertForQuestionAnswering', BertForQuestionAnswering]], - ['neobert', ['NeoBertForQuestionAnswering', NeoBertForQuestionAnswering]], - ['roformer', ['RoFormerForQuestionAnswering', RoFormerForQuestionAnswering]], - ['electra', ['ElectraForQuestionAnswering', ElectraForQuestionAnswering]], - ['convbert', ['ConvBertForQuestionAnswering', ConvBertForQuestionAnswering]], - ['camembert', ['CamembertForQuestionAnswering', CamembertForQuestionAnswering]], - ['deberta', ['DebertaForQuestionAnswering', DebertaForQuestionAnswering]], - ['deberta-v2', ['DebertaV2ForQuestionAnswering', DebertaV2ForQuestionAnswering]], - ['mpnet', ['MPNetForQuestionAnswering', MPNetForQuestionAnswering]], - ['albert', ['AlbertForQuestionAnswering', AlbertForQuestionAnswering]], - ['distilbert', ['DistilBertForQuestionAnswering', DistilBertForQuestionAnswering]], - ['roberta', ['RobertaForQuestionAnswering', RobertaForQuestionAnswering]], - ['xlm', ['XLMForQuestionAnswering', XLMForQuestionAnswering]], - ['xlm-roberta', ['XLMRobertaForQuestionAnswering', XLMRobertaForQuestionAnswering]], - ['mobilebert', ['MobileBertForQuestionAnswering', MobileBertForQuestionAnswering]], - 
['squeezebert', ['SqueezeBertForQuestionAnswering', SqueezeBertForQuestionAnswering]], -]); - -const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([ - ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], - ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], - ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], -]); - -const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ - ['llava', ['LlavaForConditionalGeneration', LlavaForConditionalGeneration]], - ['llava_onevision', ['LlavaOnevisionForConditionalGeneration', LlavaOnevisionForConditionalGeneration]], - ['moondream1', ['Moondream1ForConditionalGeneration', Moondream1ForConditionalGeneration]], - ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]], - ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]], - ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], - ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], - ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]], - ['llava_qwen2', ['LlavaQwen2ForCausalLM', LlavaQwen2ForCausalLM]], - ['gemma3n', ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration]], -]); - -const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ - ['ultravox', ['UltravoxModel', UltravoxModel]], - ['voxtral', ['VoxtralForConditionalGeneration', VoxtralForConditionalGeneration]], -]); - -const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ - ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], -]); - -const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['vit', ['ViTForImageClassification', ViTForImageClassification]], - ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]], - ['pvt', ['PvtForImageClassification', PvtForImageClassification]], - ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]], - ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]], - ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]], - ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]], - ['beit', ['BeitForImageClassification', BeitForImageClassification]], - ['deit', ['DeiTForImageClassification', DeiTForImageClassification]], - ['hiera', ['HieraForImageClassification', HieraForImageClassification]], - ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]], - ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]], - ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]], - ['dinov2_with_registers', ['Dinov2WithRegistersForImageClassification', Dinov2WithRegistersForImageClassification]], - ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]], - ['swin', ['SwinForImageClassification', SwinForImageClassification]], - ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]], - ['efficientnet', ['EfficientNetForImageClassification', EfficientNetForImageClassification]], - ['mobilenet_v1', ['MobileNetV1ForImageClassification', MobileNetV1ForImageClassification]], - ['mobilenet_v2', ['MobileNetV2ForImageClassification', MobileNetV2ForImageClassification]], - ['mobilenet_v3', ['MobileNetV3ForImageClassification', MobileNetV3ForImageClassification]], 
- ['mobilenet_v4', ['MobileNetV4ForImageClassification', MobileNetV4ForImageClassification]], -]); - -const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([ - ['detr', ['DetrForObjectDetection', DetrForObjectDetection]], - ['rt_detr', ['RTDetrForObjectDetection', RTDetrForObjectDetection]], - ['rt_detr_v2', ['RTDetrV2ForObjectDetection', RTDetrV2ForObjectDetection]], - ['rf_detr', ['RFDetrForObjectDetection', RFDetrForObjectDetection]], - ['d_fine', ['DFineForObjectDetection', DFineForObjectDetection]], - ['table-transformer', ['TableTransformerForObjectDetection', TableTransformerForObjectDetection]], - ['yolos', ['YolosForObjectDetection', YolosForObjectDetection]], -]); - -const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([ - ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]], - ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]], - ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]], -]); - -const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([ - // TODO: Do not add new models here - ['detr', ['DetrForSegmentation', DetrForSegmentation]], - ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]], -]); - -const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([ - ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]], - ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]], - - ['swin', ['SwinForSemanticSegmentation', SwinForSemanticSegmentation]], - ['mobilenet_v1', ['MobileNetV1ForSemanticSegmentation', MobileNetV1ForSemanticSegmentation]], - ['mobilenet_v2', ['MobileNetV2ForSemanticSegmentation', MobileNetV2ForSemanticSegmentation]], - ['mobilenet_v3', ['MobileNetV3ForSemanticSegmentation', MobileNetV3ForSemanticSegmentation]], - ['mobilenet_v4', ['MobileNetV4ForSemanticSegmentation', MobileNetV4ForSemanticSegmentation]], -]); - -const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([ - ['detr', ['DetrForSegmentation', DetrForSegmentation]], - ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]], -]); - -const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([ - ['sam', ['SamModel', SamModel]], - ['sam2', ['Sam2Model', Sam2Model]], - ['edgetam', ['EdgeTamModel', EdgeTamModel]], - ['sam3_tracker', ['Sam3TrackerModel', Sam3TrackerModel]], -]); - -const MODEL_FOR_CTC_MAPPING_NAMES = new Map([ - ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]], - ['wav2vec2-bert', ['Wav2Vec2BertForCTC', Wav2Vec2BertForCTC]], - ['unispeech', ['UniSpeechForCTC', UniSpeechForCTC]], - ['unispeech-sat', ['UniSpeechSatForCTC', UniSpeechSatForCTC]], - ['wavlm', ['WavLMForCTC', WavLMForCTC]], - ['hubert', ['HubertForCTC', HubertForCTC]], - ['parakeet_ctc', ['ParakeetForCTC', ParakeetForCTC]], -]); - -const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['wav2vec2', ['Wav2Vec2ForSequenceClassification', Wav2Vec2ForSequenceClassification]], - ['wav2vec2-bert', ['Wav2Vec2BertForSequenceClassification', Wav2Vec2BertForSequenceClassification]], - ['unispeech', ['UniSpeechForSequenceClassification', UniSpeechForSequenceClassification]], - ['unispeech-sat', ['UniSpeechSatForSequenceClassification', UniSpeechSatForSequenceClassification]], - ['wavlm', ['WavLMForSequenceClassification', WavLMForSequenceClassification]], - ['hubert', ['HubertForSequenceClassification', HubertForSequenceClassification]], - ['audio-spectrogram-transformer', ['ASTForAudioClassification', 
ASTForAudioClassification]], -]); - -const MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = new Map([['wavlm', ['WavLMForXVector', WavLMForXVector]]]); - -const MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = new Map([ - ['unispeech-sat', ['UniSpeechSatForAudioFrameClassification', UniSpeechSatForAudioFrameClassification]], - ['wavlm', ['WavLMForAudioFrameClassification', WavLMForAudioFrameClassification]], - ['wav2vec2', ['Wav2Vec2ForAudioFrameClassification', Wav2Vec2ForAudioFrameClassification]], - ['pyannote', ['PyAnnoteForAudioFrameClassification', PyAnnoteForAudioFrameClassification]], -]); - -const MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([ - ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]], -]); - -const MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = new Map([ - ['patchtst', ['PatchTSTForPrediction', PatchTSTForPrediction]], - ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerForPrediction]], -]); - -const MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = new Map([ - ['swin2sr', ['Swin2SRForImageSuperResolution', Swin2SRForImageSuperResolution]], -]); - -const MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = new Map([ - ['dpt', ['DPTForDepthEstimation', DPTForDepthEstimation]], - ['depth_anything', ['DepthAnythingForDepthEstimation', DepthAnythingForDepthEstimation]], - ['glpn', ['GLPNForDepthEstimation', GLPNForDepthEstimation]], - ['sapiens', ['SapiensForDepthEstimation', SapiensForDepthEstimation]], - ['depth_pro', ['DepthProForDepthEstimation', DepthProForDepthEstimation]], - ['metric3d', ['Metric3DForDepthEstimation', Metric3DForDepthEstimation]], - ['metric3dv2', ['Metric3Dv2ForDepthEstimation', Metric3Dv2ForDepthEstimation]], -]); - -const MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES = new Map([ - ['sapiens', ['SapiensForNormalEstimation', SapiensForNormalEstimation]], -]); - -const MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES = new Map([ - ['vitpose', ['VitPoseForPoseEstimation', VitPoseForPoseEstimation]], -]); - -// NOTE: This is custom to Transformers.js, and is necessary because certain models -// (e.g., CLIP) are split into vision and text components -const MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = new Map([ - ['clip', ['CLIPVisionModelWithProjection', CLIPVisionModelWithProjection]], - ['siglip', ['SiglipVisionModel', SiglipVisionModel]], - ['jina_clip', ['JinaCLIPVisionModel', JinaCLIPVisionModel]], -]); - -const MODEL_CLASS_TYPE_MAPPING = [ - // MODEL_MAPPING_NAMES: - [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly], - [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder], - [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly], - [MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_TYPES.AutoEncoder], - - [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.DecoderOnly], - [MODEL_FOR_MULTIMODALITY_MAPPING_NAMES, MODEL_TYPES.MultiModality], - [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq], - [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText], - [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.AudioTextToText], - [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, 
MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES.MaskGeneration], - [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], - [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], - - // Custom: - [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], -]; - -for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) { - // @ts-ignore - for (const [name, model] of mappings.values()) { - MODEL_TYPE_MAPPING.set(name, type); - MODEL_CLASS_TO_NAME_MAPPING.set(model, name); - MODEL_NAME_TO_CLASS_MAPPING.set(name, model); - } -} - -const CUSTOM_MAPPING = [ - // OVERRIDE: - // TODO: Refactor to allow class to specify model - ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen], - ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V], - - ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly], - ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly], - ['JinaCLIPTextModel', JinaCLIPTextModel, MODEL_TYPES.EncoderOnly], - ['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly], - ['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly], - - ['DacEncoderModel', DacEncoderModel, MODEL_TYPES.EncoderOnly], - ['DacDecoderModel', DacDecoderModel, MODEL_TYPES.EncoderOnly], - ['MimiEncoderModel', MimiEncoderModel, MODEL_TYPES.EncoderOnly], - ['MimiDecoderModel', MimiDecoderModel, MODEL_TYPES.EncoderOnly], - ['SnacEncoderModel', SnacEncoderModel, MODEL_TYPES.EncoderOnly], - ['SnacDecoderModel', SnacDecoderModel, MODEL_TYPES.EncoderOnly], - - ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration, MODEL_TYPES.ImageAudioTextToText], - ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic], - ['ChatterboxModel', ChatterboxModel, MODEL_TYPES.Chatterbox], -]; -for (const [name, model, type] of CUSTOM_MAPPING) { - MODEL_TYPE_MAPPING.set(name, type); - MODEL_CLASS_TO_NAME_MAPPING.set(model, name); - MODEL_NAME_TO_CLASS_MAPPING.set(name, model); -} - -const CUSTOM_ARCHITECTURES = new Map([ - ['modnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], - ['birefnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], - ['isnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], - ['ben', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], -]); -for (const [name, mapping] of 
CUSTOM_ARCHITECTURES.entries()) { - mapping.set(name, ['PreTrainedModel', PreTrainedModel]); - MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly); - MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, name); - MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel); -} - /** * Helper class which is used to instantiate pretrained models with the `from_pretrained` function. * The chosen model class is determined by the type specified in the model config. @@ -8997,164 +403,5 @@ export class AutoModelForAudioTextToText extends PretrainedMixin { static MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES]; } -////////////////////////////////////////////////// - -////////////////////////////////////////////////// -export class Seq2SeqLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits The output logits of the model. - * @param {Tensor} output.past_key_values An tensor of key/value pairs that represent the previous state of the model. - * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. - * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. - * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. - */ - constructor({ logits, past_key_values, encoder_outputs, decoder_attentions = null, cross_attentions = null }) { - super(); - this.logits = logits; - this.past_key_values = past_key_values; - this.encoder_outputs = encoder_outputs; - this.decoder_attentions = decoder_attentions; - this.cross_attentions = cross_attentions; - } -} - -/** - * Base class for outputs of sentence classification models. - */ -export class SequenceClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). - * @param {Record} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. - * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - */ - constructor({ logits, ...attentions }) { - super(); - this.logits = logits; - const attentions_list = Object.values(attentions); - if (attentions_list.length > 0) { - // Only set attentions if they are not empty - this.attentions = attentions_list; - } - } -} - -/** - * Base class for outputs of XVector models. - */ -export class XVectorOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification hidden states before AMSoftmax, of shape `(batch_size, config.xvector_output_dim)`. - * @param {Tensor} output.embeddings Utterance embeddings used for vector similarity-based retrieval, of shape `(batch_size, config.xvector_output_dim)`. - */ - constructor({ logits, embeddings }) { - super(); - this.logits = logits; - this.embeddings = embeddings; - } -} - -/** - * Base class for outputs of token classification models. - */ -export class TokenClassifierOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Classification scores (before SoftMax). 
- */ - constructor({ logits }) { - super(); - this.logits = logits; - } -} - -/** - * Base class for masked language models outputs. - */ -export class MaskedLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } -} - -/** - * Base class for outputs of question answering models. - */ -export class QuestionAnsweringModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.start_logits Span-start scores (before SoftMax). - * @param {Tensor} output.end_logits Span-end scores (before SoftMax). - */ - constructor({ start_logits, end_logits }) { - super(); - this.start_logits = start_logits; - this.end_logits = end_logits; - } -} - -/** - * Base class for causal language model (or autoregressive) outputs. - */ -export class CausalLMOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). - */ - constructor({ logits }) { - super(); - this.logits = logits; - } -} - -/** - * Base class for causal language model (or autoregressive) outputs. - */ -export class CausalLMOutputWithPast extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). - * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) - * that can be used (see `past_key_values` input) to speed up sequential decoding. - */ - constructor({ logits, past_key_values }) { - super(); - this.logits = logits; - this.past_key_values = past_key_values; - } -} - -export class ImageMattingOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. - */ - constructor({ alphas }) { - super(); - this.alphas = alphas; - } -} - -/** - * Describes the outputs for the VITS model. - */ -export class VitsModelOutput extends ModelOutput { - /** - * @param {Object} output The output of the model. - * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. - * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. - * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. 
- */ - constructor({ waveform, spectrogram }) { - super(); - this.waveform = waveform; - this.spectrogram = spectrogram; - } -} +// Re-export PreTrainedModel for backwards compatibility +export { PreTrainedModel }; diff --git a/src/models/beit/image_processing_beit.js b/src/models/beit/image_processing_beit.js deleted file mode 100644 index 5b952a248..000000000 --- a/src/models/beit/image_processing_beit.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class BeitFeatureExtractor extends ImageProcessor {} diff --git a/src/models/bit/image_processing_bit.js b/src/models/bit/image_processing_bit.js deleted file mode 100644 index 7a59a3987..000000000 --- a/src/models/bit/image_processing_bit.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class BitImageProcessor extends ImageProcessor {} diff --git a/src/models/chinese_clip/image_processing_chinese_clip.js b/src/models/chinese_clip/image_processing_chinese_clip.js deleted file mode 100644 index 3feed9f62..000000000 --- a/src/models/chinese_clip/image_processing_chinese_clip.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class ChineseCLIPFeatureExtractor extends ImageProcessor {} diff --git a/src/models/dinov3_vit/image_processing_dinov3_vit.js b/src/models/dinov3_vit/image_processing_dinov3_vit.js deleted file mode 100644 index abf5fac51..000000000 --- a/src/models/dinov3_vit/image_processing_dinov3_vit.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class DINOv3ViTImageProcessor extends ImageProcessor {} diff --git a/src/models/feature-extractors.js b/src/models/feature-extractors.js new file mode 100644 index 000000000..54a071d3f --- /dev/null +++ b/src/models/feature-extractors.js @@ -0,0 +1,18 @@ +export * from './model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js'; +export * from './model-processors/encodec/feature_extraction_encodec.js'; +export * from './model-processors/chatterbox/feature_extraction_chatterbox.js'; +export * from './model-processors/clap/feature_extraction_clap.js'; +export * from './model-processors/dac/feature_extraction_dac.js'; +export * from './model-processors/gemma3n/feature_extraction_gemma3n.js'; +export * from './model-processors/moonshine/feature_extraction_moonshine.js'; +export * from './model-processors/parakeet/feature_extraction_parakeet.js'; +export * from './model-processors/pyannote/feature_extraction_pyannote.js'; +export * from './model-processors/seamless_m4t/feature_extraction_seamless_m4t.js'; +export * from './model-processors/snac/feature_extraction_snac.js'; +export * from './model-processors/speecht5/feature_extraction_speecht5.js'; +export * from './model-processors/wav2vec2/feature_extraction_wav2vec2.js'; +export * from './model-processors/wespeaker/feature_extraction_wespeaker.js'; +export * from './model-processors/whisper/feature_extraction_whisper.js'; + +// For legacy support, ImageFeatureExtractor is an alias for ImageProcessor +export { ImageProcessor as ImageFeatureExtractor } from '../base/image_processors_utils.js'; diff --git a/src/models/feature_extractors.js b/src/models/feature_extractors.js deleted file mode 100644 index 2d19945ae..000000000 --- a/src/models/feature_extractors.js +++ /dev/null @@ -1,18 +0,0 @@ -export * from 
'./audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js'; -export * from './encodec/feature_extraction_encodec.js'; -export * from './chatterbox/feature_extraction_chatterbox.js'; -export * from './clap/feature_extraction_clap.js'; -export * from './dac/feature_extraction_dac.js'; -export * from './gemma3n/feature_extraction_gemma3n.js'; -export * from './moonshine/feature_extraction_moonshine.js'; -export * from './parakeet/feature_extraction_parakeet.js'; -export * from './pyannote/feature_extraction_pyannote.js'; -export * from './seamless_m4t/feature_extraction_seamless_m4t.js'; -export * from './snac/feature_extraction_snac.js'; -export * from './speecht5/feature_extraction_speecht5.js'; -export * from './wav2vec2/feature_extraction_wav2vec2.js'; -export * from './wespeaker/feature_extraction_wespeaker.js'; -export * from './whisper/feature_extraction_whisper.js'; - -// For legacy support, ImageFeatureExtractor is an alias for ImageProcessor -export { ImageProcessor as ImageFeatureExtractor } from '../base/image_processors_utils.js'; diff --git a/src/models/glpn/image_processing_glpn.js b/src/models/glpn/image_processing_glpn.js deleted file mode 100644 index dd6b0ad4f..000000000 --- a/src/models/glpn/image_processing_glpn.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class GLPNFeatureExtractor extends ImageProcessor {} diff --git a/src/models/image-processors.js b/src/models/image-processors.js new file mode 100644 index 000000000..fb60330db --- /dev/null +++ b/src/models/image-processors.js @@ -0,0 +1,42 @@ +export * from './model-processors/beit/image_processing_beit.js'; +export * from './model-processors/bit/image_processing_bit.js'; +export * from './model-processors/chinese_clip/image_processing_chinese_clip.js'; +export * from './model-processors/clip/image_processing_clip.js'; +export * from './model-processors/convnext/image_processing_convnext.js'; +export * from './model-processors/deit/image_processing_deit.js'; +export * from './model-processors/detr/image_processing_detr.js'; +export * from './model-processors/dinov3_vit/image_processing_dinov3_vit.js'; +export * from './model-processors/donut/image_processing_donut.js'; +export * from './model-processors/dpt/image_processing_dpt.js'; +export * from './model-processors/efficientnet/image_processing_efficientnet.js'; +export * from './model-processors/glpn/image_processing_glpn.js'; +export * from './model-processors/grounding_dino/image_processing_grounding_dino.js'; +export * from './model-processors/idefics3/image_processing_idefics3.js'; +export * from './model-processors/janus/image_processing_janus.js'; +export * from './model-processors/jina_clip/image_processing_jina_clip.js'; +export * from './model-processors/llava_onevision/image_processing_llava_onevision.js'; +export * from './model-processors/mask2former/image_processing_mask2former.js'; +export * from './model-processors/maskformer/image_processing_maskformer.js'; +export * from './model-processors/mobilenet_v1/image_processing_mobilenet_v1.js'; +export * from './model-processors/mobilenet_v2/image_processing_mobilenet_v2.js'; +export * from './model-processors/mobilenet_v3/image_processing_mobilenet_v3.js'; +export * from './model-processors/mobilenet_v4/image_processing_mobilenet_v4.js'; +export * from './model-processors/mobilevit/image_processing_mobilevit.js'; +export * from './model-processors/nougat/image_processing_nougat.js'; +export * from 
'./model-processors/owlv2/image_processing_owlv2.js'; +export * from './model-processors/owlvit/image_processing_owlvit.js'; +export * from './model-processors/phi3_v/image_processing_phi3_v.js'; +export * from './model-processors/pvt/image_processing_pvt.js'; +export * from './model-processors/qwen2_vl/image_processing_qwen2_vl.js'; +export * from './model-processors/rt_detr/image_processing_rt_detr.js'; +export * from './model-processors/sam/image_processing_sam.js'; +export * from './model-processors/sam2/image_processing_sam2.js'; +export * from './model-processors/sam3/image_processing_sam3.js'; +export * from './model-processors/segformer/image_processing_segformer.js'; +export * from './model-processors/siglip/image_processing_siglip.js'; +export * from './model-processors/smolvlm/image_processing_smolvlm.js'; +export * from './model-processors/swin2sr/image_processing_swin2sr.js'; +export * from './model-processors/vit/image_processing_vit.js'; +export * from './model-processors/vitmatte/image_processing_vitmatte.js'; +export * from './model-processors/vitpose/image_processing_vitpose.js'; +export * from './model-processors/yolos/image_processing_yolos.js'; diff --git a/src/models/image_processors.js b/src/models/image_processors.js deleted file mode 100644 index ef58ba41a..000000000 --- a/src/models/image_processors.js +++ /dev/null @@ -1,42 +0,0 @@ -export * from './beit/image_processing_beit.js'; -export * from './bit/image_processing_bit.js'; -export * from './chinese_clip/image_processing_chinese_clip.js'; -export * from './clip/image_processing_clip.js'; -export * from './convnext/image_processing_convnext.js'; -export * from './deit/image_processing_deit.js'; -export * from './detr/image_processing_detr.js'; -export * from './dinov3_vit/image_processing_dinov3_vit.js'; -export * from './donut/image_processing_donut.js'; -export * from './dpt/image_processing_dpt.js'; -export * from './efficientnet/image_processing_efficientnet.js'; -export * from './glpn/image_processing_glpn.js'; -export * from './grounding_dino/image_processing_grounding_dino.js'; -export * from './idefics3/image_processing_idefics3.js'; -export * from './janus/image_processing_janus.js'; -export * from './jina_clip/image_processing_jina_clip.js'; -export * from './llava_onevision/image_processing_llava_onevision.js'; -export * from './mask2former/image_processing_mask2former.js'; -export * from './maskformer/image_processing_maskformer.js'; -export * from './mobilenet_v1/image_processing_mobilenet_v1.js'; -export * from './mobilenet_v2/image_processing_mobilenet_v2.js'; -export * from './mobilenet_v3/image_processing_mobilenet_v3.js'; -export * from './mobilenet_v4/image_processing_mobilenet_v4.js'; -export * from './mobilevit/image_processing_mobilevit.js'; -export * from './nougat/image_processing_nougat.js'; -export * from './owlv2/image_processing_owlv2.js'; -export * from './owlvit/image_processing_owlvit.js'; -export * from './phi3_v/image_processing_phi3_v.js'; -export * from './pvt/image_processing_pvt.js'; -export * from './qwen2_vl/image_processing_qwen2_vl.js'; -export * from './rt_detr/image_processing_rt_detr.js'; -export * from './sam/image_processing_sam.js'; -export * from './sam2/image_processing_sam2.js'; -export * from './sam3/image_processing_sam3.js'; -export * from './segformer/image_processing_segformer.js'; -export * from './siglip/image_processing_siglip.js'; -export * from './smolvlm/image_processing_smolvlm.js'; -export * from './swin2sr/image_processing_swin2sr.js'; -export * 
from './vit/image_processing_vit.js'; -export * from './vitmatte/image_processing_vitmatte.js'; -export * from './vitpose/image_processing_vitpose.js'; -export * from './yolos/image_processing_yolos.js'; diff --git a/src/models/llava_onevision/image_processing_llava_onevision.js b/src/models/llava_onevision/image_processing_llava_onevision.js deleted file mode 100644 index 95075666a..000000000 --- a/src/models/llava_onevision/image_processing_llava_onevision.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class LlavaOnevisionImageProcessor extends ImageProcessor {} diff --git a/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js b/src/models/model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js similarity index 94% rename from src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js rename to src/models/model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js index 057e72a19..0f98d5e09 100644 --- a/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +++ b/src/models/model-processors/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class ASTFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/auto/feature_extraction_auto.js b/src/models/model-processors/auto/feature_extraction_auto.js similarity index 79% rename from src/models/auto/feature_extraction_auto.js rename to src/models/model-processors/auto/feature_extraction_auto.js index c10f5bbef..e1a76f9aa 100644 --- a/src/models/auto/feature_extraction_auto.js +++ b/src/models/model-processors/auto/feature_extraction_auto.js @@ -1,7 +1,7 @@ -import { FEATURE_EXTRACTOR_NAME, GITHUB_ISSUE_URL } from '../../utils/constants.js'; -import { getModelJSON } from '../../utils/hub.js'; -import { FeatureExtractor } from '../../base/feature_extraction_utils.js'; -import * as AllFeatureExtractors from '../feature_extractors.js'; +import { FEATURE_EXTRACTOR_NAME, GITHUB_ISSUE_URL } from '../../../utils/constants.js'; +import { getModelJSON } from '../../../utils/hub.js'; +import { FeatureExtractor } from '../../../base/feature_extraction_utils.js'; +import * as AllFeatureExtractors from '../../feature-extractors.js'; export class AutoFeatureExtractor { /** @type {typeof FeatureExtractor.from_pretrained} */ diff --git a/src/models/auto/image_processing_auto.js b/src/models/model-processors/auto/image_processing_auto.js similarity index 80% rename from src/models/auto/image_processing_auto.js rename to src/models/model-processors/auto/image_processing_auto.js index 1a9348dc3..a841d25de 100644 --- a/src/models/auto/image_processing_auto.js +++ b/src/models/model-processors/auto/image_processing_auto.js @@ -1,7 +1,7 @@ -import { GITHUB_ISSUE_URL, IMAGE_PROCESSOR_NAME } from '../../utils/constants.js'; -import { getModelJSON } from 
'../../utils/hub.js'; -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import * as AllImageProcessors from '../image_processors.js'; +import { getModelJSON } from '../../../utils/hub.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import * as AllImageProcessors from '../../image-processors.js'; +import { GITHUB_ISSUE_URL, IMAGE_PROCESSOR_NAME } from '../../../utils/constants.js'; export class AutoImageProcessor { /** @type {typeof ImageProcessor.from_pretrained} */ diff --git a/src/models/auto/processing_auto.js b/src/models/model-processors/auto/processing_auto.js similarity index 87% rename from src/models/auto/processing_auto.js rename to src/models/model-processors/auto/processing_auto.js index 75b2b7335..5d2908516 100644 --- a/src/models/auto/processing_auto.js +++ b/src/models/model-processors/auto/processing_auto.js @@ -1,13 +1,13 @@ -import { IMAGE_PROCESSOR_NAME } from '../../utils/constants.js'; -import { getModelJSON } from '../../utils/hub.js'; -import { Processor } from '../../base/processing_utils.js'; +import { IMAGE_PROCESSOR_NAME } from '../../../utils/constants.js'; +import { getModelJSON } from '../../../utils/hub.js'; +import { Processor } from '../../../base/processing_utils.js'; -import * as AllProcessors from '../processors.js'; -import * as AllImageProcessors from '../image_processors.js'; -import * as AllFeatureExtractors from '../feature_extractors.js'; +import * as AllProcessors from '../../processors.js'; +import * as AllImageProcessors from '../../image-processors.js'; +import * as AllFeatureExtractors from '../../feature-extractors.js'; /** - * @typedef {import('../../base/processing_utils.js').PretrainedProcessorOptions} PretrainedProcessorOptions + * @typedef {import('../../../base/processing_utils.js').PretrainedProcessorOptions} PretrainedProcessorOptions */ /** diff --git a/src/models/model-processors/beit/image_processing_beit.js b/src/models/model-processors/beit/image_processing_beit.js new file mode 100644 index 000000000..9eb07ef95 --- /dev/null +++ b/src/models/model-processors/beit/image_processing_beit.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class BeitFeatureExtractor extends ImageProcessor {} diff --git a/src/models/model-processors/bit/image_processing_bit.js b/src/models/model-processors/bit/image_processing_bit.js new file mode 100644 index 000000000..75925ed6c --- /dev/null +++ b/src/models/model-processors/bit/image_processing_bit.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class BitImageProcessor extends ImageProcessor {} diff --git a/src/models/chatterbox/feature_extraction_chatterbox.js b/src/models/model-processors/chatterbox/feature_extraction_chatterbox.js similarity index 82% rename from src/models/chatterbox/feature_extraction_chatterbox.js rename to src/models/model-processors/chatterbox/feature_extraction_chatterbox.js index 39cf46ffd..5675a9b27 100644 --- a/src/models/chatterbox/feature_extraction_chatterbox.js +++ b/src/models/model-processors/chatterbox/feature_extraction_chatterbox.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class ChatterboxFeatureExtractor extends 
FeatureExtractor { /** diff --git a/src/models/chatterbox/processing_chatterbox.js b/src/models/model-processors/chatterbox/processing_chatterbox.js similarity index 82% rename from src/models/chatterbox/processing_chatterbox.js rename to src/models/model-processors/chatterbox/processing_chatterbox.js index 546a3b63d..d54e43b55 100644 --- a/src/models/chatterbox/processing_chatterbox.js +++ b/src/models/model-processors/chatterbox/processing_chatterbox.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a ChatterboxProcessor that extracts features from an audio input. diff --git a/src/models/model-processors/chinese_clip/image_processing_chinese_clip.js b/src/models/model-processors/chinese_clip/image_processing_chinese_clip.js new file mode 100644 index 000000000..97f5d09a2 --- /dev/null +++ b/src/models/model-processors/chinese_clip/image_processing_chinese_clip.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class ChineseCLIPFeatureExtractor extends ImageProcessor {} diff --git a/src/models/clap/feature_extraction_clap.js b/src/models/model-processors/clap/feature_extraction_clap.js similarity index 97% rename from src/models/clap/feature_extraction_clap.js rename to src/models/model-processors/clap/feature_extraction_clap.js index 605748616..c700a6ecd 100644 --- a/src/models/clap/feature_extraction_clap.js +++ b/src/models/model-processors/clap/feature_extraction_clap.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class ClapFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/clip/image_processing_clip.js b/src/models/model-processors/clip/image_processing_clip.js similarity index 62% rename from src/models/clip/image_processing_clip.js rename to src/models/model-processors/clip/image_processing_clip.js index d40bb44c4..f649e09e9 100644 --- a/src/models/clip/image_processing_clip.js +++ b/src/models/model-processors/clip/image_processing_clip.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class CLIPImageProcessor extends ImageProcessor {} export class CLIPFeatureExtractor extends CLIPImageProcessor {} diff --git a/src/models/convnext/image_processing_convnext.js b/src/models/model-processors/convnext/image_processing_convnext.js similarity index 95% rename from src/models/convnext/image_processing_convnext.js rename to src/models/model-processors/convnext/image_processing_convnext.js index b5812c9c8..67e5de5b5 100644 --- a/src/models/convnext/image_processing_convnext.js +++ b/src/models/model-processors/convnext/image_processing_convnext.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; 
+import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class ConvNextImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/dac/feature_extraction_dac.js b/src/models/model-processors/dac/feature_extraction_dac.js similarity index 100% rename from src/models/dac/feature_extraction_dac.js rename to src/models/model-processors/dac/feature_extraction_dac.js diff --git a/src/models/deit/image_processing_deit.js b/src/models/model-processors/deit/image_processing_deit.js similarity index 62% rename from src/models/deit/image_processing_deit.js rename to src/models/model-processors/deit/image_processing_deit.js index 7313495c7..faa5c1563 100644 --- a/src/models/deit/image_processing_deit.js +++ b/src/models/model-processors/deit/image_processing_deit.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class DeiTImageProcessor extends ImageProcessor {} export class DeiTFeatureExtractor extends DeiTImageProcessor {} diff --git a/src/models/detr/image_processing_detr.js b/src/models/model-processors/detr/image_processing_detr.js similarity index 79% rename from src/models/detr/image_processing_detr.js rename to src/models/model-processors/detr/image_processing_detr.js index 7c8a653b0..cd441764c 100644 --- a/src/models/detr/image_processing_detr.js +++ b/src/models/model-processors/detr/image_processing_detr.js @@ -3,21 +3,21 @@ import { post_process_object_detection, post_process_panoptic_segmentation, post_process_instance_segmentation, -} from '../../base/image_processors_utils.js'; +} from '../../../base/image_processors_utils.js'; -import { full } from '../../utils/tensor.js'; +import { full } from '../../../utils/tensor.js'; /** * @typedef {object} DetrFeatureExtractorResultProps - * @property {import('../../utils/tensor.js').Tensor} pixel_mask - * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult + * @property {import('../../../utils/tensor.js').Tensor} pixel_mask + * @typedef {import('../../../base/image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult */ export class DetrImageProcessor extends ImageProcessor { /** * Calls the feature extraction process on an array of images, preprocesses * each image, and concatenates the resulting features into a single Tensor. - * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from. + * @param {import('../../../utils/image.js').RawImage[]} images The image(s) to extract features from. * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. 
*/ async _call(images) { diff --git a/src/models/model-processors/dinov3_vit/image_processing_dinov3_vit.js b/src/models/model-processors/dinov3_vit/image_processing_dinov3_vit.js new file mode 100644 index 000000000..534120b78 --- /dev/null +++ b/src/models/model-processors/dinov3_vit/image_processing_dinov3_vit.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class DINOv3ViTImageProcessor extends ImageProcessor {} diff --git a/src/models/donut/image_processing_donut.js b/src/models/model-processors/donut/image_processing_donut.js similarity index 93% rename from src/models/donut/image_processing_donut.js rename to src/models/model-processors/donut/image_processing_donut.js index 353c0bdd4..e778ff350 100644 --- a/src/models/donut/image_processing_donut.js +++ b/src/models/model-processors/donut/image_processing_donut.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class DonutImageProcessor extends ImageProcessor { pad_image(pixelData, imgDims, padSize, options = {}) { diff --git a/src/models/dpt/image_processing_dpt.js b/src/models/model-processors/dpt/image_processing_dpt.js similarity index 67% rename from src/models/dpt/image_processing_dpt.js rename to src/models/model-processors/dpt/image_processing_dpt.js index c854ae6fd..7789d3bf2 100644 --- a/src/models/dpt/image_processing_dpt.js +++ b/src/models/model-processors/dpt/image_processing_dpt.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class DPTImageProcessor extends ImageProcessor {} export class DPTFeatureExtractor extends DPTImageProcessor {} // NOTE: extends DPTImageProcessor diff --git a/src/models/efficientnet/image_processing_efficientnet.js b/src/models/model-processors/efficientnet/image_processing_efficientnet.js similarity index 81% rename from src/models/efficientnet/image_processing_efficientnet.js rename to src/models/model-processors/efficientnet/image_processing_efficientnet.js index 837af8840..ef39f10aa 100644 --- a/src/models/efficientnet/image_processing_efficientnet.js +++ b/src/models/model-processors/efficientnet/image_processing_efficientnet.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class EfficientNetImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/encodec/feature_extraction_encodec.js b/src/models/model-processors/encodec/feature_extraction_encodec.js similarity index 87% rename from src/models/encodec/feature_extraction_encodec.js rename to src/models/model-processors/encodec/feature_extraction_encodec.js index bb60a5d63..83a1430a1 100644 --- a/src/models/encodec/feature_extraction_encodec.js +++ b/src/models/model-processors/encodec/feature_extraction_encodec.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class EncodecFeatureExtractor extends FeatureExtractor { /** diff --git a/src/models/florence2/processing_florence2.js 
b/src/models/model-processors/florence2/processing_florence2.js similarity index 97% rename from src/models/florence2/processing_florence2.js rename to src/models/model-processors/florence2/processing_florence2.js index 13edc10a0..e817a5b4d 100644 --- a/src/models/florence2/processing_florence2.js +++ b/src/models/model-processors/florence2/processing_florence2.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class Florence2Processor extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/gemma3n/feature_extraction_gemma3n.js b/src/models/model-processors/gemma3n/feature_extraction_gemma3n.js similarity index 95% rename from src/models/gemma3n/feature_extraction_gemma3n.js rename to src/models/model-processors/gemma3n/feature_extraction_gemma3n.js index 5d77ebf24..d6fc75a95 100644 --- a/src/models/gemma3n/feature_extraction_gemma3n.js +++ b/src/models/model-processors/gemma3n/feature_extraction_gemma3n.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { full, Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { full, Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class Gemma3nAudioFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/gemma3n/processing_gemma3n.js b/src/models/model-processors/gemma3n/processing_gemma3n.js similarity index 92% rename from src/models/gemma3n/processing_gemma3n.js rename to src/models/model-processors/gemma3n/processing_gemma3n.js index a3b47741d..e8162f6f6 100644 --- a/src/models/gemma3n/processing_gemma3n.js +++ b/src/models/model-processors/gemma3n/processing_gemma3n.js @@ -1,9 +1,9 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; -import { RawAudio } from '../../utils/audio.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; +import { RawAudio } from '../../../utils/audio.js'; export class Gemma3nProcessor extends Processor { static image_processor_class = AutoImageProcessor; diff --git a/src/models/model-processors/glpn/image_processing_glpn.js b/src/models/model-processors/glpn/image_processing_glpn.js new file mode 100644 index 000000000..cf6b23967 --- /dev/null +++ b/src/models/model-processors/glpn/image_processing_glpn.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class GLPNFeatureExtractor extends ImageProcessor {} diff --git a/src/models/grounding_dino/image_processing_grounding_dino.js b/src/models/model-processors/grounding_dino/image_processing_grounding_dino.js similarity index 60% rename from 
src/models/grounding_dino/image_processing_grounding_dino.js rename to src/models/model-processors/grounding_dino/image_processing_grounding_dino.js index 0042ab763..2eceb7bcc 100644 --- a/src/models/grounding_dino/image_processing_grounding_dino.js +++ b/src/models/model-processors/grounding_dino/image_processing_grounding_dino.js @@ -1,17 +1,17 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { ones } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { ones } from '../../../utils/tensor.js'; /** * @typedef {object} GroundingDinoFeatureExtractorResultProps - * @property {import('../../utils/tensor.js').Tensor} pixel_mask - * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult + * @property {import('../../../utils/tensor.js').Tensor} pixel_mask + * @typedef {import('../../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult */ export class GroundingDinoImageProcessor extends ImageProcessor { /** * Calls the feature extraction process on an array of images, preprocesses * each image, and concatenates the resulting features into a single Tensor. - * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from. + * @param {import('../../../utils/image.js').RawImage[]} images The image(s) to extract features from. * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. */ async _call(images) { diff --git a/src/models/grounding_dino/processing_grounding_dino.js b/src/models/model-processors/grounding_dino/processing_grounding_dino.js similarity index 85% rename from src/models/grounding_dino/processing_grounding_dino.js rename to src/models/model-processors/grounding_dino/processing_grounding_dino.js index 8ad0eade0..d1dea7b05 100644 --- a/src/models/grounding_dino/processing_grounding_dino.js +++ b/src/models/model-processors/grounding_dino/processing_grounding_dino.js @@ -1,12 +1,12 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { center_to_corners_format } from '../../base/image_processors_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { center_to_corners_format } from '../../../base/image_processors_utils.js'; /** * Get token ids of phrases from posmaps and input_ids. - * @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`. - * @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`. + * @param {import('../../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`. + * @param {import('../../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`. 
*/ function get_phrases_from_posmap(posmaps, input_ids) { const left_idx = 0; @@ -28,7 +28,7 @@ export class GroundingDinoProcessor extends Processor { static image_processor_class = AutoImageProcessor; /** - * @typedef {import('../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/image.js').RawImage} RawImage */ /** * diff --git a/src/models/idefics3/image_processing_idefics3.js b/src/models/model-processors/idefics3/image_processing_idefics3.js similarity index 96% rename from src/models/idefics3/image_processing_idefics3.js rename to src/models/model-processors/idefics3/image_processing_idefics3.js index a2ada459b..709208f4c 100644 --- a/src/models/idefics3/image_processing_idefics3.js +++ b/src/models/model-processors/idefics3/image_processing_idefics3.js @@ -1,5 +1,5 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { cat, full, interpolate_4d, slice, stack } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { cat, full, interpolate_4d, slice, stack } from '../../../utils/tensor.js'; export class Idefics3ImageProcessor extends ImageProcessor { constructor(config) { @@ -10,8 +10,8 @@ export class Idefics3ImageProcessor extends ImageProcessor { } /** - * @typedef {import('../../utils/image.js').RawImage} RawImage - * @typedef {import('../../utils/tensor.js').Tensor} Tensor + * @typedef {import('../../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/tensor.js').Tensor} Tensor */ /** diff --git a/src/models/idefics3/processing_idefics3.js b/src/models/model-processors/idefics3/processing_idefics3.js similarity index 95% rename from src/models/idefics3/processing_idefics3.js rename to src/models/model-processors/idefics3/processing_idefics3.js index fb8898cf8..423d4ebf6 100644 --- a/src/models/idefics3/processing_idefics3.js +++ b/src/models/model-processors/idefics3/processing_idefics3.js @@ -1,8 +1,8 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; -import { count } from '../../utils/core.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; +import { count } from '../../../utils/core.js'; /** * Prompt with expanded image tokens for when the image is split into patches. 
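The renames above all follow the same pattern: each processor module moves one directory deeper into src/models/model-processors/, so every relative import gains one extra '../', while the new barrel files (image-processors.js, feature-extractors.js) re-export the same class names as before. A minimal sketch of imports that should still resolve after the move, assuming a consuming module that sits directly under src/models/ (the consumer path is hypothetical; the exported names and file locations are taken from the diff):

// Hypothetical consumer located at src/models/some_module.js
import { ImageFeatureExtractor } from './feature-extractors.js'; // legacy alias for ImageProcessor, re-exported by the new barrel
import { CLIPImageProcessor } from './image-processors.js'; // re-exported from model-processors/clip/image_processing_clip.js
import { AutoImageProcessor } from './model-processors/auto/image_processing_auto.js'; // auto class after the one-level-deeper move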
diff --git a/src/models/janus/image_processing_janus.js b/src/models/model-processors/janus/image_processing_janus.js similarity index 89% rename from src/models/janus/image_processing_janus.js rename to src/models/model-processors/janus/image_processing_janus.js index 96d00e795..30c4d685b 100644 --- a/src/models/janus/image_processing_janus.js +++ b/src/models/model-processors/janus/image_processing_janus.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class VLMImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/janus/processing_janus.js b/src/models/model-processors/janus/processing_janus.js similarity index 89% rename from src/models/janus/processing_janus.js rename to src/models/model-processors/janus/processing_janus.js index 54c797f9f..3a3301f14 100644 --- a/src/models/janus/processing_janus.js +++ b/src/models/model-processors/janus/processing_janus.js @@ -1,9 +1,9 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { mergeArrays } from '../../utils/core.js'; -import { Tensor } from '../../utils/tensor.js'; -import { RawImage } from '../../utils/image.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { mergeArrays } from '../../../utils/core.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { RawImage } from '../../../utils/image.js'; export class VLChatProcessor extends Processor { static image_processor_class = AutoImageProcessor; @@ -22,7 +22,7 @@ export class VLChatProcessor extends Processor { /** * @typedef {Object} MultimodalMessageProperties Additional properties for multimodal messages. * @property {(RawImage | string | URL)[]} [images] The images in the message. - * @typedef {(import('../../tokenizers.js').Message & MultimodalMessageProperties)[]} MultimodalConversation The conversation possibly containing multimodal inputs. + * @typedef {(import('../../../tokenizers.js').Message & MultimodalMessageProperties)[]} MultimodalConversation The conversation possibly containing multimodal inputs. */ /** @@ -38,7 +38,7 @@ export class VLChatProcessor extends Processor { * @param {Object} options Additional options for processing. * @param {RawImage|RawImage[]} [options.images] The images to process, if not set in the conversation. * @param {string} [options.chat_template="default"] The chat template to use. - * @returns {Promise} The processed input. + * @returns {Promise} The processed input. 
*/ async _call(conversation, { images = null, chat_template = 'default' } = {}) { if (!images) { diff --git a/src/models/jina_clip/image_processing_jina_clip.js b/src/models/model-processors/jina_clip/image_processing_jina_clip.js similarity index 91% rename from src/models/jina_clip/image_processing_jina_clip.js rename to src/models/model-processors/jina_clip/image_processing_jina_clip.js index 7b901f5ee..9613a3710 100644 --- a/src/models/jina_clip/image_processing_jina_clip.js +++ b/src/models/model-processors/jina_clip/image_processing_jina_clip.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class JinaCLIPImageProcessor extends ImageProcessor { constructor(config) { diff --git a/src/models/jina_clip/processing_jina_clip.js b/src/models/model-processors/jina_clip/processing_jina_clip.js similarity index 84% rename from src/models/jina_clip/processing_jina_clip.js rename to src/models/model-processors/jina_clip/processing_jina_clip.js index ef3d3ffd8..e0dadb756 100644 --- a/src/models/jina_clip/processing_jina_clip.js +++ b/src/models/model-processors/jina_clip/processing_jina_clip.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class JinaCLIPProcessor extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/llava/processing_llava.js b/src/models/model-processors/llava/processing_llava.js similarity index 87% rename from src/models/llava/processing_llava.js rename to src/models/model-processors/llava/processing_llava.js index 4f70edca8..849c82c54 100644 --- a/src/models/llava/processing_llava.js +++ b/src/models/model-processors/llava/processing_llava.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class LlavaProcessor extends Processor { static tokenizer_class = AutoTokenizer; @@ -8,7 +8,7 @@ export class LlavaProcessor extends Processor { static uses_processor_config = true; /** - * @typedef {import('../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/image.js').RawImage} RawImage */ // `images` is required, `text` is optional diff --git a/src/models/model-processors/llava_onevision/image_processing_llava_onevision.js b/src/models/model-processors/llava_onevision/image_processing_llava_onevision.js new file mode 100644 index 000000000..1caf43c6b --- /dev/null +++ b/src/models/model-processors/llava_onevision/image_processing_llava_onevision.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class LlavaOnevisionImageProcessor extends ImageProcessor {} diff --git a/src/models/mask2former/image_processing_mask2former.js b/src/models/model-processors/mask2former/image_processing_mask2former.js similarity index 100% rename from src/models/mask2former/image_processing_mask2former.js rename to src/models/model-processors/mask2former/image_processing_mask2former.js diff --git 
a/src/models/maskformer/image_processing_maskformer.js b/src/models/model-processors/maskformer/image_processing_maskformer.js similarity index 92% rename from src/models/maskformer/image_processing_maskformer.js rename to src/models/model-processors/maskformer/image_processing_maskformer.js index 95e70f6e8..5996c913f 100644 --- a/src/models/maskformer/image_processing_maskformer.js +++ b/src/models/model-processors/maskformer/image_processing_maskformer.js @@ -2,7 +2,7 @@ import { ImageProcessor, post_process_panoptic_segmentation, post_process_instance_segmentation, -} from '../../base/image_processors_utils.js'; +} from '../../../base/image_processors_utils.js'; export class MaskFormerImageProcessor extends ImageProcessor { /** @type {typeof post_process_panoptic_segmentation} */ diff --git a/src/models/mgp_str/processing_mgp_str.js b/src/models/model-processors/mgp_str/processing_mgp_str.js similarity index 89% rename from src/models/mgp_str/processing_mgp_str.js rename to src/models/model-processors/mgp_str/processing_mgp_str.js index a312b17d2..763f7d579 100644 --- a/src/models/mgp_str/processing_mgp_str.js +++ b/src/models/model-processors/mgp_str/processing_mgp_str.js @@ -1,7 +1,7 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { max, softmax } from '../../utils/maths.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { max, softmax } from '../../../utils/maths.js'; const DECODE_TYPE_MAPPING = { char: ['char_decode', 1], @@ -13,21 +13,21 @@ export class MgpstrProcessor extends Processor { static image_processor_class = AutoImageProcessor; /** - * @returns {import('../../tokenizers.js').MgpstrTokenizer} The character tokenizer. + * @returns {import('../../../tokenizers.js').MgpstrTokenizer} The character tokenizer. */ get char_tokenizer() { return this.components.char_tokenizer; } /** - * @returns {import('../../tokenizers.js').GPT2Tokenizer} The BPE tokenizer. + * @returns {import('../../../tokenizers.js').GPT2Tokenizer} The BPE tokenizer. */ get bpe_tokenizer() { return this.components.bpe_tokenizer; } /** - * @returns {import('../../tokenizers.js').BertTokenizer} The WordPiece tokenizer. + * @returns {import('../../../tokenizers.js').BertTokenizer} The WordPiece tokenizer. */ get wp_tokenizer() { return this.components.wp_tokenizer; @@ -35,7 +35,7 @@ export class MgpstrProcessor extends Processor { /** * Helper function to decode the model prediction logits. - * @param {import('../../utils/tensor.js').Tensor} pred_logits Model prediction logits. + * @param {import('../../../utils/tensor.js').Tensor} pred_logits Model prediction logits. * @param {string} format Type of model prediction. Must be one of ['char', 'bpe', 'wp']. * @returns {[string[], number[]]} The decoded sentences and their confidence scores. */ @@ -108,7 +108,7 @@ export class MgpstrProcessor extends Processor { /** * Convert a list of lists of token ids into a list of strings by calling decode. - * @param {[import('../../utils/tensor.js').Tensor, import('../../utils/tensor.js').Tensor, import('../../utils/tensor.js').Tensor]} sequences List of tokenized input ids. + * @param {[import('../../../utils/tensor.js').Tensor, import('../../../utils/tensor.js').Tensor, import('../../../utils/tensor.js').Tensor]} sequences List of tokenized input ids. 
* @returns {{generated_text: string[], scores: number[], char_preds: string[], bpe_preds: string[], wp_preds: string[]}} * Dictionary of all the outputs of the decoded results. * - generated_text: The final results after fusion of char, bpe, and wp. diff --git a/src/models/mobilenet_v1/image_processing_mobilenet_v1.js b/src/models/model-processors/mobilenet_v1/image_processing_mobilenet_v1.js similarity index 66% rename from src/models/mobilenet_v1/image_processing_mobilenet_v1.js rename to src/models/model-processors/mobilenet_v1/image_processing_mobilenet_v1.js index d11dbed3d..4913f7033 100644 --- a/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +++ b/src/models/model-processors/mobilenet_v1/image_processing_mobilenet_v1.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV1ImageProcessor extends ImageProcessor {} export class MobileNetV1FeatureExtractor extends MobileNetV1ImageProcessor {} diff --git a/src/models/mobilenet_v2/image_processing_mobilenet_v2.js b/src/models/model-processors/mobilenet_v2/image_processing_mobilenet_v2.js similarity index 66% rename from src/models/mobilenet_v2/image_processing_mobilenet_v2.js rename to src/models/model-processors/mobilenet_v2/image_processing_mobilenet_v2.js index 687d888ca..34a8a2ac2 100644 --- a/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +++ b/src/models/model-processors/mobilenet_v2/image_processing_mobilenet_v2.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV2ImageProcessor extends ImageProcessor {} export class MobileNetV2FeatureExtractor extends MobileNetV2ImageProcessor {} diff --git a/src/models/mobilenet_v3/image_processing_mobilenet_v3.js b/src/models/model-processors/mobilenet_v3/image_processing_mobilenet_v3.js similarity index 66% rename from src/models/mobilenet_v3/image_processing_mobilenet_v3.js rename to src/models/model-processors/mobilenet_v3/image_processing_mobilenet_v3.js index 0da34b58b..1859ef91d 100644 --- a/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +++ b/src/models/model-processors/mobilenet_v3/image_processing_mobilenet_v3.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV3ImageProcessor extends ImageProcessor {} export class MobileNetV3FeatureExtractor extends MobileNetV3ImageProcessor {} diff --git a/src/models/mobilenet_v4/image_processing_mobilenet_v4.js b/src/models/model-processors/mobilenet_v4/image_processing_mobilenet_v4.js similarity index 66% rename from src/models/mobilenet_v4/image_processing_mobilenet_v4.js rename to src/models/model-processors/mobilenet_v4/image_processing_mobilenet_v4.js index c838ffbdb..d7f8e3e0a 100644 --- a/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +++ b/src/models/model-processors/mobilenet_v4/image_processing_mobilenet_v4.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileNetV4ImageProcessor extends ImageProcessor {} export class MobileNetV4FeatureExtractor extends MobileNetV4ImageProcessor {} diff --git a/src/models/mobilevit/image_processing_mobilevit.js 
b/src/models/model-processors/mobilevit/image_processing_mobilevit.js similarity index 65% rename from src/models/mobilevit/image_processing_mobilevit.js rename to src/models/model-processors/mobilevit/image_processing_mobilevit.js index df2877ca4..164d71f49 100644 --- a/src/models/mobilevit/image_processing_mobilevit.js +++ b/src/models/model-processors/mobilevit/image_processing_mobilevit.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class MobileViTImageProcessor extends ImageProcessor {} export class MobileViTFeatureExtractor extends MobileViTImageProcessor {} diff --git a/src/models/moonshine/feature_extraction_moonshine.js b/src/models/model-processors/moonshine/feature_extraction_moonshine.js similarity index 82% rename from src/models/moonshine/feature_extraction_moonshine.js rename to src/models/model-processors/moonshine/feature_extraction_moonshine.js index 6702d9021..55912ed4e 100644 --- a/src/models/moonshine/feature_extraction_moonshine.js +++ b/src/models/model-processors/moonshine/feature_extraction_moonshine.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class MoonshineFeatureExtractor extends FeatureExtractor { /** diff --git a/src/models/moonshine/processing_moonshine.js b/src/models/model-processors/moonshine/processing_moonshine.js similarity index 84% rename from src/models/moonshine/processing_moonshine.js rename to src/models/model-processors/moonshine/processing_moonshine.js index 895c276b6..65d1d7e34 100644 --- a/src/models/moonshine/processing_moonshine.js +++ b/src/models/model-processors/moonshine/processing_moonshine.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a MoonshineProcessor that extracts features from an audio input. 
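(Editor's illustration — not part of the patch.) The renames above move every processor one directory deeper, under src/models/model-processors/, so imports of src-level modules such as base/, tokenizers.js and utils/ gain one extra '../' (now '../../../'), while imports of the auto registries keep their '../auto/...' form, which suggests those registries live alongside the processors in the new directory. A hypothetical module following the new layout might look like the sketch below; the class and file names are invented, and the static-field and _call pattern simply mirrors the Llava and OwlViT processors shown elsewhere in this diff.

// src/models/model-processors/my_model/processing_my_model.js (hypothetical, for illustration only)
import { Processor } from '../../../base/processing_utils.js'; // one extra '../' after the move
import { AutoImageProcessor } from '../auto/image_processing_auto.js'; // sibling path unchanged
import { AutoTokenizer } from '../../../tokenizers.js';

export class MyModelProcessor extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  // Preprocess the images, tokenize the text (if any), and merge the two sets of model inputs.
  async _call(images, text = null, kwargs = {}) {
    const image_inputs = await this.image_processor(images, kwargs);
    const text_inputs = text ? this.tokenizer(text, kwargs) : {};
    return { ...image_inputs, ...text_inputs };
  }
}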
diff --git a/src/models/nougat/image_processing_nougat.js b/src/models/model-processors/nougat/image_processing_nougat.js similarity index 100% rename from src/models/nougat/image_processing_nougat.js rename to src/models/model-processors/nougat/image_processing_nougat.js diff --git a/src/models/owlv2/image_processing_owlv2.js b/src/models/model-processors/owlv2/image_processing_owlv2.js similarity index 100% rename from src/models/owlv2/image_processing_owlv2.js rename to src/models/model-processors/owlv2/image_processing_owlv2.js diff --git a/src/models/owlvit/image_processing_owlvit.js b/src/models/model-processors/owlvit/image_processing_owlvit.js similarity index 89% rename from src/models/owlvit/image_processing_owlvit.js rename to src/models/model-processors/owlvit/image_processing_owlvit.js index 767c312f8..2bcd147fb 100644 --- a/src/models/owlvit/image_processing_owlvit.js +++ b/src/models/model-processors/owlvit/image_processing_owlvit.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_object_detection } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_object_detection } from '../../../base/image_processors_utils.js'; export class OwlViTImageProcessor extends ImageProcessor { /** @type {typeof post_process_object_detection} */ diff --git a/src/models/owlvit/processing_owlvit.js b/src/models/model-processors/owlvit/processing_owlvit.js similarity index 65% rename from src/models/owlvit/processing_owlvit.js rename to src/models/model-processors/owlvit/processing_owlvit.js index 6f673746e..8722b5af3 100644 --- a/src/models/owlvit/processing_owlvit.js +++ b/src/models/model-processors/owlvit/processing_owlvit.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; export class OwlViTProcessor extends Processor { static tokenizer_class = AutoTokenizer; static image_processor_class = AutoImageProcessor; diff --git a/src/models/paligemma/processing_paligemma.js b/src/models/model-processors/paligemma/processing_paligemma.js similarity index 93% rename from src/models/paligemma/processing_paligemma.js rename to src/models/model-processors/paligemma/processing_paligemma.js index 1aa3b1167..68dc83bfb 100644 --- a/src/models/paligemma/processing_paligemma.js +++ b/src/models/model-processors/paligemma/processing_paligemma.js @@ -1,6 +1,6 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; const IMAGE_TOKEN = ''; @@ -14,7 +14,7 @@ export class PaliGemmaProcessor extends Processor { static uses_processor_config = false; /** - * @typedef {import('../../utils/image.js').RawImage} RawImage + * @typedef {import('../../../utils/image.js').RawImage} RawImage */ // `images` is required, `text` is optional diff --git a/src/models/parakeet/feature_extraction_parakeet.js b/src/models/model-processors/parakeet/feature_extraction_parakeet.js similarity index 96% rename from src/models/parakeet/feature_extraction_parakeet.js rename to src/models/model-processors/parakeet/feature_extraction_parakeet.js index e4862270b..c986c8b3d 100644 --- 
a/src/models/parakeet/feature_extraction_parakeet.js +++ b/src/models/model-processors/parakeet/feature_extraction_parakeet.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; const EPSILON = 1e-5; diff --git a/src/models/phi3_v/image_processing_phi3_v.js b/src/models/model-processors/phi3_v/image_processing_phi3_v.js similarity index 98% rename from src/models/phi3_v/image_processing_phi3_v.js rename to src/models/model-processors/phi3_v/image_processing_phi3_v.js index 50a804e98..eb52c96c3 100644 --- a/src/models/phi3_v/image_processing_phi3_v.js +++ b/src/models/model-processors/phi3_v/image_processing_phi3_v.js @@ -1,5 +1,5 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { cat, interpolate_4d, slice, stack, Tensor } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { cat, interpolate_4d, slice, stack, Tensor } from '../../../utils/tensor.js'; const IMAGE_SIZE = 336; const SLICE_AXES = [2, 3]; // axes to slice on diff --git a/src/models/phi3_v/processing_phi3_v.js b/src/models/model-processors/phi3_v/processing_phi3_v.js similarity index 91% rename from src/models/phi3_v/processing_phi3_v.js rename to src/models/model-processors/phi3_v/processing_phi3_v.js index 18d3eb15a..5fb681330 100644 --- a/src/models/phi3_v/processing_phi3_v.js +++ b/src/models/model-processors/phi3_v/processing_phi3_v.js @@ -1,7 +1,7 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; const IMAGE_TOKEN = '<|image|>'; const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g; diff --git a/src/models/model-processors/pvt/image_processing_pvt.js b/src/models/model-processors/pvt/image_processing_pvt.js new file mode 100644 index 000000000..2aa79cd4c --- /dev/null +++ b/src/models/model-processors/pvt/image_processing_pvt.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class PvtImageProcessor extends ImageProcessor {} diff --git a/src/models/pyannote/feature_extraction_pyannote.js b/src/models/model-processors/pyannote/feature_extraction_pyannote.js similarity index 89% rename from src/models/pyannote/feature_extraction_pyannote.js rename to src/models/model-processors/pyannote/feature_extraction_pyannote.js index 3231a8ea9..dd4aaaed8 100644 --- a/src/models/pyannote/feature_extraction_pyannote.js +++ b/src/models/model-processors/pyannote/feature_extraction_pyannote.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { max, softmax } from '../../utils/maths.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; 
+import { Tensor } from '../../../utils/tensor.js'; +import { max, softmax } from '../../../utils/maths.js'; export class PyAnnoteFeatureExtractor extends FeatureExtractor { /** @@ -32,7 +32,7 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor { /** * Post-processes the speaker diarization logits output by the model. - * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model. + * @param {import('../../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model. * @param {number} num_samples Number of samples in the input audio. * @returns {Array>} The post-processed speaker diarization results. */ diff --git a/src/models/pyannote/processing_pyannote.js b/src/models/model-processors/pyannote/processing_pyannote.js similarity index 93% rename from src/models/pyannote/processing_pyannote.js rename to src/models/model-processors/pyannote/processing_pyannote.js index e6f19cdb3..d49d04142 100644 --- a/src/models/pyannote/processing_pyannote.js +++ b/src/models/model-processors/pyannote/processing_pyannote.js @@ -1,4 +1,4 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js'; export class PyAnnoteProcessor extends Processor { diff --git a/src/models/qwen2_vl/image_processing_qwen2_vl.js b/src/models/model-processors/qwen2_vl/image_processing_qwen2_vl.js similarity index 92% rename from src/models/qwen2_vl/image_processing_qwen2_vl.js rename to src/models/model-processors/qwen2_vl/image_processing_qwen2_vl.js index 809b24566..f6a77f436 100644 --- a/src/models/qwen2_vl/image_processing_qwen2_vl.js +++ b/src/models/model-processors/qwen2_vl/image_processing_qwen2_vl.js @@ -1,5 +1,5 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { cat, Tensor } from '../../utils/tensor.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { cat, Tensor } from '../../../utils/tensor.js'; export class Qwen2VLImageProcessor extends ImageProcessor { async _call(images, ...args) { diff --git a/src/models/qwen2_vl/processing_qwen2_vl.js b/src/models/model-processors/qwen2_vl/processing_qwen2_vl.js similarity index 90% rename from src/models/qwen2_vl/processing_qwen2_vl.js rename to src/models/model-processors/qwen2_vl/processing_qwen2_vl.js index dd4775721..cf53b3807 100644 --- a/src/models/qwen2_vl/processing_qwen2_vl.js +++ b/src/models/model-processors/qwen2_vl/processing_qwen2_vl.js @@ -1,7 +1,7 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { RawImage } from '../../utils/image.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { RawImage } from '../../../utils/image.js'; export class Qwen2VLProcessor extends Processor { static image_processor_class = AutoImageProcessor; diff --git a/src/models/rt_detr/image_processing_rt_detr.js b/src/models/model-processors/rt_detr/image_processing_rt_detr.js similarity index 87% rename from src/models/rt_detr/image_processing_rt_detr.js rename to src/models/model-processors/rt_detr/image_processing_rt_detr.js index 1a8f8ee2d..da4b7b0e3 100644 --- a/src/models/rt_detr/image_processing_rt_detr.js +++ 
b/src/models/model-processors/rt_detr/image_processing_rt_detr.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_object_detection } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_object_detection } from '../../../base/image_processors_utils.js'; export class RTDetrImageProcessor extends ImageProcessor { /** @type {typeof post_process_object_detection} */ diff --git a/src/models/sam/image_processing_sam.js b/src/models/model-processors/sam/image_processing_sam.js similarity index 93% rename from src/models/sam/image_processing_sam.js rename to src/models/model-processors/sam/image_processing_sam.js index d4fa856ff..14c8276db 100644 --- a/src/models/sam/image_processing_sam.js +++ b/src/models/model-processors/sam/image_processing_sam.js @@ -1,13 +1,13 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; -import { calculateDimensions } from '../../utils/core.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; +import { calculateDimensions } from '../../../utils/core.js'; -import { interpolate_4d, Tensor } from '../../utils/tensor.js'; +import { interpolate_4d, Tensor } from '../../../utils/tensor.js'; /** * @typedef {object} SamImageProcessorResult * @property {Tensor} pixel_values - * @property {import("../../base/image_processors_utils.js").HeightWidth[]} original_sizes - * @property {import("../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes + * @property {import("../../../base/image_processors_utils.js").HeightWidth[]} original_sizes + * @property {import("../../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes * @property {Tensor} [input_points] * @property {Tensor} [input_labels] * @property {Tensor} [input_boxes] @@ -17,8 +17,8 @@ export class SamImageProcessor extends ImageProcessor { /** * * @param {any} input_points - * @param {import("../../base/image_processors_utils.js").HeightWidth[]} original_sizes - * @param {import("../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes + * @param {import("../../../base/image_processors_utils.js").HeightWidth[]} original_sizes + * @param {import("../../../base/image_processors_utils.js").HeightWidth[]} reshaped_input_sizes * @returns {Tensor} */ reshape_input_points(input_points, original_sizes, reshaped_input_sizes, is_bounding_box = false) { @@ -204,7 +204,7 @@ export class SamImageProcessor extends ImageProcessor { /** * Generates a list of crop boxes of different sizes. Each layer has (2**i)**2 boxes for the ith layer. - * @param {import("../../utils/image.js").RawImage} image Input original image + * @param {import("../../../utils/image.js").RawImage} image Input original image * @param {number} target_size Target size of the resized image * @param {Object} options Options for generating crop boxes * @param {number} [options.crop_n_layers] If >0, mask prediction will be run again on crops of the image. 
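(Editor's illustration — not part of the patch.) The SamImageProcessor hunk above carries over reshape_input_points, which maps user-supplied point prompts from original-image coordinates into the coordinates of the resized model input. Assuming it scales each prompt by the ratio of the reshaped to the original dimensions, and assuming the [height, width] ordering implied by the HeightWidth typedef, the arithmetic for a single point looks like the following sketch (the function name and the example sizes are illustrative, not taken from the patch):

// Rescale one (x, y) prompt from original-image coordinates into model-input coordinates.
function rescale_point([x, y], original_size, reshaped_input_size) {
  const [origH, origW] = original_size;
  const [newH, newW] = reshaped_input_size;
  return [x * (newW / origW), y * (newH / origH)];
}

// Example: a click at (500, 375) on a photo that is 2250 wide by 1500 high
// ([1500, 2250] in [height, width] terms), resized so its longest side is 1024
// ([683, 1024]), lands at roughly (227.56, 170.75) in model coordinates.
rescale_point([500, 375], [1500, 2250], [683, 1024]);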
diff --git a/src/models/sam/processing_sam.js b/src/models/model-processors/sam/processing_sam.js similarity index 89% rename from src/models/sam/processing_sam.js rename to src/models/model-processors/sam/processing_sam.js index de9e856ce..077db3458 100644 --- a/src/models/sam/processing_sam.js +++ b/src/models/model-processors/sam/processing_sam.js @@ -1,4 +1,4 @@ -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; import { AutoImageProcessor } from '../auto/image_processing_auto.js'; export class SamProcessor extends Processor { diff --git a/src/models/sam2/image_processing_sam2.js b/src/models/model-processors/sam2/image_processing_sam2.js similarity index 100% rename from src/models/sam2/image_processing_sam2.js rename to src/models/model-processors/sam2/image_processing_sam2.js diff --git a/src/models/sam2/processing_sam2.js b/src/models/model-processors/sam2/processing_sam2.js similarity index 100% rename from src/models/sam2/processing_sam2.js rename to src/models/model-processors/sam2/processing_sam2.js diff --git a/src/models/sam3/image_processing_sam3.js b/src/models/model-processors/sam3/image_processing_sam3.js similarity index 100% rename from src/models/sam3/image_processing_sam3.js rename to src/models/model-processors/sam3/image_processing_sam3.js diff --git a/src/models/sapiens/image_processing_sapiens.js b/src/models/model-processors/sapiens/image_processing_sapiens.js similarity index 89% rename from src/models/sapiens/image_processing_sapiens.js rename to src/models/model-processors/sapiens/image_processing_sapiens.js index 15b755ff5..9df6582bd 100644 --- a/src/models/sapiens/image_processing_sapiens.js +++ b/src/models/model-processors/sapiens/image_processing_sapiens.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_semantic_segmentation } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_semantic_segmentation } from '../../../base/image_processors_utils.js'; export class SapiensImageProcessor extends ImageProcessor { /** @type {typeof post_process_semantic_segmentation} */ diff --git a/src/models/seamless_m4t/feature_extraction_seamless_m4t.js b/src/models/model-processors/seamless_m4t/feature_extraction_seamless_m4t.js similarity index 97% rename from src/models/seamless_m4t/feature_extraction_seamless_m4t.js rename to src/models/model-processors/seamless_m4t/feature_extraction_seamless_m4t.js index 41d9b7f4e..32c623239 100644 --- a/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +++ b/src/models/model-processors/seamless_m4t/feature_extraction_seamless_m4t.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class SeamlessM4TFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/segformer/image_processing_segformer.js b/src/models/model-processors/segformer/image_processing_segformer.js similarity index 89% rename from src/models/segformer/image_processing_segformer.js rename to src/models/model-processors/segformer/image_processing_segformer.js index 
1d23bc045..969af8eb8 100644 --- a/src/models/segformer/image_processing_segformer.js +++ b/src/models/model-processors/segformer/image_processing_segformer.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_semantic_segmentation } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_semantic_segmentation } from '../../../base/image_processors_utils.js'; export class SegformerImageProcessor extends ImageProcessor { /** @type {typeof post_process_semantic_segmentation} */ diff --git a/src/models/model-processors/siglip/image_processing_siglip.js b/src/models/model-processors/siglip/image_processing_siglip.js new file mode 100644 index 000000000..3c126f5f1 --- /dev/null +++ b/src/models/model-processors/siglip/image_processing_siglip.js @@ -0,0 +1,3 @@ +import { ImageProcessor } from '../../../base/image_processors_utils.js'; + +export class SiglipImageProcessor extends ImageProcessor {} diff --git a/src/models/smolvlm/image_processing_smolvlm.js b/src/models/model-processors/smolvlm/image_processing_smolvlm.js similarity index 100% rename from src/models/smolvlm/image_processing_smolvlm.js rename to src/models/model-processors/smolvlm/image_processing_smolvlm.js diff --git a/src/models/smolvlm/processing_smolvlm.js b/src/models/model-processors/smolvlm/processing_smolvlm.js similarity index 100% rename from src/models/smolvlm/processing_smolvlm.js rename to src/models/model-processors/smolvlm/processing_smolvlm.js diff --git a/src/models/snac/feature_extraction_snac.js b/src/models/model-processors/snac/feature_extraction_snac.js similarity index 100% rename from src/models/snac/feature_extraction_snac.js rename to src/models/model-processors/snac/feature_extraction_snac.js diff --git a/src/models/model-processors/speecht5/feature_extraction_speecht5.js b/src/models/model-processors/speecht5/feature_extraction_speecht5.js new file mode 100644 index 000000000..669d3e1f5 --- /dev/null +++ b/src/models/model-processors/speecht5/feature_extraction_speecht5.js @@ -0,0 +1,3 @@ +import { FeatureExtractor } from '../../../base/feature_extraction_utils.js'; + +export class SpeechT5FeatureExtractor extends FeatureExtractor {} diff --git a/src/models/speecht5/processing_speecht5.js b/src/models/model-processors/speecht5/processing_speecht5.js similarity index 82% rename from src/models/speecht5/processing_speecht5.js rename to src/models/model-processors/speecht5/processing_speecht5.js index 72824b2b0..2be9e8f42 100644 --- a/src/models/speecht5/processing_speecht5.js +++ b/src/models/model-processors/speecht5/processing_speecht5.js @@ -1,5 +1,5 @@ -import { Processor } from '../../base/processing_utils.js'; -import { AutoTokenizer } from '../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; export class SpeechT5Processor extends Processor { diff --git a/src/models/swin2sr/image_processing_swin2sr.js b/src/models/model-processors/swin2sr/image_processing_swin2sr.js similarity index 94% rename from src/models/swin2sr/image_processing_swin2sr.js rename to src/models/model-processors/swin2sr/image_processing_swin2sr.js index 7e5c810d1..c71ad06cf 100644 --- a/src/models/swin2sr/image_processing_swin2sr.js +++ b/src/models/model-processors/swin2sr/image_processing_swin2sr.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from 
'../../../base/image_processors_utils.js'; export class Swin2SRImageProcessor extends ImageProcessor { pad_image(pixelData, imgDims, padSize, options = {}) { diff --git a/src/models/ultravox/processing_ultravox.js b/src/models/model-processors/ultravox/processing_ultravox.js similarity index 94% rename from src/models/ultravox/processing_ultravox.js rename to src/models/model-processors/ultravox/processing_ultravox.js index 80bee8ec8..05fe8d16f 100644 --- a/src/models/ultravox/processing_ultravox.js +++ b/src/models/model-processors/ultravox/processing_ultravox.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a UltravoxProcessor that extracts features from an audio input. diff --git a/src/models/vit/image_processing_vit.js b/src/models/model-processors/vit/image_processing_vit.js similarity index 61% rename from src/models/vit/image_processing_vit.js rename to src/models/model-processors/vit/image_processing_vit.js index 63864507f..15b63aef2 100644 --- a/src/models/vit/image_processing_vit.js +++ b/src/models/model-processors/vit/image_processing_vit.js @@ -1,4 +1,4 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class ViTImageProcessor extends ImageProcessor {} export class ViTFeatureExtractor extends ViTImageProcessor {} diff --git a/src/models/vitmatte/image_processing_vitmatte.js b/src/models/model-processors/vitmatte/image_processing_vitmatte.js similarity index 72% rename from src/models/vitmatte/image_processing_vitmatte.js rename to src/models/model-processors/vitmatte/image_processing_vitmatte.js index d08b9b132..78ac2098e 100644 --- a/src/models/vitmatte/image_processing_vitmatte.js +++ b/src/models/model-processors/vitmatte/image_processing_vitmatte.js @@ -1,14 +1,14 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; -import { stack, cat } from '../../utils/tensor.js'; +import { stack, cat } from '../../../utils/tensor.js'; export class VitMatteImageProcessor extends ImageProcessor { /** * Calls the feature extraction process on an array of images, preprocesses * each image, and concatenates the resulting features into a single Tensor. - * @param {import("../../utils/image.js").RawImage[]} images The image(s) to extract features from. - * @param {import("../../utils/image.js").RawImage[]} trimaps The trimaps(s) to extract features from. - * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. + * @param {import("../../../utils/image.js").RawImage[]} images The image(s) to extract features from. + * @param {import("../../../utils/image.js").RawImage[]} trimaps The trimaps(s) to extract features from. + * @returns {Promise} An object containing the concatenated pixel values of the preprocessed images. 
*/ async _call(images, trimaps) { if (!Array.isArray(images)) { diff --git a/src/models/vitpose/image_processing_vitpose.js b/src/models/model-processors/vitpose/image_processing_vitpose.js similarity index 95% rename from src/models/vitpose/image_processing_vitpose.js rename to src/models/model-processors/vitpose/image_processing_vitpose.js index c19c486cd..cd6335534 100644 --- a/src/models/vitpose/image_processing_vitpose.js +++ b/src/models/model-processors/vitpose/image_processing_vitpose.js @@ -1,11 +1,11 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; +import { ImageProcessor } from '../../../base/image_processors_utils.js'; export class VitPoseImageProcessor extends ImageProcessor { /** * Transform the heatmaps into keypoint predictions and transform them back to the image. * NOTE: This is a naive implementation and does not include advanced post-processing techniques, * so the results may not be as accurate as the original implementation. - * @param {import('../../utils/tensor.js').Tensor} outputs The model outputs. + * @param {import('../../../utils/tensor.js').Tensor} outputs The model outputs. * @param {[number, number, number, number][][]} boxes List or array of bounding boxes for each image. * Each box should be a list of 4 floats representing the bounding box coordinates in COCO format (top_left_x, top_left_y, width, height). * @returns {{ diff --git a/src/models/voxtral/processing_voxtral.js b/src/models/model-processors/voxtral/processing_voxtral.js similarity index 95% rename from src/models/voxtral/processing_voxtral.js rename to src/models/model-processors/voxtral/processing_voxtral.js index d002f4eca..f3d379c43 100644 --- a/src/models/voxtral/processing_voxtral.js +++ b/src/models/model-processors/voxtral/processing_voxtral.js @@ -1,7 +1,7 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; -import { cat } from '../../utils/tensor.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; +import { cat } from '../../../utils/tensor.js'; const AUDIO_TOKEN = '[AUDIO]'; const BEGIN_AUDIO_TOKEN = '[BEGIN_AUDIO]'; diff --git a/src/models/wav2vec2/feature_extraction_wav2vec2.js b/src/models/model-processors/wav2vec2/feature_extraction_wav2vec2.js similarity index 91% rename from src/models/wav2vec2/feature_extraction_wav2vec2.js rename to src/models/model-processors/wav2vec2/feature_extraction_wav2vec2.js index 194b71359..83688169c 100644 --- a/src/models/wav2vec2/feature_extraction_wav2vec2.js +++ b/src/models/model-processors/wav2vec2/feature_extraction_wav2vec2.js @@ -1,5 +1,5 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; export class Wav2Vec2FeatureExtractor extends FeatureExtractor { /** diff --git a/src/models/wav2vec2/processing_wav2vec2.js b/src/models/model-processors/wav2vec2/processing_wav2vec2.js similarity index 82% rename from src/models/wav2vec2/processing_wav2vec2.js rename to src/models/model-processors/wav2vec2/processing_wav2vec2.js index 583eee66e..553dcb67d 100644 --- a/src/models/wav2vec2/processing_wav2vec2.js +++ 
b/src/models/model-processors/wav2vec2/processing_wav2vec2.js @@ -1,6 +1,6 @@ -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; export class Wav2Vec2Processor extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js b/src/models/model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js similarity index 82% rename from src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js rename to src/models/model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js index 157aca474..b768817b1 100644 --- a/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +++ b/src/models/model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js @@ -1,6 +1,6 @@ -import { AutoTokenizer } from '../../tokenizers.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { Processor } from '../../base/processing_utils.js'; +import { Processor } from '../../../base/processing_utils.js'; export class Wav2Vec2ProcessorWithLM extends Processor { static tokenizer_class = AutoTokenizer; diff --git a/src/models/wespeaker/feature_extraction_wespeaker.js b/src/models/model-processors/wespeaker/feature_extraction_wespeaker.js similarity index 95% rename from src/models/wespeaker/feature_extraction_wespeaker.js rename to src/models/model-processors/wespeaker/feature_extraction_wespeaker.js index 81145c0d6..d8439ca32 100644 --- a/src/models/wespeaker/feature_extraction_wespeaker.js +++ b/src/models/model-processors/wespeaker/feature_extraction_wespeaker.js @@ -1,6 +1,6 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; +import { FeatureExtractor, validate_audio_inputs } from '../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; export class WeSpeakerFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/whisper/common_whisper.js b/src/models/model-processors/whisper/common_whisper.js similarity index 100% rename from src/models/whisper/common_whisper.js rename to src/models/model-processors/whisper/common_whisper.js diff --git a/src/models/whisper/feature_extraction_whisper.js b/src/models/model-processors/whisper/feature_extraction_whisper.js similarity index 93% rename from src/models/whisper/feature_extraction_whisper.js rename to src/models/model-processors/whisper/feature_extraction_whisper.js index 0f8c85fc7..d049fe0f6 100644 --- a/src/models/whisper/feature_extraction_whisper.js +++ b/src/models/model-processors/whisper/feature_extraction_whisper.js @@ -1,7 +1,7 @@ -import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js'; -import { Tensor } from '../../utils/tensor.js'; -import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js'; -import { max } from '../../utils/maths.js'; +import { FeatureExtractor, validate_audio_inputs } from 
'../../../base/feature_extraction_utils.js'; +import { Tensor } from '../../../utils/tensor.js'; +import { mel_filter_bank, spectrogram, window_function } from '../../../utils/audio.js'; +import { max } from '../../../utils/maths.js'; export class WhisperFeatureExtractor extends FeatureExtractor { constructor(config) { diff --git a/src/models/whisper/generation_whisper.js b/src/models/model-processors/whisper/generation_whisper.js similarity index 90% rename from src/models/whisper/generation_whisper.js rename to src/models/model-processors/whisper/generation_whisper.js index 0fd1daa7d..d73f577d6 100644 --- a/src/models/whisper/generation_whisper.js +++ b/src/models/model-processors/whisper/generation_whisper.js @@ -1,4 +1,4 @@ -import { GenerationConfig } from '../../generation/configuration_utils.js'; +import { GenerationConfig } from '../../../generation/configuration_utils.js'; export class WhisperGenerationConfig extends GenerationConfig { /** @@ -84,5 +84,5 @@ export class WhisperGenerationConfig extends GenerationConfig { } /** - * @typedef {import('../../generation/parameters.js').GenerationFunctionParameters & {generation_config: WhisperGenerationConfig} & WhisperGenerationConfig} WhisperGenerationFunctionParameters + * @typedef {import('../../../generation/parameters.js').GenerationFunctionParameters & {generation_config: WhisperGenerationConfig} & WhisperGenerationConfig} WhisperGenerationFunctionParameters */ diff --git a/src/models/whisper/processing_whisper.js b/src/models/model-processors/whisper/processing_whisper.js similarity index 84% rename from src/models/whisper/processing_whisper.js rename to src/models/model-processors/whisper/processing_whisper.js index ef18dd7ce..952d06257 100644 --- a/src/models/whisper/processing_whisper.js +++ b/src/models/model-processors/whisper/processing_whisper.js @@ -1,6 +1,6 @@ import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js'; -import { AutoTokenizer } from '../../tokenizers.js'; -import { Processor } from '../../base/processing_utils.js'; +import { AutoTokenizer } from '../../../tokenizers.js'; +import { Processor } from '../../../base/processing_utils.js'; /** * Represents a WhisperProcessor that extracts features from an audio input. diff --git a/src/models/yolos/image_processing_yolos.js b/src/models/model-processors/yolos/image_processing_yolos.js similarity index 89% rename from src/models/yolos/image_processing_yolos.js rename to src/models/model-processors/yolos/image_processing_yolos.js index 2933536b6..253fe4f9f 100644 --- a/src/models/yolos/image_processing_yolos.js +++ b/src/models/model-processors/yolos/image_processing_yolos.js @@ -1,4 +1,4 @@ -import { ImageProcessor, post_process_object_detection } from '../../base/image_processors_utils.js'; +import { ImageProcessor, post_process_object_detection } from '../../../base/image_processors_utils.js'; export class YolosImageProcessor extends ImageProcessor { /** @type {typeof post_process_object_detection} */ diff --git a/src/models/output.js b/src/models/output.js new file mode 100644 index 000000000..97823d38e --- /dev/null +++ b/src/models/output.js @@ -0,0 +1,270 @@ +/** + * @typedef {import('../utils/tensor.js').Tensor} Tensor + */ + +export class ModelOutput {} + +/** + * Base class for model's outputs, with potential hidden states and attentions. + */ +export class BaseModelOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. 
+ * @param {Tensor} output.last_hidden_state Sequence of hidden-states at the output of the last layer of the model. + * @param {Tensor} [output.hidden_states] Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + * @param {Tensor} [output.attentions] Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + */ + constructor({ last_hidden_state, hidden_states = null, attentions = null }) { + super(); + this.last_hidden_state = last_hidden_state; + this.hidden_states = hidden_states; + this.attentions = attentions; + } +} + +/** + * Base class for Segment-Anything model's output. + */ +export class SamImageSegmentationOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.iou_scores The output logits of the model. + * @param {Tensor} output.pred_masks Predicted boxes. + */ + constructor({ iou_scores, pred_masks }) { + super(); + this.iou_scores = iou_scores; + this.pred_masks = pred_masks; + } +} + +/** + * Base class for outputs of sentence classification models. + */ +export class SequenceClassifierOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits classification (or regression if config.num_labels==1) scores (before SoftMax). + * @param {Record} [output.attentions] Object of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + * Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + */ + constructor({ logits, ...attentions }) { + super(); + this.logits = logits; + const attentions_list = Object.values(attentions); + if (attentions_list.length > 0) { + // Only set attentions if they are not empty + this.attentions = attentions_list; + } + } +} + +/** + * Base class for outputs of XVector models. + */ +export class XVectorOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification hidden states before AMSoftmax, of shape `(batch_size, config.xvector_output_dim)`. + * @param {Tensor} output.embeddings Utterance embeddings used for vector similarity-based retrieval, of shape `(batch_size, config.xvector_output_dim)`. + */ + constructor({ logits, embeddings }) { + super(); + this.logits = logits; + this.embeddings = embeddings; + } +} + +/** + * Base class for outputs of token classification models. + */ +export class TokenClassifierOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification scores (before SoftMax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } +} + +/** + * Base class for masked language models outputs. + */ +export class MaskedLMOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } +} + +/** + * Base class for outputs of question answering models. + */ +export class QuestionAnsweringModelOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.start_logits Span-start scores (before SoftMax). 
+ * @param {Tensor} output.end_logits Span-end scores (before SoftMax). + */ + constructor({ start_logits, end_logits }) { + super(); + this.start_logits = start_logits; + this.end_logits = end_logits; + } +} + +/** + * Base class for causal language model (or autoregressive) outputs. + */ +export class CausalLMOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). + */ + constructor({ logits }) { + super(); + this.logits = logits; + } +} + +/** + * Base class for causal language model (or autoregressive) outputs. + */ +export class CausalLMOutputWithPast extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Prediction scores of the language modeling head (scores for each vocabulary token before softmax). + * @param {Tensor} output.past_key_values Contains pre-computed hidden-states (key and values in the self-attention blocks) + * that can be used (see `past_key_values` input) to speed up sequential decoding. + */ + constructor({ logits, past_key_values }) { + super(); + this.logits = logits; + this.past_key_values = past_key_values; + } +} + +export class Seq2SeqLMOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.past_key_values A tensor of key/value pairs that represent the previous state of the model. + * @param {Tensor} output.encoder_outputs The output of the encoder in a sequence-to-sequence model. + * @param {Tensor} [output.decoder_attentions] Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the self-attention heads. + * @param {Tensor} [output.cross_attentions] Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the weighted average in the cross-attention heads. + */ + constructor({ logits, past_key_values, encoder_outputs, decoder_attentions = null, cross_attentions = null }) { + super(); + this.logits = logits; + this.past_key_values = past_key_values; + this.encoder_outputs = encoder_outputs; + this.decoder_attentions = decoder_attentions; + this.cross_attentions = cross_attentions; + } +} + +export class ImageMattingOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.alphas Estimated alpha values, of shape `(batch_size, num_channels, height, width)`. + */ + constructor({ alphas }) { + super(); + this.alphas = alphas; + } +} + +/** + * Describes the outputs for the VITS model. + */ +export class VitsModelOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.waveform The final audio waveform predicted by the model, of shape `(batch_size, sequence_length)`. + * @param {Tensor} output.spectrogram The log-mel spectrogram predicted at the output of the flow model. + * This spectrogram is passed to the Hi-Fi GAN decoder model to obtain the final audio waveform. + */ + constructor({ waveform, spectrogram }) { + super(); + this.waveform = waveform; + this.spectrogram = spectrogram; + } +} + +export class Sam2ImageSegmentationOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.iou_scores The output logits of the model.
+ * @param {Tensor} output.pred_masks Predicted boxes. + * @param {Tensor} output.object_score_logits Logits for the object score, indicating if an object is present. + */ + constructor({ iou_scores, pred_masks, object_score_logits }) { + super(); + this.iou_scores = iou_scores; + this.pred_masks = pred_masks; + this.object_score_logits = object_score_logits; + } +} + +export class MgpstrModelOutput extends ModelOutput { + constructor({ char_logits, bpe_logits, wp_logits }) { + super(); + this.char_logits = char_logits; + this.bpe_logits = bpe_logits; + this.wp_logits = wp_logits; + } + + get logits() { + return [this.char_logits, this.bpe_logits, this.wp_logits]; + } +} + +export class MimiEncoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. + */ + constructor({ audio_codes }) { + super(); + this.audio_codes = audio_codes; + } +} + +export class MimiDecoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. + */ + constructor({ audio_values }) { + super(); + this.audio_values = audio_values; + } +} + +export class DacEncoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_codes Discrete code embeddings, of shape `(batch_size, num_quantizers, codes_length)`. + */ + constructor({ audio_codes }) { + super(); + this.audio_codes = audio_codes; + } +} + +export class DacDecoderOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.audio_values Decoded audio values, of shape `(batch_size, num_channels, sequence_length)`. 
+ */ + constructor({ audio_values }) { + super(); + this.audio_values = audio_values; + } +} diff --git a/src/models/pre-trained-model.js b/src/models/pre-trained-model.js new file mode 100644 index 000000000..307686e04 --- /dev/null +++ b/src/models/pre-trained-model.js @@ -0,0 +1,1277 @@ +import { Callable } from '../utils/generic.js'; +import { constructSessions, sessionRun } from './session.js'; +import { AutoConfig, getCacheShapes } from '../configs.js'; +import { Tensor, DataTypeMap, full_like, cat, zeros_like, toI64Tensor, ones_like, ones } from '../utils/tensor.js'; +import { + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, +} from './registry.js'; +import { GITHUB_ISSUE_URL } from '../utils/constants.js'; +import { + decoderForward, + decoder_prepare_inputs_for_generation, + seq2seqForward, + encoder_decoder_prepare_inputs_for_generation, + imageTextToTextForward, + multimodal_text_to_text_prepare_inputs_for_generation, + audioTextToTextForward, + multimodality_prepare_inputs_for_generation, + autoEncoderForward, + chatterbox_prepare_inputs_for_generation, + encoderForward, + getOptionalConfigs, +} from './utils.js'; +import { + LogitsProcessorList, + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + SuppressTokensAtBeginLogitsProcessor, + NoRepeatNGramLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + NoBadWordsLogitsProcessor, + MinLengthLogitsProcessor, + MinNewTokensLengthLogitsProcessor, + TemperatureLogitsWarper, + ClassifierFreeGuidanceLogitsProcessor, +} from '../generation/logits_process.js'; +import { GenerationConfig } from '../generation/configuration_utils.js'; +import { EosTokenCriteria, MaxLengthCriteria, StoppingCriteriaList } from '../generation/stopping_criteria.js'; +import { LogitsSampler } from '../generation/logits_sampler.js'; +import { pick } from '../utils/core.js'; +import { ModelOutput } from './output.js'; + +export const MODEL_TYPES = { + EncoderOnly: 0, + EncoderDecoder: 1, + Seq2Seq: 2, + Vision2Seq: 3, + DecoderOnly: 4, + MaskGeneration: 5, + ImageTextToText: 6, + Musicgen: 7, + MultiModality: 8, + Phi3V: 9, + AudioTextToText: 10, + AutoEncoder: 11, + ImageAudioTextToText: 12, + Supertonic: 13, + Chatterbox: 14, +}; + +const MODEL_TYPE_CONFIG = { + [MODEL_TYPES.DecoderOnly]: { + can_generate: true, + forward: decoderForward, + prepare_inputs: decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Seq2Seq]: { + can_generate: true, + forward: seq2seqForward, + prepare_inputs: encoder_decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Vision2Seq]: { + can_generate: true, + forward: seq2seqForward, + prepare_inputs: encoder_decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Musicgen]: { + can_generate: true, + forward: seq2seqForward, + prepare_inputs: encoder_decoder_prepare_inputs_for_generation, + }, + [MODEL_TYPES.EncoderDecoder]: { + can_generate: false, + forward: seq2seqForward, + prepare_inputs: null, + }, + [MODEL_TYPES.ImageTextToText]: { + can_generate: true, + forward: imageTextToTextForward, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + [MODEL_TYPES.AudioTextToText]: { + can_generate: true, + forward: audioTextToTextForward, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + [MODEL_TYPES.Phi3V]: { + can_generate: true, + forward: null, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + 
[MODEL_TYPES.ImageAudioTextToText]: { + can_generate: true, + forward: null, + prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation, + }, + [MODEL_TYPES.MultiModality]: { + can_generate: true, + forward: null, + prepare_inputs: multimodality_prepare_inputs_for_generation, + }, + [MODEL_TYPES.AutoEncoder]: { + can_generate: false, + forward: autoEncoderForward, + prepare_inputs: null, + }, + [MODEL_TYPES.Chatterbox]: { + can_generate: true, + forward: encoderForward, + prepare_inputs: chatterbox_prepare_inputs_for_generation, + }, + default: { + can_generate: false, + forward: encoderForward, + prepare_inputs: null, + }, +}; + +export const MODEL_TYPE_MAPPING = new Map(); +export const MODEL_NAME_TO_CLASS_MAPPING = new Map(); +export const MODEL_CLASS_TO_NAME_MAPPING = new Map(); + +/** + * A base class for pre-trained models that provides the model configuration and an ONNX session. + */ +export class PreTrainedModel extends Callable { + main_input_name = 'input_ids'; + forward_params = ['input_ids', 'attention_mask']; + + _return_dict_in_generate_keys = null; + /** + * Creates a new instance of the `PreTrainedModel` class. + * @param {import('../configs.js').PretrainedConfig} config The model configuration. + * @param {Record} sessions The inference sessions for the model. + * @param {Record} configs Additional configuration files (e.g., generation_config.json). + */ + constructor(config, sessions, configs) { + super(); + + this.config = config; + this.sessions = sessions; + this.configs = configs; + + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); + const modelType = MODEL_TYPE_MAPPING.get(modelName); + + // Get configuration for this model type + const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default; + + this.can_generate = typeConfig.can_generate; + this._forward = typeConfig.forward; + this._prepare_inputs_for_generation = typeConfig.prepare_inputs; + + if (this.can_generate) { + this.forward_params.push('past_key_values'); + } + + /** @type {import('../configs.js').TransformersJSConfig} */ + this.custom_config = this.config['transformers.js_config'] ?? {}; + } + + /** + * Disposes of all the ONNX sessions that were created during inference. + * @returns {Promise} An array of promises, one for each ONNX session that is being disposed. + * @todo Use https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/FinalizationRegistry + */ + async dispose() { + const promises = []; + for (const session of Object.values(this.sessions)) { + promises.push(session.release?.()); + } + return await Promise.all(promises); + } + + /** + * Instantiate one of the model classes of the library from a pretrained model. + * + * The model class to instantiate is selected based on the `model_type` property of the config object + * (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible) + * + * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either: + * - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + * Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + * user or organization name, like `dbmdz/bert-base-german-cased`. + * - A path to a *directory* containing model weights, e.g., `./my_model_directory/`. + * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. 
+ * + * @returns {Promise} A new instance of the `PreTrainedModel` class. + */ + static async from_pretrained( + pretrained_model_name_or_path, + { + progress_callback = null, + config = null, + cache_dir = null, + local_files_only = false, + revision = 'main', + model_file_name = null, + subfolder = 'onnx', + device = null, + dtype = null, + use_external_data_format = null, + session_options = {}, + } = {}, + ) { + let options = { + progress_callback, + config, + cache_dir, + local_files_only, + revision, + model_file_name, + subfolder, + device, + dtype, + use_external_data_format, + session_options, + }; + + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this); + const modelType = MODEL_TYPE_MAPPING.get(modelName); + + config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options); + + let info; + if (modelType === MODEL_TYPES.DecoderOnly) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: options.model_file_name ?? 'model', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'encoder_model', + decoder_model_merged: 'decoder_model_merged', + }, + options, + 'decoder_model_merged', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.MaskGeneration) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'vision_encoder', + prompt_encoder_mask_decoder: 'prompt_encoder_mask_decoder', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.EncoderDecoder) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'encoder_model', + decoder_model_merged: 'decoder_model_merged', + }, + options, + 'decoder_model_merged', + ), + ]); + } else if (modelType === MODEL_TYPES.ImageTextToText) { + const sessions = { + embed_tokens: 'embed_tokens', + vision_encoder: 'vision_encoder', + decoder_model_merged: 'decoder_model_merged', + }; + if (config.is_encoder_decoder) { + sessions['model'] = 'encoder_model'; + } + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.AudioTextToText) { + const sessions = { + embed_tokens: 'embed_tokens', + audio_encoder: 'audio_encoder', + decoder_model_merged: 'decoder_model_merged', + }; + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, sessions, options, 'decoder_model_merged'), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.ImageAudioTextToText) { + const sessions = { + embed_tokens: 'embed_tokens', + audio_encoder: 'audio_encoder', + vision_encoder: 'vision_encoder', + decoder_model_merged: 'decoder_model_merged', + }; + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, sessions, options), + getOptionalConfigs( + pretrained_model_name_or_path, + { + 
generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Musicgen) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: 'text_encoder', + decoder_model_merged: 'decoder_model_merged', + encodec_decode: 'encodec_decode', + }, + options, + 'decoder_model_merged', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.MultiModality) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + prepare_inputs_embeds: 'prepare_inputs_embeds', + model: 'language_model', + lm_head: 'lm_head', + gen_head: 'gen_head', + gen_img_embeds: 'gen_img_embeds', + image_decode: 'image_decode', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Phi3V) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + prepare_inputs_embeds: 'prepare_inputs_embeds', + model: 'model', + vision_encoder: 'vision_encoder', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Chatterbox) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + embed_tokens: 'embed_tokens', + speech_encoder: 'speech_encoder', + model: 'language_model', + conditional_decoder: 'conditional_decoder', + }, + options, + 'model', + ), + getOptionalConfigs( + pretrained_model_name_or_path, + { + generation_config: 'generation_config.json', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.AutoEncoder) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + encoder_model: 'encoder_model', + decoder_model: 'decoder_model', + }, + options, + ), + ]); + } else if (modelType === MODEL_TYPES.Supertonic) { + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + text_encoder: 'text_encoder', + latent_denoiser: 'latent_denoiser', + voice_decoder: 'voice_decoder', + }, + options, + ), + ]); + } else { + // should be MODEL_TYPES.EncoderOnly + if (modelType !== MODEL_TYPES.EncoderOnly) { + const type = modelName ?? config?.model_type; + if (type !== 'custom') { + console.warn( + `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`, + ); + } + } + info = await Promise.all([ + constructSessions( + pretrained_model_name_or_path, + { + model: options.model_file_name ?? 'model', + }, + options, + ), + ]); + } + + // @ts-ignore + return new this(config, ...info); + } + + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Object containing input tensors + * @returns {Promise} Object containing output tensors + */ + async _call(model_inputs) { + return await this.forward(model_inputs); + } + + /** + * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method + * will be chosen based on the model type. + * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model. + * @returns {Promise} The output data from the model in the format specified in the ONNX model. 
+ * @throws {Error} This method must be implemented in subclasses. + */ + async forward(model_inputs) { + return await this._forward(this, model_inputs); + } + + /** + * Get the model's generation config, if it exists. + * @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`. + */ + get generation_config() { + return this.configs?.generation_config ?? null; + } + + /** + * @param {GenerationConfig} generation_config + * @param {number} input_ids_seq_length The starting sequence length for the input ids. + * @returns {LogitsProcessorList} + * @private + */ + _get_logits_processor( + generation_config, + input_ids_seq_length, + // encoder_input_ids, TODO + // prefix_allowed_tokens_fn, TODO + logits_processor = null, + ) { + const processors = new LogitsProcessorList(); + + // if (generation_config.diversity_penalty !== null && generation_config.diversity_penalty > 0.0) { + // processors.push(new HammingDiversityLogitsProcessor( + // generation_config.diversity_penalty, + // generation_config.num_beams, + // generation_config.num_beam_groups + // )); + // } + + // if (generation_config.encoder_repetition_penalty !== null && generation_config.encoder_repetition_penalty !== 1.0) { + // processors.push(new EncoderRepetitionPenaltyLogitsProcessor( + // generation_config.encoder_repetition_penalty, + // encoder_input_ids + // )); + // } + + if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1.0) { + processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty)); + } + + if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) { + processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)); + } + + // if (generation_config.encoder_no_repeat_ngram_size !== null && generation_config.encoder_no_repeat_ngram_size > 0) { + // if (this.config.is_encoder_decoder) { + // processors.push(new EncoderNoRepeatNGramLogitsProcessor( + // generation_config.encoder_no_repeat_ngram_size, + // encoder_input_ids + // )); + // } else { + // throw new Error("It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture"); + // } + // } + + if (generation_config.bad_words_ids !== null) { + processors.push( + new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id), + ); + } + + if ( + generation_config.min_length !== null && + generation_config.eos_token_id !== null && + generation_config.min_length > 0 + ) { + processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id)); + } + + if ( + generation_config.min_new_tokens !== null && + generation_config.eos_token_id !== null && + generation_config.min_new_tokens > 0 + ) { + processors.push( + new MinNewTokensLengthLogitsProcessor( + input_ids_seq_length, + generation_config.min_new_tokens, + generation_config.eos_token_id, + ), + ); + } + + // if (prefix_allowed_tokens_fn !== null) { + // processors.push(new PrefixConstrainedLogitsProcessor( + // prefix_allowed_tokens_fn, + // generation_config.num_beams / generation_config.num_beam_groups + // )); + // } + + if (generation_config.forced_bos_token_id !== null) { + processors.push(new ForcedBOSTokenLogitsProcessor(generation_config.forced_bos_token_id)); + } + + if (generation_config.forced_eos_token_id !== null) { + processors.push( + new ForcedEOSTokenLogitsProcessor(generation_config.max_length, 
generation_config.forced_eos_token_id), + ); + } + + // if (generation_config.remove_invalid_values === true) { + // processors.push(new InfNanRemoveLogitsProcessor()); + // } + + // if (generation_config.exponential_decay_length_penalty !== null) { + // processors.push(new ExponentialDecayLengthPenalty( + // generation_config.exponential_decay_length_penalty, + // generation_config.eos_token_id, + // input_ids_seq_length + // )); + // } + + // if (generation_config.suppress_tokens !== null) { + // processors.push(new SuppressTokensLogitsProcessor(generation_config.suppress_tokens)); + // } + + if (generation_config.begin_suppress_tokens !== null) { + const begin_index = + input_ids_seq_length > 1 || generation_config.forced_bos_token_id === null + ? input_ids_seq_length + : input_ids_seq_length + 1; + + processors.push( + new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, begin_index), + ); + } + + // DEPRECATED: https://github.com/huggingface/transformers/pull/29485 + // if (generation_config.forced_decoder_ids !== null) { + // processors.push(new ForceTokensLogitsProcessor(generation_config.forced_decoder_ids)); + // } + + // 8. prepare batched CFG externally + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + processors.push(new ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale)); + } + + if (generation_config.temperature === 0 && generation_config.do_sample) { + console.warn( + '`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`.', + ); + generation_config.do_sample = false; + } + + if (generation_config.do_sample) { + if (generation_config.temperature !== null && generation_config.temperature !== 1.0) { + processors.push(new TemperatureLogitsWarper(generation_config.temperature)); + } + // TODO: Add TopPLogitsWarper and TopKLogitsWarper + // if (generation_config.top_k !== null && generation_config.top_k !== 0) { + // processors.push(new TopKLogitsWarper(generation_config.top_k)); + // } + // if (generation_config.top_p !== null && generation_config.top_p < 1.0) { + // processors.push(new TopPLogitsWarper(generation_config.top_p)); + // } + } + + if (logits_processor !== null) { + processors.extend(logits_processor); + } + + // `LogitNormalization` should always be the last logit processor, when present + // if (generation_config.renormalize_logits === true) { + // processors.push(new LogitNormalization()); + // } + + return processors; + } + + /** + * This function merges multiple generation configs together to form a final generation config to be used by the model for text generation. + * It first creates an empty `GenerationConfig` object, then it applies the model's own `generation_config` property to it. Finally, if a `generation_config` object was passed in the arguments, it overwrites the corresponding properties in the final config with those of the passed config object. + * @param {GenerationConfig|null} generation_config A `GenerationConfig` object containing generation parameters. + * @param {Object} kwargs Additional generation parameters to be used in place of those in the `generation_config` object. + * @returns {GenerationConfig} The final generation config object to be used by the model for text generation. 
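+ * + * **Example:** Merging precedence (a sketch; this is normally invoked internally by `generate()` rather than called directly). + * + * ```javascript + * // Later sources win: model config defaults < generation_config.json < `generation_config` argument < extra kwargs. + * const cfg = model._prepare_generation_config({ max_new_tokens: 64 }, { temperature: 0.7, do_sample: true }); + * // cfg.max_new_tokens === 64, cfg.temperature === 0.7, cfg.do_sample === true + * ```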
+ */ + _prepare_generation_config(generation_config, kwargs, cls = GenerationConfig) { + // Create empty generation config (contains defaults) + // We pass `this.config` so that if `eos_token_id` or `bos_token_id` exist in the model's config, we will use them + const config = { ...this.config }; + for (const key of ['decoder', 'generator', 'text_config']) { + // Special case: some models have generation attributes set in the decoder. + // Use them if still unset in the generation config. + if (key in config) { + Object.assign(config, config[key]); + } + } + + const gen_config = new cls(config); + + // Apply model's generation config, if it exists + Object.assign(gen_config, this.generation_config ?? {}); + + // Next, use any generation config specified by the user + // when calling `generate` + if (generation_config) { + Object.assign(gen_config, generation_config); + } + + // Finally, if any kwargs were passed, use them to overwrite + if (kwargs) { + Object.assign(gen_config, pick(kwargs, Object.getOwnPropertyNames(gen_config))); + } + + return gen_config; + } + + /** + * + * @param {GenerationConfig} generation_config + * @param {StoppingCriteriaList} [stopping_criteria=null] + */ + _get_stopping_criteria(generation_config, stopping_criteria = null) { + const criteria = new StoppingCriteriaList(); + + if (generation_config.max_length !== null) { + criteria.push( + new MaxLengthCriteria(generation_config.max_length, this.config.max_position_embeddings ?? null), + ); + } + // if (generation_config.max_time !== null) { + // criteria.push(new MaxTimeCriteria(generation_config.max_time)); + // } + if (generation_config.eos_token_id !== null) { + criteria.push(new EosTokenCriteria(generation_config.eos_token_id)); + } + + if (stopping_criteria) { + criteria.extend(stopping_criteria); + } + return criteria; + } + + /** + * Confirms that the model class is compatible with generation. + * If not, raises an exception that points to the right class to use. + */ + _validate_model_class() { + if (!this.can_generate) { + const generate_compatible_mappings = [ + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + // MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, // TODO + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, + ]; + + const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor); + + const generate_compatible_classes = new Set(); + const modelType = this.config.model_type; + for (const model_mapping of generate_compatible_mappings) { + const supported_models = model_mapping.get(modelType); + if (supported_models) { + generate_compatible_classes.add(supported_models[0]); + } + } + + let errorMessage = `The current model class (${modelName}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`; + if (generate_compatible_classes.size > 0) { + errorMessage += ` Please use the following class instead: ${[...generate_compatible_classes].join(', ')}`; + } + throw Error(errorMessage); + } + } + + prepare_inputs_for_generation(...args) { + return this._prepare_inputs_for_generation(this, ...args); + } + + /** + * + * @param {Object} inputs + * @param {bigint[][]} inputs.generated_input_ids + * @param {Object} inputs.outputs + * @param {Object} inputs.model_inputs + * @param {boolean} inputs.is_encoder_decoder + * @returns {Object} The updated model inputs for the next generation iteration. 
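+ * + * In practice this (1) re-wires the `present*` outputs into `past_key_values` for the next forward pass, (2) feeds the newly sampled token ids back in as a `[batch_size, 1]` `input_ids` tensor, (3) appends one position to the attention mask for decoder-only models, and (4) clears `position_ids` so they are recreated on the next step.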
+ */ + _update_model_kwargs_for_generation({ generated_input_ids, outputs, model_inputs, is_encoder_decoder }) { + // update past_key_values + model_inputs['past_key_values'] = this.getPastKeyValues(outputs, model_inputs.past_key_values); + + // update inputs for next run + model_inputs['input_ids'] = new Tensor('int64', generated_input_ids.flat(), [generated_input_ids.length, 1]); + + if (!is_encoder_decoder) { + // update attention mask + model_inputs.attention_mask = cat( + [model_inputs.attention_mask, ones([model_inputs.attention_mask.dims[0], 1])], + 1, + ); + } else if ('decoder_attention_mask' in model_inputs) { + // TODO: update decoder attention mask if the model requires it + } + + // force recreate position_ids in next iteration + model_inputs['position_ids'] = null; + + return model_inputs; + } + + /** + * This function extracts the model-specific `inputs` for generation. + * @param {Object} params + * @param {Tensor} [params.inputs=null] + * @param {number} [params.bos_token_id=null] + * @param {Record} [params.model_kwargs] + * @returns {{inputs_tensor: Tensor, model_inputs: Record, model_input_name: string}} The model-specific inputs for generation. + */ + _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) { + const model_inputs = pick(model_kwargs, this.forward_params); + const input_name = this.main_input_name; + if (input_name in model_inputs) { + if (inputs) { + throw new Error( + '`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. ' + + 'Make sure to either pass {inputs} or {input_name}=...', + ); + } + } else { + model_inputs[input_name] = inputs; + } + + const inputs_tensor = model_inputs[input_name]; + + return { inputs_tensor, model_inputs, model_input_name: input_name }; + } + + async _prepare_encoder_decoder_kwargs_for_generation({ + inputs_tensor, + model_inputs, + model_input_name, + generation_config, + }) { + if ( + this.sessions['model'].inputNames.includes('inputs_embeds') && + !model_inputs.inputs_embeds && + '_prepare_inputs_embeds' in this + ) { + // Encoder expects `inputs_embeds` instead of `input_ids` + const { input_ids, pixel_values, attention_mask, ...kwargs } = model_inputs; + // @ts-ignore + const prepared_inputs = await this._prepare_inputs_embeds(model_inputs); + model_inputs = { + ...kwargs, + ...pick(prepared_inputs, ['inputs_embeds', 'attention_mask']), + }; + } + let { last_hidden_state } = await encoderForward(this, model_inputs); + + // for classifier free guidance we need to add a 'null' input to our encoder hidden states + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + last_hidden_state = cat([last_hidden_state, full_like(last_hidden_state, 0.0)], 0); + + if ('attention_mask' in model_inputs) { + model_inputs['attention_mask'] = cat( + [model_inputs['attention_mask'], zeros_like(model_inputs['attention_mask'])], + 0, + ); + } + } else if (model_inputs.decoder_input_ids) { + // Ensure that the encoder outputs have the same batch size as the decoder inputs, + // allowing for more efficient batched generation for single inputs + const decoder_input_ids_batch_size = toI64Tensor(model_inputs.decoder_input_ids).dims[0]; + if (decoder_input_ids_batch_size !== last_hidden_state.dims[0]) { + if (last_hidden_state.dims[0] !== 1) { + throw new Error( + `The encoder outputs have a different batch size (${last_hidden_state.dims[0]}) than the decoder inputs (${decoder_input_ids_batch_size}).`, + ); + } + last_hidden_state = cat( + Array.from({ length: 
decoder_input_ids_batch_size }, () => last_hidden_state), + 0, + ); + } + } + model_inputs['encoder_outputs'] = last_hidden_state; + + return model_inputs; + } + + /** + * Prepares `decoder_input_ids` for generation with encoder-decoder models + * @param {*} param0 + */ + _prepare_decoder_input_ids_for_generation({ + batch_size, + model_input_name, + model_kwargs, + decoder_start_token_id, + bos_token_id, + generation_config, + }) { + let { decoder_input_ids, ...model_inputs } = model_kwargs; + + // Prepare input ids if the user has not defined `decoder_input_ids` manually. + if (!(decoder_input_ids instanceof Tensor)) { + if (!decoder_input_ids) { + decoder_start_token_id ??= bos_token_id; + + if (this.config.model_type === 'musicgen') { + // Custom logic (TODO: move to Musicgen class) + decoder_input_ids = Array.from( + { + // @ts-expect-error TS2339 + length: batch_size * this.config.decoder.num_codebooks, + }, + () => [decoder_start_token_id], + ); + } else if (Array.isArray(decoder_start_token_id)) { + if (decoder_start_token_id.length !== batch_size) { + throw new Error( + `\`decoder_start_token_id\` expcted to have length ${batch_size} but got ${decoder_start_token_id.length}`, + ); + } + decoder_input_ids = decoder_start_token_id; + } else { + decoder_input_ids = Array.from( + { + length: batch_size, + }, + () => [decoder_start_token_id], + ); + } + } else if (!Array.isArray(decoder_input_ids[0])) { + // Correct batch size + decoder_input_ids = Array.from( + { + length: batch_size, + }, + () => decoder_input_ids, + ); + } + decoder_input_ids = toI64Tensor(decoder_input_ids); + } + + model_kwargs['decoder_attention_mask'] = ones_like(decoder_input_ids); + + return { input_ids: decoder_input_ids, model_inputs }; + } + + /** + * Generates sequences of token ids for models with a language modeling head. + * @param {import('../generation/parameters.js').GenerationFunctionParameters} options + * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. + */ + async generate({ + inputs = null, + generation_config = null, + logits_processor = null, + stopping_criteria = null, + streamer = null, + + // inputs_attention_mask = null, + ...kwargs + }) { + this._validate_model_class(); + + // Update generation config with defaults and kwargs + generation_config = this._prepare_generation_config(generation_config, kwargs); + + // 3. Define model inputs + let { inputs_tensor, model_inputs, model_input_name } = this._prepare_model_inputs({ + inputs, + model_kwargs: kwargs, + }); + + const is_encoder_decoder = this.config.is_encoder_decoder; + + // 4. Define other model kwargs + if (!is_encoder_decoder) { + // decoder-only models should use left-padding for generation + } else if (!('encoder_outputs' in model_inputs)) { + // if model is encoder decoder encoder_outputs are created + // and added to `model_kwargs` + model_inputs = await this._prepare_encoder_decoder_kwargs_for_generation({ + inputs_tensor, + model_inputs, + model_input_name, + generation_config, + }); + } + + // 5. 
Prepare `input_ids` which will be used for auto-regressive generation + // TODO: Update to align with HF transformers' implementation + let input_ids; + if (is_encoder_decoder) { + // Generating from the encoder outputs + ({ input_ids, model_inputs } = this._prepare_decoder_input_ids_for_generation({ + batch_size: model_inputs[model_input_name].dims.at(0), + model_input_name, + model_kwargs: model_inputs, + decoder_start_token_id: generation_config.decoder_start_token_id, + bos_token_id: generation_config.bos_token_id, + generation_config, + })); + } else { + input_ids = model_inputs[model_input_name]; + } + + // 6. Prepare `max_length` depending on other stopping criteria. + let input_ids_length = input_ids.dims.at(-1); + + if (generation_config.max_new_tokens !== null) { + generation_config.max_length = input_ids_length + generation_config.max_new_tokens; + } + + // input_ids_length = model_inputs[model_input_name].dims.at(1); + // // inputs instanceof Tensor ? : inputs.length; + + // // decoder-only + // if (input_ids_length === 0) { + // throw Error("Must supply a non-empty array of input token ids.") + // } + + // let decoder_input_ids = + // generation_config.decoder_input_ids + // ?? generation_config.decoder_start_token_id + // ?? generation_config.bos_token_id + // ?? generation_config.eos_token_id; + + // Update logits processor + // 8. prepare distribution pre_processing samplers + const prepared_logits_processor = this._get_logits_processor( + generation_config, + input_ids_length, + logits_processor, + ); + + // 9. prepare stopping criteria + const prepared_stopping_criteria = this._get_stopping_criteria(generation_config, stopping_criteria); + + // /** @type {number[]} */ + // let eos_token_ids = generation_config.eos_token_id; + // if (eos_token_ids !== null && !Array.isArray(eos_token_ids)) { + // eos_token_ids = [eos_token_ids]; + // } + + const numInputs = model_inputs[model_input_name].dims.at(0); + + // TODO: + // done is a list of booleans to keep track of which inputs are done + // const done = new Array(numInputs).fill(false); + // For efficiency purposes, we remove completed rows from model_inputs + // when the beam is complete, and we keep track of the row index + // const rowIndexToBatchIndex = new Map(); + + const sampler = LogitsSampler.getSampler(generation_config); + + // TODO make > numInputs + const scores = new Array(numInputs).fill(0); + /** @type {bigint[][]} */ + const all_input_ids = input_ids.tolist(); + if (streamer) { + streamer.put(all_input_ids); + } + // const all_generated_input_ids = Array.from({ length: numInputs }, () => []); + + // NOTE: For now, we don't support spawning new beams + // TODO: when we do, we simply copy past key values and accumulate into single large tensor + + //////////////////////////////////////////////////// + // Generic search which handles 4 generation modes: + // - GenerationMode.GREEDY_SEARCH + // - GenerationMode.SAMPLE + // - GenerationMode.BEAM_SEARCH + // - GenerationMode.BEAM_SAMPLE + //////////////////////////////////////////////////// + let outputs; + let attentions = {}; + let return_dict_items = {}; + while (true) { + // prepare model inputs + model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config); + outputs = await this.forward(model_inputs); + + if (generation_config.return_dict_in_generate) { + if (generation_config.output_attentions) { + // Get attentions if they are present + const token_attentions = this.getAttentions(outputs); + for (const key in 
token_attentions) { + if (!(key in attentions)) { + attentions[key] = []; + } + attentions[key].push(token_attentions[key]); + } + } else if (this._return_dict_in_generate_keys) { + Object.assign(return_dict_items, pick(outputs, this._return_dict_in_generate_keys)); + } + } + + // Logits are of the form [batch_size, out_seq_length, vocab_size] + // In most cases, this will be [batch_size, 1, vocab_size] + // So, we select the last token's logits: + // (equivalent to `logits = outputs.logits[:, -1, :]`) + // The `.to('float32')` is necessary for models with float16 logits, + // and is a no-op for float32 logits. + // TODO: Support float16 sampling in the sampler directly + const logits = outputs.logits.slice(null, -1, null).to('float32'); + + const next_tokens_scores = prepared_logits_processor(all_input_ids, logits); + + /** @type {[bigint][]} */ + const generated_input_ids = []; + // const new_kv_cache = [];// NOTE: Only used for beam search when concatenating new kv + // Loop over each batch + for (let batch_idx = 0; batch_idx < next_tokens_scores.dims.at(0); ++batch_idx) { + const logs = next_tokens_scores[batch_idx]; + + const sampledTokens = await sampler(logs); + for (const [newTokenId, logProb] of sampledTokens) { + const bigint = BigInt(newTokenId); + // TODO: If branching, use previous beam as a starting point + // update generated ids, model inputs, and length for next step + scores[batch_idx] += logProb; + all_input_ids[batch_idx].push(bigint); + generated_input_ids.push([bigint]); + + // TODO: Support beam search + break; + } + } + if (streamer) { + streamer.put(generated_input_ids); + } + + const stop = prepared_stopping_criteria(all_input_ids); + if (stop.every((x) => x)) { + break; + } + + model_inputs = this._update_model_kwargs_for_generation({ + generated_input_ids, + outputs, + model_inputs, + is_encoder_decoder, + }); + } + + if (streamer) { + streamer.end(); + } + + // Retrieve and dispose all final past key values (including encoder attentions) + const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true); + + // TODO: ensure all_input_ids is padded correctly... + const sequences = new Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]); + + if (generation_config.return_dict_in_generate) { + return { + sequences, + past_key_values, + ...attentions, + ...return_dict_items, + // TODO: + // scores, + // logits, + }; + } else { + // Dispose all remaining tensors + for (const tensor of Object.values(outputs)) { + if (tensor.location === 'gpu-buffer') { + tensor.dispose(); + } + } + return sequences; + } + } + + /** + * Returns an object containing past key values from the given decoder results object. + * + * @param {Object} decoderResults The decoder results object. + * @param {Object} pastKeyValues The previous past key values. + * @returns {Object} An object containing past key values. + */ + getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) { + const pkvs = Object.create(null); + + for (const name in decoderResults) { + if (name.startsWith('present')) { + const newName = name + // Hybrid cache architecture + .replace('present_ssm', 'past_ssm') // Mamba + .replace('present_conv', 'past_conv') // LFM2 + + // Standard cache architecture + .replace('present', 'past_key_values'); + const is_encoder_pkv = name.includes('encoder'); + if (is_encoder_pkv && pastKeyValues) { + // Optimization introduced by optimum to reuse past key values. 
+ // So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values. + // https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704 + pkvs[newName] = pastKeyValues[newName]; + } else { + // decoder or using first encoder PKVs + pkvs[newName] = decoderResults[name]; + } + + if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) { + // - Always dispose decoder PKVs + // - Only dispose encoder past key values when requested (after generation) + const t = pastKeyValues[newName]; + if (t.location === 'gpu-buffer') { + t.dispose(); + } + } + } + } + return pkvs; + } + + /** + * Returns an object containing attentions from the given model output object. + * + * @param {Object} model_output The output of the model. + * @returns {{cross_attentions?: Tensor[]}} An object containing attentions. + */ + getAttentions(model_output) { + const attentions = {}; + + for (const attnName of ['cross_attentions', 'encoder_attentions', 'decoder_attentions']) { + for (const name in model_output) { + if (name.startsWith(attnName)) { + if (!(attnName in attentions)) { + attentions[attnName] = []; + } + attentions[attnName].push(model_output[name]); + } + } + } + return attentions; + } + + /** + * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values. + * + * @param {Object} decoderFeeds The decoder feeds object to add past key values to. + * @param {Object} pastKeyValues An object containing past key values. + */ + addPastKeyValues(decoderFeeds, pastKeyValues) { + if (pastKeyValues) { + Object.assign(decoderFeeds, pastKeyValues); + } else { + const session = this.sessions['decoder_model_merged'] ?? this.sessions['model']; + const batch_size = (decoderFeeds[this.main_input_name] ?? decoderFeeds.attention_mask)?.dims?.[0] ?? 1; + + const dtype = session?.config?.kv_cache_dtype ?? 'float32'; + const cls = dtype === 'float16' ? DataTypeMap.float16 : DataTypeMap.float32; + const shapes = getCacheShapes(this.config, { batch_size }); + for (const name in shapes) { + const size = shapes[name].reduce((a, b) => a * b, 1); + decoderFeeds[name] = new Tensor(dtype, new cls(size), shapes[name]); + } + } + } + + async encode_image({ pixel_values }) { + // image_inputs === { pixel_values } + return (await sessionRun(this.sessions['vision_encoder'], { pixel_values })).image_features; + } + + async encode_text({ input_ids }) { + // text_inputs === { input_ids, attention_mask } + return (await sessionRun(this.sessions['embed_tokens'], { input_ids })).inputs_embeds; + } + + async encode_audio({ audio_values }) { + // audio_inputs === { audio_values } + return (await sessionRun(this.sessions['audio_encoder'], { audio_values })).audio_features; + } +} diff --git a/src/models/pre-trained-models/albert-pre-trained-model.js b/src/models/pre-trained-models/albert-pre-trained-model.js new file mode 100644 index 000000000..88949955d --- /dev/null +++ b/src/models/pre-trained-models/albert-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput } from '../output.js'; + +export class AlbertPreTrainedModel extends PreTrainedModel {} +export class AlbertModel extends AlbertPreTrainedModel {} +export class AlbertForSequenceClassification extends AlbertPreTrainedModel { + /** + * Calls the model on new inputs. 
+ * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class AlbertForQuestionAnswering extends AlbertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} +export class AlbertForMaskedLM extends AlbertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/apertus-pre-trained-model.js b/src/models/pre-trained-models/apertus-pre-trained-model.js new file mode 100644 index 000000000..ba34da1ce --- /dev/null +++ b/src/models/pre-trained-models/apertus-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ApertusPreTrainedModel extends PreTrainedModel {} +export class ApertusModel extends ApertusPreTrainedModel {} +export class ApertusForCausalLM extends ApertusPreTrainedModel {} diff --git a/src/models/pre-trained-models/arcee-pre-trained-model.js b/src/models/pre-trained-models/arcee-pre-trained-model.js new file mode 100644 index 000000000..77aebca34 --- /dev/null +++ b/src/models/pre-trained-models/arcee-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ArceePreTrainedModel extends PreTrainedModel {} +export class ArceeModel extends ArceePreTrainedModel {} +export class ArceeForCausalLM extends ArceePreTrainedModel {} diff --git a/src/models/pre-trained-models/ast-pre-trained-model.js b/src/models/pre-trained-models/ast-pre-trained-model.js new file mode 100644 index 000000000..5b3344da0 --- /dev/null +++ b/src/models/pre-trained-models/ast-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ASTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare AST Model transformer outputting raw hidden-states without any specific head on top. + */ +export class ASTModel extends ASTPreTrainedModel {} + +/** + * Audio Spectrogram Transformer model with an audio classification head on top + * (a linear layer on top of the pooled output) e.g. for datasets like AudioSet, Speech Commands v2. + */ +export class ASTForAudioClassification extends ASTPreTrainedModel {} diff --git a/src/models/pre-trained-models/bart-pretrained-model.js b/src/models/pre-trained-models/bart-pretrained-model.js new file mode 100644 index 000000000..99c624f9d --- /dev/null +++ b/src/models/pre-trained-models/bart-pretrained-model.js @@ -0,0 +1,29 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class BartPretrainedModel extends PreTrainedModel {} + +/** + * The bare BART Model outputting raw hidden-states without any specific head on top. + */ +export class BartModel extends BartPretrainedModel {} + +/** + * The BART Model with a language modeling head. Can be used for summarization. 
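+ * + * **Example:** Summarization with `BartForConditionalGeneration` (a sketch; the model id is illustrative, and any compatible Seq2Seq checkpoint converted for Transformers.js should work the same way). + * + * ```javascript + * import { AutoTokenizer, BartForConditionalGeneration } from '@huggingface/transformers'; + * + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/distilbart-cnn-6-6'); + * const model = await BartForConditionalGeneration.from_pretrained('Xenova/distilbart-cnn-6-6'); + * + * const inputs = tokenizer('The tower is 324 metres tall, about the same height as an 81-storey building.'); + * const outputs = await model.generate({ ...inputs, max_new_tokens: 40 }); + * const [summary] = tokenizer.batch_decode(outputs, { skip_special_tokens: true }); + * ```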
+ */ +export class BartForConditionalGeneration extends BartPretrainedModel {} + +/** + * Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) + */ +export class BartForSequenceClassification extends BartPretrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/beit-pre-trained-model.js b/src/models/pre-trained-models/beit-pre-trained-model.js new file mode 100644 index 000000000..792683aa5 --- /dev/null +++ b/src/models/pre-trained-models/beit-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class BeitPreTrainedModel extends PreTrainedModel {} +export class BeitModel extends BeitPreTrainedModel {} +export class BeitForImageClassification extends BeitPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/bert-pre-trained-model.js b/src/models/pre-trained-models/bert-pre-trained-model.js new file mode 100644 index 000000000..ce49f5988 --- /dev/null +++ b/src/models/pre-trained-models/bert-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + SequenceClassifierOutput, + MaskedLMOutput, + TokenClassifierOutput, + QuestionAnsweringModelOutput, +} from '../output.js'; + +export class BertPreTrainedModel extends PreTrainedModel {} +export class BertModel extends BertPreTrainedModel {} + +/** + * BertForMaskedLM is a class representing a BERT model for masked language modeling. + */ +export class BertForMaskedLM extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * BertForSequenceClassification is a class representing a BERT model for sequence classification. + */ +export class BertForSequenceClassification extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * BertForTokenClassification is a class representing a BERT model for token classification. + */ +export class BertForTokenClassification extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * BertForQuestionAnswering is a class representing a BERT model for question answering. 
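+ * + * **Example:** Extractive question answering (a sketch; the model id below is hypothetical, so substitute a BERT checkpoint fine-tuned for QA and converted for Transformers.js). + * + * ```javascript + * import { AutoTokenizer, BertForQuestionAnswering } from '@huggingface/transformers'; + * + * const model_id = 'your-org/bert-finetuned-squad'; // hypothetical checkpoint + * const tokenizer = await AutoTokenizer.from_pretrained(model_id); + * const model = await BertForQuestionAnswering.from_pretrained(model_id); + * + * const inputs = tokenizer('Who wrote Hamlet?', { text_pair: 'Hamlet was written by William Shakespeare.' }); + * const { start_logits, end_logits } = await model(inputs); + * ```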
+ */ +export class BertForQuestionAnswering extends BertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/blenderbot-pre-trained-model.js b/src/models/pre-trained-models/blenderbot-pre-trained-model.js new file mode 100644 index 000000000..49f6cb03a --- /dev/null +++ b/src/models/pre-trained-models/blenderbot-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class BlenderbotPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Blenderbot Model outputting raw hidden-states without any specific head on top. + */ +export class BlenderbotModel extends BlenderbotPreTrainedModel {} + +/** + * The Blenderbot Model with a language modeling head. Can be used for summarization. + */ +export class BlenderbotForConditionalGeneration extends BlenderbotPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/blenderbot-small-pre-trained-model.js b/src/models/pre-trained-models/blenderbot-small-pre-trained-model.js new file mode 100644 index 000000000..5c7e940d3 --- /dev/null +++ b/src/models/pre-trained-models/blenderbot-small-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class BlenderbotSmallPreTrainedModel extends PreTrainedModel {} + +/** + * The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top. + */ +export class BlenderbotSmallModel extends BlenderbotSmallPreTrainedModel {} + +/** + * The BlenderbotSmall Model with a language modeling head. Can be used for summarization. + */ +export class BlenderbotSmallForConditionalGeneration extends BlenderbotSmallPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/bloom-pre-trained-model.js b/src/models/pre-trained-models/bloom-pre-trained-model.js new file mode 100644 index 000000000..e313a507c --- /dev/null +++ b/src/models/pre-trained-models/bloom-pre-trained-model.js @@ -0,0 +1,16 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). + */ +export class BloomPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Bloom Model transformer outputting raw hidden-states without any specific head on top. + */ +export class BloomModel extends BloomPreTrainedModel {} + +/** + * The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). 
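+ * + * **Example:** Text generation with `BloomForCausalLM` (a sketch; the model id is illustrative). + * + * ```javascript + * import { AutoTokenizer, BloomForCausalLM } from '@huggingface/transformers'; + * + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bloom-560m'); + * const model = await BloomForCausalLM.from_pretrained('Xenova/bloom-560m', { dtype: 'q8' }); + * + * const inputs = tokenizer('Once upon a time,'); + * const outputs = await model.generate({ ...inputs, max_new_tokens: 30, do_sample: true, temperature: 0.8 }); + * const [text] = tokenizer.batch_decode(outputs, { skip_special_tokens: true }); + * ```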
+ */ +export class BloomForCausalLM extends BloomPreTrainedModel {} diff --git a/src/models/pre-trained-models/camembert-pre-trained-model.js b/src/models/pre-trained-models/camembert-pre-trained-model.js new file mode 100644 index 000000000..2a944d700 --- /dev/null +++ b/src/models/pre-trained-models/camembert-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class CamembertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top. + */ +export class CamembertModel extends CamembertPreTrainedModel {} + +/** + * CamemBERT Model with a `language modeling` head on top. + */ +export class CamembertForMaskedLM extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. + */ +export class CamembertForSequenceClassification extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class CamembertForTokenClassification extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * CamemBERT Model with a span classification head on top for extractive question-answering tasks + */ +export class CamembertForQuestionAnswering extends CamembertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. 
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/chatterbox-pre-trained-model.js b/src/models/pre-trained-models/chatterbox-pre-trained-model.js new file mode 100644 index 000000000..48af278ac --- /dev/null +++ b/src/models/pre-trained-models/chatterbox-pre-trained-model.js @@ -0,0 +1,153 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { decoderForward } from '../utils.js'; +import { cat, ones, full, Tensor } from '../../utils/tensor.js'; + +export class ChatterboxPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'position_ids', + 'audio_values', + 'exaggeration', + 'audio_features', + 'audio_tokens', + 'speaker_embeddings', + 'speaker_features', + 'past_key_values', + ]; + main_input_name = 'input_ids'; + + _return_dict_in_generate_keys = ['audio_tokens', 'speaker_embeddings', 'speaker_features']; +} +export class ChatterboxModel extends ChatterboxPreTrainedModel { + /** + * @param {Tensor} audio_values + * @returns {Promise<{audio_features: Tensor, audio_tokens: Tensor, speaker_embeddings: Tensor, speaker_features: Tensor}>} + */ + async encode_speech(audio_values) { + return sessionRun(this.sessions['speech_encoder'], { + audio_values, + }); + } + + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + audio_values = null, + exaggeration = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // Speaker embeddings/features (useful for re-using pre-computed speaker data) + audio_features = null, // float32[batch_size,sequence_length,1024] + audio_tokens = null, // int64[batch_size,audio_sequence_length] + speaker_embeddings = null, // float32[batch_size,192] + speaker_features = null, // float32[batch_size,feature_dim,80] + + // TODO: needed? + ...kwargs + }) { + let speech_encoder_outputs; + if (!inputs_embeds) { + const expected_inputs = this.sessions['embed_tokens'].inputNames; + const embed_model_inputs = { input_ids }; + if (expected_inputs.includes('exaggeration')) { + // Support the following types for exaggeration: + // 1. null/undefined (no exaggeration): use the default of 0.5 + // 2. number: broadcast to (batch_size,) + // 3. number[]: convert to Tensor of shape (batch_size,) + // 4. 
Tensor of shape (batch_size, 1) + if (!(exaggeration instanceof Tensor)) { + const batch_size = input_ids.dims[0]; + if (exaggeration == null) { + exaggeration = full([batch_size], 0.5); + } else if (typeof exaggeration === 'number') { + exaggeration = full([batch_size], exaggeration); + } else if (Array.isArray(exaggeration)) { + exaggeration = new Tensor('float32', exaggeration, [batch_size]); + } else { + throw new Error('Unsupported type for `exaggeration` input'); + } + } + embed_model_inputs.exaggeration = exaggeration; + } + if (expected_inputs.includes('position_ids')) { + embed_model_inputs.position_ids = position_ids; + } + + ({ inputs_embeds } = await sessionRun(this.sessions['embed_tokens'], embed_model_inputs)); + + if (audio_features && audio_tokens && speaker_embeddings && speaker_features) { + // Use pre-computed speech encoder outputs + speech_encoder_outputs = { audio_features, audio_tokens, speaker_embeddings, speaker_features }; + } + + if (speech_encoder_outputs || audio_values) { + speech_encoder_outputs ??= await this.encode_speech(audio_values); + + // Update LLM inputs + inputs_embeds = cat([speech_encoder_outputs.audio_features, inputs_embeds], 1); + attention_mask = ones([inputs_embeds.dims[0], inputs_embeds.dims[1]]); + } else { + const target_length = inputs_embeds.dims[1]; + if (!past_key_values || target_length !== 1) { + throw new Error('Incorrect state encountered during generation.'); + } + const past_length = Object.values(past_key_values)[0].dims.at(-2); + attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]); + } + } + + const outputs = await decoderForward( + this, + { + inputs_embeds, + past_key_values, + attention_mask, + generation_config, + logits_processor, + }, + false, + ); + return { + ...outputs, + ...speech_encoder_outputs, + }; + } + + /** @type {PreTrainedModel['generate']} */ + async generate(params) { + const { sequences, audio_tokens, speaker_embeddings, speaker_features } = /** @type {any} */ ( + await super.generate({ + ...params, + return_dict_in_generate: true, + }) + ); + + const new_tokens = sequences.slice(null, [ + params.input_ids.dims[1], // Exclude start of speech token + -1, // Exclude end of speech token + ]); + + const SILENCE_TOKEN = 4299n; + const silence_tokens = full([new_tokens.dims[0], 3], SILENCE_TOKEN); // Add 3 silence tokens + const speech_tokens = cat([audio_tokens, new_tokens, silence_tokens], 1); + + const { waveform } = await sessionRun(this.sessions['conditional_decoder'], { + speech_tokens, + speaker_features, + speaker_embeddings, + }); + return waveform; + } +} diff --git a/src/models/pre-trained-models/chinese-clip-pre-trained-model.js b/src/models/pre-trained-models/chinese-clip-pre-trained-model.js new file mode 100644 index 000000000..bbb2047eb --- /dev/null +++ b/src/models/pre-trained-models/chinese-clip-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ChineseCLIPPreTrainedModel extends PreTrainedModel {} + +export class ChineseCLIPModel extends ChineseCLIPPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/clap-pre-trained-model.js b/src/models/pre-trained-models/clap-pre-trained-model.js new file mode 100644 index 000000000..debadd605 --- /dev/null +++ b/src/models/pre-trained-models/clap-pre-trained-model.js @@ -0,0 +1,79 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ClapPreTrainedModel extends PreTrainedModel {} + +export class ClapModel 
extends ClapPreTrainedModel {} + +/** + * CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output). + * + * **Example:** Compute text embeddings with `ClapTextModelWithProjection`. + * + * ```javascript + * import { AutoTokenizer, ClapTextModelWithProjection } from '@huggingface/transformers'; + * + * // Load tokenizer and text model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clap-htsat-unfused'); + * const text_model = await ClapTextModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); + * + * // Run tokenization + * const texts = ['a sound of a cat', 'a sound of a dog']; + * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Compute embeddings + * const { text_embeds } = await text_model(text_inputs); + * // Tensor { + * // dims: [ 2, 512 ], + * // type: 'float32', + * // data: Float32Array(1024) [ ... ], + * // size: 1024 + * // } + * ``` + */ +export class ClapTextModelWithProjection extends ClapPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output). + * + * **Example:** Compute audio embeddings with `ClapAudioModelWithProjection`. + * + * ```javascript + * import { AutoProcessor, ClapAudioModelWithProjection, read_audio } from '@huggingface/transformers'; + * + * // Load processor and audio model + * const processor = await AutoProcessor.from_pretrained('Xenova/clap-htsat-unfused'); + * const audio_model = await ClapAudioModelWithProjection.from_pretrained('Xenova/clap-htsat-unfused'); + * + * // Read audio and run processor + * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav'); + * const audio_inputs = await processor(audio); + * + * // Compute embeddings + * const { audio_embeds } = await audio_model(audio_inputs); + * // Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [ ... ], + * // size: 512 + * // } + * ``` + */ +export class ClapAudioModelWithProjection extends ClapPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'audio_model', + }); + } +} diff --git a/src/models/pre-trained-models/clip-pre-trained-model.js b/src/models/pre-trained-models/clip-pre-trained-model.js new file mode 100644 index 000000000..34376da40 --- /dev/null +++ b/src/models/pre-trained-models/clip-pre-trained-model.js @@ -0,0 +1,150 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class CLIPPreTrainedModel extends PreTrainedModel {} + +/** + * CLIP Text and Vision Model with a projection layers on top + * + * **Example:** Perform zero-shot image classification with a `CLIPModel`. 
+ * + * ```javascript + * import { AutoTokenizer, AutoProcessor, CLIPModel, RawImage } from '@huggingface/transformers'; + * + * // Load tokenizer, processor, and model + * let tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); + * let processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); + * let model = await CLIPModel.from_pretrained('Xenova/clip-vit-base-patch16'); + * + * // Run tokenization + * let texts = ['a photo of a car', 'a photo of a football match'] + * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Read image and run processor + * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * let image_inputs = await processor(image); + * + * // Run model with both text and pixel inputs + * let output = await model({ ...text_inputs, ...image_inputs }); + * // { + * // logits_per_image: Tensor { + * // dims: [ 1, 2 ], + * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], + * // }, + * // logits_per_text: Tensor { + * // dims: [ 2, 1 ], + * // data: Float32Array(2) [ 18.579734802246094, 24.31830596923828 ], + * // }, + * // text_embeds: Tensor { + * // dims: [ 2, 512 ], + * // data: Float32Array(1024) [ ... ], + * // }, + * // image_embeds: Tensor { + * // dims: [ 1, 512 ], + * // data: Float32Array(512) [ ... ], + * // } + * // } + * ``` + */ +export class CLIPModel extends CLIPPreTrainedModel {} + +/** + * The text model from CLIP without any head or projection on top. + */ +export class CLIPTextModel extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output) + * + * **Example:** Compute text embeddings with `CLIPTextModelWithProjection`. + * + * ```javascript + * import { AutoTokenizer, CLIPTextModelWithProjection } from '@huggingface/transformers'; + * + * // Load tokenizer and text model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16'); + * const text_model = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); + * + * // Run tokenization + * let texts = ['a photo of a car', 'a photo of a football match']; + * let text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Compute embeddings + * const { text_embeds } = await text_model(text_inputs); + * // Tensor { + * // dims: [ 2, 512 ], + * // type: 'float32', + * // data: Float32Array(1024) [ ... ], + * // size: 1024 + * // } + * ``` + */ +export class CLIPTextModelWithProjection extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * The vision model from CLIP without any head or projection on top. 
+ */ +export class CLIPVisionModel extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'vision_model', + }); + } +} + +/** + * CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output) + * + * **Example:** Compute vision embeddings with `CLIPVisionModelWithProjection`. + * + * ```javascript + * import { AutoProcessor, CLIPVisionModelWithProjection, RawImage} from '@huggingface/transformers'; + * + * // Load processor and vision model + * const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16'); + * const vision_model = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16'); + * + * // Read image and run processor + * let image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * let image_inputs = await processor(image); + * + * // Compute embeddings + * const { image_embeds } = await vision_model(image_inputs); + * // Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [ ... ], + * // size: 512 + * // } + * ``` + */ +export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'vision_model', + }); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/clip-seg-pre-trained-model.js b/src/models/pre-trained-models/clip-seg-pre-trained-model.js new file mode 100644 index 000000000..386a7a1fe --- /dev/null +++ b/src/models/pre-trained-models/clip-seg-pre-trained-model.js @@ -0,0 +1,53 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class CLIPSegPreTrainedModel extends PreTrainedModel {} + +export class CLIPSegModel extends CLIPSegPreTrainedModel {} + +/** + * CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation. + * + * **Example:** Perform zero-shot image segmentation with a `CLIPSegForImageSegmentation` model. 
+ * + * ```javascript + * import { AutoTokenizer, AutoProcessor, CLIPSegForImageSegmentation, RawImage } from '@huggingface/transformers'; + * + * // Load tokenizer, processor, and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clipseg-rd64-refined'); + * const processor = await AutoProcessor.from_pretrained('Xenova/clipseg-rd64-refined'); + * const model = await CLIPSegForImageSegmentation.from_pretrained('Xenova/clipseg-rd64-refined'); + * + * // Run tokenization + * const texts = ['a glass', 'something to fill', 'wood', 'a jar']; + * const text_inputs = tokenizer(texts, { padding: true, truncation: true }); + * + * // Read image and run processor + * const image = await RawImage.read('https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true'); + * const image_inputs = await processor(image); + * + * // Run model with both text and pixel inputs + * const { logits } = await model({ ...text_inputs, ...image_inputs }); + * // logits: Tensor { + * // dims: [4, 352, 352], + * // type: 'float32', + * // data: Float32Array(495616) [ ... ], + * // size: 495616 + * // } + * ``` + * + * You can visualize the predictions as follows: + * ```javascript + * const preds = logits + * .unsqueeze_(1) + * .sigmoid_() + * .mul_(255) + * .round_() + * .to('uint8'); + * + * for (let i = 0; i < preds.dims[0]; ++i) { + * const img = RawImage.fromTensor(preds[i]); + * img.save(`prediction_${i}.png`); + * } + * ``` + */ +export class CLIPSegForImageSegmentation extends CLIPSegPreTrainedModel {} diff --git a/src/models/pre-trained-models/code-gen-pre-trained-model.js b/src/models/pre-trained-models/code-gen-pre-trained-model.js new file mode 100644 index 000000000..3ea4a6f5b --- /dev/null +++ b/src/models/pre-trained-models/code-gen-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class CodeGenPreTrainedModel extends PreTrainedModel {} + +/** + * CodeGenModel is a class representing a code generation model without a language model head. + */ +export class CodeGenModel extends CodeGenPreTrainedModel {} + +/** + * CodeGenForCausalLM is a class that represents a code generation model based on the GPT-2 architecture. It extends the `CodeGenPreTrainedModel` class. + */ +export class CodeGenForCausalLM extends CodeGenPreTrainedModel {} diff --git a/src/models/pre-trained-models/cohere-pre-trained-model.js b/src/models/pre-trained-models/cohere-pre-trained-model.js new file mode 100644 index 000000000..b3b1ccb89 --- /dev/null +++ b/src/models/pre-trained-models/cohere-pre-trained-model.js @@ -0,0 +1,9 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Cohere Model outputting raw hidden-states without any specific head on top. 
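+ *
+ * **Example:** Text generation with `CohereForCausalLM` via the `pipeline` API (a minimal sketch; the model id below is a placeholder and assumes an ONNX-converted Cohere checkpoint is available).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Placeholder model id (assumption): substitute any ONNX-converted Cohere checkpoint
+ * const generator = await pipeline('text-generation', 'your-org/cohere-checkpoint-ONNX');
+ *
+ * const output = await generator('The capital of France is', { max_new_tokens: 20 });
+ * // [{ generated_text: 'The capital of France is ...' }]
+ * ```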
+ */ +export class CoherePreTrainedModel extends PreTrainedModel {} +export class CohereModel extends CoherePreTrainedModel {} + +export class CohereForCausalLM extends CoherePreTrainedModel {} diff --git a/src/models/pre-trained-models/conv-bert-pre-trained-model.js b/src/models/pre-trained-models/conv-bert-pre-trained-model.js new file mode 100644 index 000000000..9d049d790 --- /dev/null +++ b/src/models/pre-trained-models/conv-bert-pre-trained-model.js @@ -0,0 +1,76 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class ConvBertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ConvBERT Model transformer outputting raw hidden-states without any specific head on top. + */ +export class ConvBertModel extends ConvBertPreTrainedModel {} + +/** + * ConvBERT Model with a language modeling head on top. + */ +export class ConvBertForMaskedLM extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * ConvBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class ConvBertForSequenceClassification extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ConvBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) + * e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class ConvBertForTokenClassification extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ConvBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD + * (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`) + */ +export class ConvBertForQuestionAnswering extends ConvBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering.
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/conv-next-pre-trained-model.js b/src/models/pre-trained-models/conv-next-pre-trained-model.js new file mode 100644 index 000000000..d8e24875a --- /dev/null +++ b/src/models/pre-trained-models/conv-next-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ConvNextPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ConvNext model outputting raw features without any specific head on top. + */ +export class ConvNextModel extends ConvNextPreTrainedModel {} + +/** + * ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. + */ +export class ConvNextForImageClassification extends ConvNextPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/conv-next-v2-pre-trained-model.js b/src/models/pre-trained-models/conv-next-v2-pre-trained-model.js new file mode 100644 index 000000000..1dbb28859 --- /dev/null +++ b/src/models/pre-trained-models/conv-next-v2-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ConvNextV2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare ConvNextV2 model outputting raw features without any specific head on top. + */ +export class ConvNextV2Model extends ConvNextV2PreTrainedModel {} + +/** + * ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. 
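+ *
+ * **Example:** Image classification with `ConvNextV2ForImageClassification` (a sketch following the processor/model pattern used elsewhere in this file set; `Xenova/convnextv2-tiny-1k-224` is assumed to be an available ONNX conversion).
+ * ```javascript
+ * import { AutoProcessor, ConvNextV2ForImageClassification, RawImage } from '@huggingface/transformers';
+ *
+ * // Load processor and model (assumed checkpoint)
+ * const processor = await AutoProcessor.from_pretrained('Xenova/convnextv2-tiny-1k-224');
+ * const model = await ConvNextV2ForImageClassification.from_pretrained('Xenova/convnextv2-tiny-1k-224');
+ *
+ * // Read image, preprocess, and classify
+ * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
+ * const inputs = await processor(image);
+ * const { logits } = await model(inputs);
+ * // Tensor of shape [ 1, num_labels ]; take the argmax for the predicted class id
+ * ```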
+ */ +export class ConvNextV2ForImageClassification extends ConvNextV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/d-fine-pre-trained-model.js b/src/models/pre-trained-models/d-fine-pre-trained-model.js new file mode 100644 index 000000000..73c601435 --- /dev/null +++ b/src/models/pre-trained-models/d-fine-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { RTDetrObjectDetectionOutput } from './rt-detr-pre-trained-model.js'; + +export class DFinePreTrainedModel extends PreTrainedModel {} +export class DFineModel extends DFinePreTrainedModel {} +export class DFineForObjectDetection extends DFinePreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/dac-pre-trained-model.js b/src/models/pre-trained-models/dac-pre-trained-model.js new file mode 100644 index 000000000..3b940752c --- /dev/null +++ b/src/models/pre-trained-models/dac-pre-trained-model.js @@ -0,0 +1,54 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { DacEncoderOutput, DacDecoderOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class DacPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_values'; + forward_params = ['input_values']; +} + +/** + * The DAC (Descript Audio Codec) model. + */ +export class DacModel extends DacPreTrainedModel { + /** + * Encodes the input audio waveform into discrete codes. + * @param {Object} inputs Model inputs + * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). + * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. + */ + async encode(inputs) { + return new DacEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); + } + + /** + * Decodes the given frames into an output audio waveform. + * @param {DacEncoderOutput} inputs The encoded audio codes. + * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. + */ + async decode(inputs) { + return new DacDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); + } +} + +export class DacEncoderModel extends DacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'encoder_model', + }); + } +} +export class DacDecoderModel extends DacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 
'decoder_model', + }); + } +} diff --git a/src/models/pre-trained-models/deberta-pre-trained-model.js b/src/models/pre-trained-models/deberta-pre-trained-model.js new file mode 100644 index 000000000..5542efd27 --- /dev/null +++ b/src/models/pre-trained-models/deberta-pre-trained-model.js @@ -0,0 +1,75 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class DebertaPreTrainedModel extends PreTrainedModel {} + +/** + * The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top. + */ +export class DebertaModel extends DebertaPreTrainedModel {} + +/** + * DeBERTa Model with a `language modeling` head on top. + */ +export class DebertaForMaskedLM extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class DebertaForSequenceClassification extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class DebertaForTokenClassification extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + * layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class DebertaForQuestionAnswering extends DebertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering.
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/deberta-v2-pre-trained-model.js b/src/models/pre-trained-models/deberta-v2-pre-trained-model.js new file mode 100644 index 000000000..ec4ef080e --- /dev/null +++ b/src/models/pre-trained-models/deberta-v2-pre-trained-model.js @@ -0,0 +1,75 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class DebertaV2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare DeBERTa-V2 Model transformer outputting raw hidden-states without any specific head on top. + */ +export class DebertaV2Model extends DebertaV2PreTrainedModel {} + +/** + * DeBERTa-V2 Model with a `language modeling` head on top. + */ +export class DebertaV2ForMaskedLM extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa-V2 Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class DebertaV2ForSequenceClassification extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa-V2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class DebertaV2ForTokenClassification extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DeBERTa-V2 Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + * layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class DebertaV2ForQuestionAnswering extends DebertaV2PreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering.
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/decision-transformer-pre-trained-model.js b/src/models/pre-trained-models/decision-transformer-pre-trained-model.js new file mode 100644 index 000000000..fb1dbc6be --- /dev/null +++ b/src/models/pre-trained-models/decision-transformer-pre-trained-model.js @@ -0,0 +1,9 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DecisionTransformerPreTrainedModel extends PreTrainedModel {} + +/** + * The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting. + * Refer to the paper for more details: https://huggingface.co/papers/2106.01345 + */ +export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel {} diff --git a/src/models/pre-trained-models/dei-t-pre-trained-model.js b/src/models/pre-trained-models/dei-t-pre-trained-model.js new file mode 100644 index 000000000..46e0f295f --- /dev/null +++ b/src/models/pre-trained-models/dei-t-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class DeiTPreTrainedModel extends PreTrainedModel {} +export class DeiTModel extends DeiTPreTrainedModel {} +export class DeiTForImageClassification extends DeiTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/depth-anything-pre-trained-model.js b/src/models/pre-trained-models/depth-anything-pre-trained-model.js new file mode 100644 index 000000000..0b0124f72 --- /dev/null +++ b/src/models/pre-trained-models/depth-anything-pre-trained-model.js @@ -0,0 +1,8 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DepthAnythingPreTrainedModel extends PreTrainedModel {} + +/** + * Depth Anything Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. 
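+ *
+ * **Example:** Depth estimation with the `depth-estimation` pipeline (a sketch; `Xenova/depth-anything-small-hf` is assumed to be an available ONNX conversion, and the exact output fields may differ).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Create a depth-estimation pipeline (assumed checkpoint)
+ * const depth_estimator = await pipeline('depth-estimation', 'Xenova/depth-anything-small-hf');
+ *
+ * // Predict the depth map for an image
+ * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
+ * const { predicted_depth, depth } = await depth_estimator(url);
+ * // `predicted_depth` is the raw Tensor; `depth` is a RawImage visualization
+ * ```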
+ */ +export class DepthAnythingForDepthEstimation extends DepthAnythingPreTrainedModel {} diff --git a/src/models/pre-trained-models/depth-pro-pre-trained-model.js b/src/models/pre-trained-models/depth-pro-pre-trained-model.js new file mode 100644 index 000000000..1f19c48bf --- /dev/null +++ b/src/models/pre-trained-models/depth-pro-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DepthProPreTrainedModel extends PreTrainedModel {} +export class DepthProForDepthEstimation extends DepthProPreTrainedModel {} diff --git a/src/models/pre-trained-models/detr-pre-trained-model.js b/src/models/pre-trained-models/detr-pre-trained-model.js new file mode 100644 index 000000000..e4797f769 --- /dev/null +++ b/src/models/pre-trained-models/detr-pre-trained-model.js @@ -0,0 +1,54 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ModelOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class DetrPreTrainedModel extends PreTrainedModel {} +export class DetrModel extends DetrPreTrainedModel {} +export class DetrForObjectDetection extends DetrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new DetrObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class DetrForSegmentation extends DetrPreTrainedModel { + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + async _call(model_inputs) { + return new DetrSegmentationOutput(await super._call(model_inputs)); + } +} + +export class DetrObjectDetectionOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). + */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } +} + +export class DetrSegmentationOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits The output logits of the model. + * @param {Tensor} output.pred_boxes Predicted boxes. + * @param {Tensor} output.pred_masks Predicted masks. 
+ */ + constructor({ logits, pred_boxes, pred_masks }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + this.pred_masks = pred_masks; + } +} diff --git a/src/models/pre-trained-models/din-ov3-conv-next-pre-trained-model.js b/src/models/pre-trained-models/din-ov3-conv-next-pre-trained-model.js new file mode 100644 index 000000000..793b84ab8 --- /dev/null +++ b/src/models/pre-trained-models/din-ov3-conv-next-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DINOv3ConvNextPreTrainedModel extends PreTrainedModel {} +export class DINOv3ConvNextModel extends DINOv3ConvNextPreTrainedModel {} diff --git a/src/models/pre-trained-models/din-ov3-vi-t-pre-trained-model.js b/src/models/pre-trained-models/din-ov3-vi-t-pre-trained-model.js new file mode 100644 index 000000000..c5839fe56 --- /dev/null +++ b/src/models/pre-trained-models/din-ov3-vi-t-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DINOv3ViTPreTrainedModel extends PreTrainedModel {} +export class DINOv3ViTModel extends DINOv3ViTPreTrainedModel {} diff --git a/src/models/pre-trained-models/dinov2-pre-trained-model.js b/src/models/pre-trained-models/dinov2-pre-trained-model.js new file mode 100644 index 000000000..f85f191af --- /dev/null +++ b/src/models/pre-trained-models/dinov2-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class Dinov2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Dinov2Model extends Dinov2PreTrainedModel {} + +/** + * Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. + */ +export class Dinov2ForImageClassification extends Dinov2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/dinov2-with-registers-pre-trained-model.js b/src/models/pre-trained-models/dinov2-with-registers-pre-trained-model.js new file mode 100644 index 000000000..07dc51777 --- /dev/null +++ b/src/models/pre-trained-models/dinov2-with-registers-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class Dinov2WithRegistersPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Dinov2WithRegistersModel extends Dinov2WithRegistersPreTrainedModel {} + +/** + * Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet. 
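+ *
+ * **Example:** Image classification with `Dinov2WithRegistersForImageClassification` (a sketch; the model id is a placeholder for an ONNX-converted checkpoint fine-tuned with a classification head).
+ * ```javascript
+ * import { AutoProcessor, Dinov2WithRegistersForImageClassification, RawImage } from '@huggingface/transformers';
+ *
+ * // Placeholder model id (assumption)
+ * const model_id = 'your-org/dinov2-with-registers-imagenet-ONNX';
+ * const processor = await AutoProcessor.from_pretrained(model_id);
+ * const model = await Dinov2WithRegistersForImageClassification.from_pretrained(model_id);
+ *
+ * // Read image, preprocess, and classify
+ * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
+ * const inputs = await processor(image);
+ * const { logits } = await model(inputs);
+ * // Tensor of shape [ 1, num_labels ]
+ * ```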
+ */ +export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegistersPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/distil-bert-pre-trained-model.js b/src/models/pre-trained-models/distil-bert-pre-trained-model.js new file mode 100644 index 000000000..42ee40608 --- /dev/null +++ b/src/models/pre-trained-models/distil-bert-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class DistilBertPreTrainedModel extends PreTrainedModel {} +export class DistilBertModel extends DistilBertPreTrainedModel {} + +/** + * DistilBertForSequenceClassification is a class representing a DistilBERT model for sequence classification. + */ +export class DistilBertForSequenceClassification extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DistilBertForTokenClassification is a class representing a DistilBERT model for token classification. + */ +export class DistilBertForTokenClassification extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * DistilBertForQuestionAnswering is a class representing a DistilBERT model for question answering. + */ +export class DistilBertForQuestionAnswering extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} + +/** + * DistilBertForMaskedLM is a class representing a DistilBERT model for masking task. + */ +export class DistilBertForMaskedLM extends DistilBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/donut-swin-pre-trained-model.js b/src/models/pre-trained-models/donut-swin-pre-trained-model.js new file mode 100644 index 000000000..b1bacbd1c --- /dev/null +++ b/src/models/pre-trained-models/donut-swin-pre-trained-model.js @@ -0,0 +1,79 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DonutSwinPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Donut Swin Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Step-by-step Document Parsing. 
+ * + * ```javascript + * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; + * + * // Choose model to use + * const model_id = 'Xenova/donut-base-finetuned-cord-v2'; + * + * // Prepare image inputs + * const processor = await AutoProcessor.from_pretrained(model_id); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/receipt.png'; + * const image = await RawImage.read(url); + * const image_inputs = await processor(image); + * + * // Prepare decoder inputs + * const tokenizer = await AutoTokenizer.from_pretrained(model_id); + * const task_prompt = '<s_cord-v2>'; + * const decoder_input_ids = tokenizer(task_prompt, { + * add_special_tokens: false, + * }).input_ids; + * + * // Create the model + * const model = await AutoModelForVision2Seq.from_pretrained(model_id); + * + * // Run inference + * const output = await model.generate(image_inputs.pixel_values, { + * decoder_input_ids, + * max_length: model.config.decoder.max_position_embeddings, + * }); + * + * // Decode output + * const decoded = tokenizer.batch_decode(output)[0]; + * // CINNAMON SUGAR 17,000 1 x 17,000 17,000 17,000 20,000 3,000 + * ``` + * + * **Example:** Step-by-step Document Visual Question Answering (DocVQA) + * + * ```javascript + * import { AutoProcessor, AutoTokenizer, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers'; + * + * // Choose model to use + * const model_id = 'Xenova/donut-base-finetuned-docvqa'; + * + * // Prepare image inputs + * const processor = await AutoProcessor.from_pretrained(model_id); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/invoice.png'; + * const image = await RawImage.read(url); + * const image_inputs = await processor(image); + * + * // Prepare decoder inputs + * const tokenizer = await AutoTokenizer.from_pretrained(model_id); + * const question = 'What is the invoice number?'; + * const task_prompt = `<s_docvqa><s_question>${question}</s_question><s_answer>`; + * const decoder_input_ids = tokenizer(task_prompt, { + * add_special_tokens: false, + * }).input_ids; + * + * // Create the model + * const model = await AutoModelForVision2Seq.from_pretrained(model_id); + * + * // Run inference + * const output = await model.generate(image_inputs.pixel_values, { + * decoder_input_ids, + * max_length: model.config.decoder.max_position_embeddings, + * }); + * + * // Decode output + * const decoded = tokenizer.batch_decode(output)[0]; + * // What is the invoice number? us-001 + * ``` + */ +export class DonutSwinModel extends DonutSwinPreTrainedModel {} diff --git a/src/models/pre-trained-models/dpt-pre-trained-model.js b/src/models/pre-trained-models/dpt-pre-trained-model.js new file mode 100644 index 000000000..72ee11167 --- /dev/null +++ b/src/models/pre-trained-models/dpt-pre-trained-model.js @@ -0,0 +1,51 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class DPTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare DPT Model transformer outputting raw hidden-states without any specific head on top. + */ +export class DPTModel extends DPTPreTrainedModel {} + +/** + * DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. + * + * **Example:** Depth estimation w/ `Xenova/dpt-hybrid-midas`.
+ * ```javascript + * import { DPTForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; + * + * // Load model and processor + * const model_id = 'Xenova/dpt-hybrid-midas'; + * const model = await DPTForDepthEstimation.from_pretrained(model_id); + * const processor = await AutoProcessor.from_pretrained(model_id); + * + * // Load image from URL + * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; + * const image = await RawImage.read(url); + * + * // Prepare image for the model + * const inputs = await processor(image); + * + * // Run model + * const { predicted_depth } = await model(inputs); + * + * // Interpolate to original size + * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { + * size: image.size.reverse(), + * mode: 'bilinear', + * })).squeeze(1); + * + * // Visualize the prediction + * const min = prediction.min().item(); + * const max = prediction.max().item(); + * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); + * const depth = RawImage.fromTensor(formatted); + * // RawImage { + * // data: Uint8Array(307200) [ 85, 85, 84, ... ], + * // width: 640, + * // height: 480, + * // channels: 1 + * // } + * ``` + */ +export class DPTForDepthEstimation extends DPTPreTrainedModel {} diff --git a/src/models/pre-trained-models/efficient-net-pre-trained-model.js b/src/models/pre-trained-models/efficient-net-pre-trained-model.js new file mode 100644 index 000000000..b5141e704 --- /dev/null +++ b/src/models/pre-trained-models/efficient-net-pre-trained-model.js @@ -0,0 +1,21 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class EfficientNetPreTrainedModel extends PreTrainedModel {} + +/** + * The bare EfficientNet model outputting raw features without any specific head on top. + */ +export class EfficientNetModel extends EfficientNetPreTrainedModel {} + +/** + * EfficientNet Model with an image classification head on top (a linear layer on top of the pooled features). + */ +export class EfficientNetForImageClassification extends EfficientNetPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/electra-pre-trained-model.js b/src/models/pre-trained-models/electra-pre-trained-model.js new file mode 100644 index 000000000..104ff2b45 --- /dev/null +++ b/src/models/pre-trained-models/electra-pre-trained-model.js @@ -0,0 +1,77 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class ElectraPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Electra Model transformer outputting raw hidden-states without any specific head on top. + * Identical to the BERT model except that it uses an additional linear layer between the embedding + * layer and the encoder if the hidden size and embedding size are different. + */ +export class ElectraModel extends ElectraPreTrainedModel {} +// TODO add ElectraForPreTraining +/** + * Electra model with a language modeling head on top. + */ +export class ElectraForMaskedLM extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * ELECTRA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class ElectraForSequenceClassification extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * Electra model with a token classification head on top. + */ +export class ElectraForTokenClassification extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD + * (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class ElectraForQuestionAnswering extends ElectraPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/ernie4_5_pretrained-model.js b/src/models/pre-trained-models/ernie4_5_pretrained-model.js new file mode 100644 index 000000000..d3e5dec1c --- /dev/null +++ b/src/models/pre-trained-models/ernie4_5_pretrained-model.js @@ -0,0 +1,7 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Ernie4_5_PretrainedModel extends PreTrainedModel {} + +export class Ernie4_5_Model extends Ernie4_5_PretrainedModel {} + +export class Ernie4_5_ForCausalLM extends Ernie4_5_PretrainedModel {} diff --git a/src/models/pre-trained-models/esm-pre-trained-model.js b/src/models/pre-trained-models/esm-pre-trained-model.js new file mode 100644 index 000000000..2e1c85718 --- /dev/null +++ b/src/models/pre-trained-models/esm-pre-trained-model.js @@ -0,0 +1,59 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class EsmPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ESM Model transformer outputting raw hidden-states without any specific head on top. + */ +export class EsmModel extends EsmPreTrainedModel {} + +/** + * ESM Model with a `language modeling` head on top. + */ +export class EsmForMaskedLM extends EsmPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling.
+ */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class EsmForSequenceClassification extends EsmPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * ESM Model with a token classification head on top (a linear layer on top of the hidden-states output) + * e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class EsmForTokenClassification extends EsmPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/exaone-pre-trained-model.js b/src/models/pre-trained-models/exaone-pre-trained-model.js new file mode 100644 index 000000000..0f8500668 --- /dev/null +++ b/src/models/pre-trained-models/exaone-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ExaonePreTrainedModel extends PreTrainedModel {} +export class ExaoneModel extends ExaonePreTrainedModel {} +export class ExaoneForCausalLM extends ExaonePreTrainedModel {} diff --git a/src/models/pre-trained-models/falcon-pre-trained-model.js b/src/models/pre-trained-models/falcon-pre-trained-model.js new file mode 100644 index 000000000..81aaf890c --- /dev/null +++ b/src/models/pre-trained-models/falcon-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Falcon Model outputting raw hidden-states without any specific head on top. 
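+ *
+ * **Example:** Text generation with `FalconForCausalLM` via the `pipeline` API (a sketch; the model id is a placeholder for an ONNX-converted Falcon checkpoint).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Placeholder model id (assumption): substitute an available ONNX-converted Falcon checkpoint
+ * const generator = await pipeline('text-generation', 'your-org/falcon-checkpoint-ONNX');
+ *
+ * const output = await generator('The tallest mountain in the world is', { max_new_tokens: 20 });
+ * // [{ generated_text: 'The tallest mountain in the world is ...' }]
+ * ```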
+ */ +export class FalconPreTrainedModel extends PreTrainedModel {} + +export class FalconModel extends FalconPreTrainedModel {} + +export class FalconForCausalLM extends FalconPreTrainedModel {} diff --git a/src/models/pre-trained-models/fast-vi-t-pre-trained-model.js b/src/models/pre-trained-models/fast-vi-t-pre-trained-model.js new file mode 100644 index 000000000..822def48f --- /dev/null +++ b/src/models/pre-trained-models/fast-vi-t-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class FastViTPreTrainedModel extends PreTrainedModel {} +export class FastViTModel extends FastViTPreTrainedModel {} +export class FastViTForImageClassification extends FastViTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/florence2-pre-trained-model.js b/src/models/pre-trained-models/florence2-pre-trained-model.js new file mode 100644 index 000000000..a9d3965ec --- /dev/null +++ b/src/models/pre-trained-models/florence2-pre-trained-model.js @@ -0,0 +1,115 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { cat, ones } from '../../utils/tensor.js'; +import { encoderForward, decoderForward } from '../utils.js'; + +export class Florence2PreTrainedModel extends PreTrainedModel { + forward_params = [ + // Encoder inputs + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'pixel_values', + + // Decoder inputs + 'encoder_outputs', + 'decoder_input_ids', + 'decoder_inputs_embeds', + 'decoder_attention_mask', + 'past_key_values', + ]; + main_input_name = 'inputs_embeds'; +} + +export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel { + _merge_input_ids_with_image_features({ inputs_embeds, image_features, input_ids, attention_mask }) { + return { + inputs_embeds: cat( + [ + image_features, // image embeds + inputs_embeds, // task prefix embeds + ], + 1, + ), + attention_mask: cat( + [ + ones(image_features.dims.slice(0, 2)), // image attention mask + attention_mask, // task prefix attention mask + ], + 1, + ), + }; + } + + async _prepare_inputs_embeds({ input_ids, pixel_values, inputs_embeds, attention_mask }) { + if (!input_ids && !pixel_values) { + throw new Error('Either `input_ids` or `pixel_values` should be provided.'); + } + + // 1. Possibly, extract the input embeddings + let text_features, image_features; + if (input_ids) { + text_features = await this.encode_text({ input_ids }); + } + if (pixel_values) { + image_features = await this.encode_image({ pixel_values }); + } + + // 2. 
Possibly, merge text and images + if (text_features && image_features) { + ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ + inputs_embeds: text_features, + image_features, + input_ids, + attention_mask, + })); + } else { + inputs_embeds = text_features || image_features; + } + + return { inputs_embeds, attention_mask }; + } + + async forward({ + input_ids, + pixel_values, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_outputs, + past_key_values, + + inputs_embeds, + decoder_inputs_embeds, + }) { + if (!inputs_embeds) { + ({ inputs_embeds, attention_mask } = await this._prepare_inputs_embeds({ + input_ids, + pixel_values, + inputs_embeds, + attention_mask, + })); + } + + if (!encoder_outputs) { + // Must compute encoder outputs + let { last_hidden_state } = await encoderForward(this, { inputs_embeds, attention_mask }); + encoder_outputs = last_hidden_state; + } + + if (!decoder_inputs_embeds) { + if (!decoder_input_ids) { + throw new Error('Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.'); + } + decoder_inputs_embeds = await this.encode_text({ input_ids: decoder_input_ids }); + } + + const decoderFeeds = { + inputs_embeds: decoder_inputs_embeds, + attention_mask: decoder_attention_mask, + encoder_attention_mask: attention_mask, + encoder_hidden_states: encoder_outputs, + past_key_values, + }; + return await decoderForward(this, decoderFeeds, true); + } +} diff --git a/src/models/pre-trained-models/gemma-pre-trained-model.js b/src/models/pre-trained-models/gemma-pre-trained-model.js new file mode 100644 index 000000000..1621b9718 --- /dev/null +++ b/src/models/pre-trained-models/gemma-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Gemma Model outputting raw hidden-states without any specific head on top. + */ +export class GemmaPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Gemma Model outputting raw hidden-states without any specific head on top. + */ +export class GemmaModel extends GemmaPreTrainedModel {} + +export class GemmaForCausalLM extends GemmaPreTrainedModel {} diff --git a/src/models/pre-trained-models/gemma2-pre-trained-model.js b/src/models/pre-trained-models/gemma2-pre-trained-model.js new file mode 100644 index 000000000..ac0d231b3 --- /dev/null +++ b/src/models/pre-trained-models/gemma2-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. + */ +export class Gemma2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Gemma2 Model outputting raw hidden-states without any specific head on top. + */ +export class Gemma2Model extends Gemma2PreTrainedModel {} + +export class Gemma2ForCausalLM extends Gemma2PreTrainedModel {} diff --git a/src/models/pre-trained-models/gemma3-pre-trained-model.js b/src/models/pre-trained-models/gemma3-pre-trained-model.js new file mode 100644 index 000000000..8bb9fc3b5 --- /dev/null +++ b/src/models/pre-trained-models/gemma3-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Gemma3 Model outputting raw hidden-states without any specific head on top. + */ +export class Gemma3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Gemma3 Model outputting raw hidden-states without any specific head on top. 
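+ *
+ * **Example:** Chat-style text generation with `Gemma3ForCausalLM` (a sketch; `onnx-community/gemma-3-1b-it-ONNX` is assumed to be an available conversion, and the output structure shown is indicative).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Create a text-generation pipeline (assumed checkpoint)
+ * const generator = await pipeline('text-generation', 'onnx-community/gemma-3-1b-it-ONNX');
+ *
+ * // The pipeline applies the chat template to message inputs
+ * const messages = [
+ *   { role: 'user', content: 'What is the capital of France?' },
+ * ];
+ * const output = await generator(messages, { max_new_tokens: 64 });
+ * // [{ generated_text: [ ..., { role: 'assistant', content: '...' } ] }]
+ * ```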
+ */ +export class Gemma3Model extends Gemma3PreTrainedModel {} + +export class Gemma3ForCausalLM extends Gemma3PreTrainedModel {} diff --git a/src/models/pre-trained-models/gemma3n-pre-trained-model.js b/src/models/pre-trained-models/gemma3n-pre-trained-model.js new file mode 100644 index 000000000..c90c5900e --- /dev/null +++ b/src/models/pre-trained-models/gemma3n-pre-trained-model.js @@ -0,0 +1,118 @@ +import { Tensor } from '../../utils/tensor.js'; +import { + decoderForward, + default_merge_input_ids_with_image_features, + default_merge_input_ids_with_audio_features, +} from '../utils.js'; +import { sessionRun } from '../session.js'; +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Gemma3nPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'attention_mask', + 'inputs_embeds', + 'per_layer_inputs', + + 'position_ids', + 'pixel_values', + 'input_features', + 'input_features_mask', + 'past_key_values', + ]; +} +export class Gemma3nForConditionalGeneration extends Gemma3nPreTrainedModel { + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + pixel_values = null, + input_features = null, + input_features_mask = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + per_layer_inputs = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // TODO: needed? + ...kwargs + }) { + if (!inputs_embeds || !per_layer_inputs) { + // 1. Extract the text embeddings. + ({ inputs_embeds, per_layer_inputs } = await sessionRun(this.sessions['embed_tokens'], { + input_ids, + })); + if (input_ids.dims[1] !== 1) { + if (pixel_values) { + // Encode the image + const { image_features } = await sessionRun(this.sessions['vision_encoder'], { + pixel_values, + }); + ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_image_features({ + image_features, + inputs_embeds, + input_ids, + attention_mask, + })); + } + + if (input_features) { + // Encode the audio + const { audio_features } = await sessionRun(this.sessions['audio_encoder'], { + input_features, + input_features_mask, + }); + ({ inputs_embeds, attention_mask } = this._merge_input_ids_with_audio_features({ + audio_features, + inputs_embeds, + input_ids, + attention_mask, + })); + } + } + } + + const outputs = await decoderForward( + this, + { + inputs_embeds, + per_layer_inputs, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, + true, + ); + return outputs; + } + + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_id, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } + _merge_input_ids_with_audio_features(kwargs) { + const audio_hidden_size = kwargs.audio_features.dims.at(-1); + const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); + + return default_merge_input_ids_with_audio_features({ + // @ts-ignore + audio_token_id: this.config.audio_token_id, + ...kwargs, + audio_features: reshaped_audio_features, + }); + } +} diff --git a/src/models/pre-trained-models/glm-pre-trained-model.js b/src/models/pre-trained-models/glm-pre-trained-model.js new file mode 100644 index 
000000000..dd49c5269 --- /dev/null +++ b/src/models/pre-trained-models/glm-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GlmPreTrainedModel extends PreTrainedModel {} +export class GlmModel extends GlmPreTrainedModel {} +export class GlmForCausalLM extends GlmPreTrainedModel {} diff --git a/src/models/pre-trained-models/glpn-pre-trained-model.js b/src/models/pre-trained-models/glpn-pre-trained-model.js new file mode 100644 index 000000000..975d2e4f6 --- /dev/null +++ b/src/models/pre-trained-models/glpn-pre-trained-model.js @@ -0,0 +1,47 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GLPNPreTrainedModel extends PreTrainedModel {} + +/** + * The bare GLPN encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. + */ +export class GLPNModel extends GLPNPreTrainedModel {} + +/** + * GLPN Model with a lightweight depth estimation head on top e.g. for KITTI, NYUv2. + * + * **Example:** Depth estimation w/ `Xenova/glpn-kitti`. + * ```javascript + * import { GLPNForDepthEstimation, AutoProcessor, RawImage, interpolate_4d } from '@huggingface/transformers'; + * + * // Load model and processor + * const model_id = 'Xenova/glpn-kitti'; + * const model = await GLPNForDepthEstimation.from_pretrained(model_id); + * const processor = await AutoProcessor.from_pretrained(model_id); + * + * // Load image from URL + * const url = 'http://images.cocodataset.org/val2017/000000039769.jpg'; + * const image = await RawImage.read(url); + * + * // Prepare image for the model + * const inputs = await processor(image); + * + * // Run model + * const { predicted_depth } = await model(inputs); + * + * // Interpolate to original size + * const prediction = (await interpolate_4d(predicted_depth.unsqueeze(1), { + * size: image.size.reverse(), + * mode: 'bilinear', + * })).squeeze(1); + * + * // Visualize the prediction + * const min = prediction.min().item(); + * const max = prediction.max().item(); + * const formatted = prediction.sub_(min).div_(max - min).mul_(255).to('uint8'); + * const depth = RawImage.fromTensor(formatted); + * // RawImage { + * // data: Uint8Array(307200) [ 85, 85, 84, ...
], + * // width: 640, + * // height: 480, + * // channels: 1 + * // } + * ``` + */ +export class GLPNForDepthEstimation extends GLPNPreTrainedModel {} diff --git a/src/models/pre-trained-models/gpt-big-code-pre-trained-model.js b/src/models/pre-trained-models/gpt-big-code-pre-trained-model.js new file mode 100644 index 000000000..d10288b5b --- /dev/null +++ b/src/models/pre-trained-models/gpt-big-code-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTBigCodePreTrainedModel extends PreTrainedModel {} +export class GPTBigCodeModel extends GPTBigCodePreTrainedModel {} + +export class GPTBigCodeForCausalLM extends GPTBigCodePreTrainedModel {} diff --git a/src/models/pre-trained-models/gpt-neo-pre-trained-model.js b/src/models/pre-trained-models/gpt-neo-pre-trained-model.js new file mode 100644 index 000000000..5934d5e80 --- /dev/null +++ b/src/models/pre-trained-models/gpt-neo-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTNeoPreTrainedModel extends PreTrainedModel {} +export class GPTNeoModel extends GPTNeoPreTrainedModel {} + +export class GPTNeoForCausalLM extends GPTNeoPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/gpt-neo-x-pre-trained-model.js b/src/models/pre-trained-models/gpt-neo-x-pre-trained-model.js new file mode 100644 index 000000000..d848106fc --- /dev/null +++ b/src/models/pre-trained-models/gpt-neo-x-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTNeoXPreTrainedModel extends PreTrainedModel {} +export class GPTNeoXModel extends GPTNeoXPreTrainedModel {} + +export class GPTNeoXForCausalLM extends GPTNeoXPreTrainedModel {} diff --git a/src/models/pre-trained-models/gpt-oss-pre-trained-model.js b/src/models/pre-trained-models/gpt-oss-pre-trained-model.js new file mode 100644 index 000000000..769e45c8f --- /dev/null +++ b/src/models/pre-trained-models/gpt-oss-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GptOssPreTrainedModel extends PreTrainedModel {} +export class GptOssModel extends GptOssPreTrainedModel {} +export class GptOssForCausalLM extends GptOssPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/gpt2-pre-trained-model.js b/src/models/pre-trained-models/gpt2-pre-trained-model.js new file mode 100644 index 000000000..0b5e09ce3 --- /dev/null +++ b/src/models/pre-trained-models/gpt2-pre-trained-model.js @@ -0,0 +1,12 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPT2PreTrainedModel extends PreTrainedModel {} + +export class GPT2Model extends GPT2PreTrainedModel {} + +/** + * GPT-2 language model head on top of the GPT-2 base model. This model is suitable for text generation tasks. 
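+ *
+ * **Example:** Text generation with `GPT2LMHeadModel` via the `text-generation` pipeline (a sketch; assumes the `Xenova/gpt2` ONNX conversion is available).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // Create a text-generation pipeline (assumed checkpoint)
+ * const generator = await pipeline('text-generation', 'Xenova/gpt2');
+ *
+ * const output = await generator('Once upon a time,', { max_new_tokens: 20 });
+ * // [{ generated_text: 'Once upon a time, ...' }]
+ * ```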
+ */ +export class GPT2LMHeadModel extends GPT2PreTrainedModel {} +// export class GPT2ForSequenceClassification extends GPT2PreTrainedModel { +// TODO \ No newline at end of file diff --git a/src/models/pre-trained-models/gptj-pre-trained-model.js b/src/models/pre-trained-models/gptj-pre-trained-model.js new file mode 100644 index 000000000..a229c265a --- /dev/null +++ b/src/models/pre-trained-models/gptj-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GPTJPreTrainedModel extends PreTrainedModel {} +export class GPTJModel extends GPTJPreTrainedModel {} + +export class GPTJForCausalLM extends GPTJPreTrainedModel {} diff --git a/src/models/pre-trained-models/granite-moe-hybrid-pre-trained-model.js b/src/models/pre-trained-models/granite-moe-hybrid-pre-trained-model.js new file mode 100644 index 000000000..ce1262413 --- /dev/null +++ b/src/models/pre-trained-models/granite-moe-hybrid-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GraniteMoeHybridPreTrainedModel extends PreTrainedModel {} +export class GraniteMoeHybridModel extends GraniteMoeHybridPreTrainedModel {} +export class GraniteMoeHybridForCausalLM extends GraniteMoeHybridPreTrainedModel {} diff --git a/src/models/pre-trained-models/granite-pre-trained-model.js b/src/models/pre-trained-models/granite-pre-trained-model.js new file mode 100644 index 000000000..3eaee85e4 --- /dev/null +++ b/src/models/pre-trained-models/granite-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GranitePreTrainedModel extends PreTrainedModel {} +export class GraniteModel extends GranitePreTrainedModel {} +export class GraniteForCausalLM extends GranitePreTrainedModel {} diff --git a/src/models/pre-trained-models/grounding-dino-pre-trained-model.js b/src/models/pre-trained-models/grounding-dino-pre-trained-model.js new file mode 100644 index 000000000..61f92805a --- /dev/null +++ b/src/models/pre-trained-models/grounding-dino-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GroundingDinoPreTrainedModel extends PreTrainedModel {} +export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel {} diff --git a/src/models/pre-trained-models/group-vi-t-pre-trained-model.js b/src/models/pre-trained-models/group-vi-t-pre-trained-model.js new file mode 100644 index 000000000..c6796f902 --- /dev/null +++ b/src/models/pre-trained-models/group-vi-t-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class GroupViTPreTrainedModel extends PreTrainedModel {} +export class GroupViTModel extends GroupViTPreTrainedModel {} diff --git a/src/models/pre-trained-models/helium-pre-trained-model.js b/src/models/pre-trained-models/helium-pre-trained-model.js new file mode 100644 index 000000000..51208e560 --- /dev/null +++ b/src/models/pre-trained-models/helium-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class HeliumPreTrainedModel extends PreTrainedModel {} +export class HeliumModel extends HeliumPreTrainedModel {} +export class HeliumForCausalLM extends HeliumPreTrainedModel {} diff --git a/src/models/pre-trained-models/hiera-pre-trained-model.js b/src/models/pre-trained-models/hiera-pre-trained-model.js new file mode 100644 index 000000000..98623ec56 --- /dev/null +++ 
b/src/models/pre-trained-models/hiera-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class HieraPreTrainedModel extends PreTrainedModel {} +export class HieraModel extends HieraPreTrainedModel {} +export class HieraForImageClassification extends HieraPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/hubert-pre-trained-model.js b/src/models/pre-trained-models/hubert-pre-trained-model.js new file mode 100644 index 000000000..e53dca111 --- /dev/null +++ b/src/models/pre-trained-models/hubert-pre-trained-model.js @@ -0,0 +1,62 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput } from '../output.js'; +import { Wav2Vec2PreTrainedModel } from './wav2-vec2-pre-trained-model.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class HubertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Hubert Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Load and run a `HubertModel` for feature extraction. + * + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/hubert-base-ls960'); + * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); + * const inputs = await processor(audio); + * + * // Load and run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/hubert-base-ls960'); + * const output = await model(inputs); + * // { + * // last_hidden_state: Tensor { + * // dims: [ 1, 549, 768 ], + * // type: 'float32', + * // data: Float32Array(421632) [0.0682469978928566, 0.08104046434164047, -0.4975186586380005, ...], + * // size: 421632 + * // } + * // } + * ``` + */ +export class HubertModel extends Wav2Vec2PreTrainedModel {} + +/** + * Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class HubertForCTC extends Wav2Vec2PreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like SUPERB Keyword Spotting. + */ +export class HubertForSequenceClassification extends Wav2Vec2PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
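The Hubert heads above follow the same usage pattern as Wav2Vec2: `HubertForCTC` is typically driven through the `automatic-speech-recognition` pipeline, while `HubertForSequenceClassification` backs audio classification. A minimal ASR sketch; the checkpoint id is an assumption (any ONNX-converted Hubert CTC model would work the same way):

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX-converted Hubert CTC checkpoint; substitute any compatible model id
const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/hubert-large-ls960-ft');

// Transcribe a 16kHz audio file from a URL
const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
const output = await transcriber(url);
console.log(output); // e.g. { text: '...' }
```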
+ */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/i-jepa-pre-trained-model.js b/src/models/pre-trained-models/i-jepa-pre-trained-model.js new file mode 100644 index 000000000..534988c81 --- /dev/null +++ b/src/models/pre-trained-models/i-jepa-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class IJepaPreTrainedModel extends PreTrainedModel {} +export class IJepaModel extends IJepaPreTrainedModel {} +export class IJepaForImageClassification extends IJepaPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/idefics3-pre-trained-model.js b/src/models/pre-trained-models/idefics3-pre-trained-model.js new file mode 100644 index 000000000..913c0785c --- /dev/null +++ b/src/models/pre-trained-models/idefics3-pre-trained-model.js @@ -0,0 +1,43 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { default_merge_input_ids_with_image_features } from '../utils.js'; + +export class Idefics3PreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'attention_mask', + 'pixel_values', + 'pixel_attention_mask', + 'position_ids', + 'past_key_values', + ]; +} + +/** + * The Idefics3 model which consists of a vision backbone and a language model. + */ +export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel { + async encode_image({ pixel_values, pixel_attention_mask }) { + const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, pixel_attention_mask })) + .image_features; + return features; + } + + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_id, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} + +/** + * The SmolVLM Model with a language modeling head. + * It is made up a SigLIP vision encoder, with a language modeling head on top. 
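`Idefics3ForConditionalGeneration` / `SmolVLMForConditionalGeneration` combine the vision-encoder features with the text tokens via `_merge_input_ids_with_image_features` and then generate autoregressively. A hedged usage sketch following the published SmolVLM example pattern; the checkpoint id, chat-template call, and preprocessing arguments are assumptions and may differ slightly from the final API:

```javascript
import { AutoProcessor, AutoModelForVision2Seq, RawImage } from '@huggingface/transformers';

// Assumed checkpoint with ONNX weights
const model_id = 'HuggingFaceTB/SmolVLM-Instruct';
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await AutoModelForVision2Seq.from_pretrained(model_id);

// Build a chat-style prompt containing one image placeholder
const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg');
const messages = [{
  role: 'user',
  content: [{ type: 'image' }, { type: 'text', text: 'Describe this image.' }],
}];
const text = processor.apply_chat_template(messages, { add_generation_prompt: true });

// Preprocess text and image together, then generate
const inputs = await processor(text, [image]);
const generated_ids = await model.generate({ ...inputs, max_new_tokens: 128 });

// Keep only the newly generated tokens and decode them
const new_tokens = generated_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
console.log(processor.batch_decode(new_tokens, { skip_special_tokens: true }));
```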
+ */ +export class SmolVLMForConditionalGeneration extends Idefics3ForConditionalGeneration {} \ No newline at end of file diff --git a/src/models/pre-trained-models/index.js b/src/models/pre-trained-models/index.js new file mode 100644 index 000000000..f180a3b5c --- /dev/null +++ b/src/models/pre-trained-models/index.js @@ -0,0 +1,163 @@ +export * from './albert-pre-trained-model.js'; +export * from './apertus-pre-trained-model.js'; +export * from './arcee-pre-trained-model.js'; +export * from './ast-pre-trained-model.js'; +export * from './bart-pretrained-model.js'; +export * from './beit-pre-trained-model.js'; +export * from './bert-pre-trained-model.js'; +export * from './blenderbot-pre-trained-model.js'; +export * from './blenderbot-small-pre-trained-model.js'; +export * from './bloom-pre-trained-model.js'; +export * from './camembert-pre-trained-model.js'; +export * from './chatterbox-pre-trained-model.js'; +export * from './chinese-clip-pre-trained-model.js'; +export * from './clap-pre-trained-model.js'; +export * from './clip-pre-trained-model.js'; +export * from './clip-seg-pre-trained-model.js'; +export * from './code-gen-pre-trained-model.js'; +export * from './cohere-pre-trained-model.js'; +export * from './conv-bert-pre-trained-model.js'; +export * from './conv-next-pre-trained-model.js'; +export * from './conv-next-v2-pre-trained-model.js'; +export * from './d-fine-pre-trained-model.js'; +export * from './dac-pre-trained-model.js'; +export * from './deberta-pre-trained-model.js'; +export * from './deberta-v2-pre-trained-model.js'; +export * from './decision-transformer-pre-trained-model.js'; +export * from './dei-t-pre-trained-model.js'; +export * from './depth-anything-pre-trained-model.js'; +export * from './depth-pro-pre-trained-model.js'; +export * from './detr-pre-trained-model.js'; +export * from './din-ov3-conv-next-pre-trained-model.js'; +export * from './din-ov3-vi-t-pre-trained-model.js'; +export * from './dinov2-pre-trained-model.js'; +export * from './dinov2-with-registers-pre-trained-model.js'; +export * from './distil-bert-pre-trained-model.js'; +export * from './donut-swin-pre-trained-model.js'; +export * from './dpt-pre-trained-model.js'; +export * from './efficient-net-pre-trained-model.js'; +export * from './electra-pre-trained-model.js'; +export * from './ernie4_5_pretrained-model.js'; +export * from './esm-pre-trained-model.js'; +export * from './exaone-pre-trained-model.js'; +export * from './falcon-pre-trained-model.js'; +export * from './fast-vi-t-pre-trained-model.js'; +export * from './florence2-pre-trained-model.js'; +export * from './gemma-pre-trained-model.js'; +export * from './gemma2-pre-trained-model.js'; +export * from './gemma3-pre-trained-model.js'; +export * from './gemma3n-pre-trained-model.js'; +export * from './glm-pre-trained-model.js'; +export * from './glpn-pre-trained-model.js'; +export * from './gpt-big-code-pre-trained-model.js'; +export * from './gpt-neo-pre-trained-model.js'; +export * from './gpt-neo-x-pre-trained-model.js'; +export * from './gpt-oss-pre-trained-model.js'; +export * from './gpt2-pre-trained-model.js'; +export * from './gptj-pre-trained-model.js'; +export * from './granite-moe-hybrid-pre-trained-model.js'; +export * from './granite-pre-trained-model.js'; +export * from './grounding-dino-pre-trained-model.js'; +export * from './group-vi-t-pre-trained-model.js'; +export * from './helium-pre-trained-model.js'; +export * from './hiera-pre-trained-model.js'; +export * from './hubert-pre-trained-model.js'; +export * 
from './i-jepa-pre-trained-model.js'; +export * from './idefics3-pre-trained-model.js'; +export * from './jais-pre-trained-model.js'; +export * from './jina-clip-pre-trained-model.js'; +export * from './lfm2-pre-trained-model.js'; +export * from './llama-pre-trained-model.js'; +export * from './llama4-pre-trained-model.js'; +export * from './llava-pre-trained-model.js'; +export * from './long-t5-pre-trained-model.js'; +export * from './m-bart-pre-trained-model.js'; +export * from './m2-m100-pre-trained-model.js'; +export * from './marian-pre-trained-model.js'; +export * from './mask-former-pre-trained-model.js'; +export * from './metric3-d-pre-trained-model.js'; +export * from './metric3-dv2-pre-trained-model.js'; +export * from './mgpstr-pre-trained-model.js'; +export * from './mimi-pre-trained-model.js'; +export * from './mistral-pre-trained-model.js'; +export * from './mobile-bert-pre-trained-model.js'; +export * from './mobile-llm-pre-trained-model.js'; +export * from './mobile-net-v1-pre-trained-model.js'; +export * from './mobile-net-v2-pre-trained-model.js'; +export * from './mobile-net-v3-pre-trained-model.js'; +export * from './mobile-net-v4-pre-trained-model.js'; +export * from './mobile-vi-t-pre-trained-model.js'; +export * from './mobile-vi-tv2-pre-trained-model.js'; +export * from './modern-bert-decoder-pre-trained-model.js'; +export * from './modern-bert-pre-trained-model.js'; +export * from './moonshine-pre-trained-model.js'; +export * from './mp-net-pre-trained-model.js'; +export * from './mpt-pre-trained-model.js'; +export * from './mt5-pre-trained-model.js'; +export * from './multi-modality-pre-trained-model.js'; +export * from './musicgen-pre-trained-model.js'; +export * from './nano-chat-pre-trained-model.js'; +export * from './neo-bert-pre-trained-model.js'; +export * from './nomic-bert-pre-trained-model.js'; +export * from './olmo-pre-trained-model.js'; +export * from './olmo2-pre-trained-model.js'; +export * from './olmo3-pre-trained-model.js'; +export * from './open-elm-pre-trained-model.js'; +export * from './opt-pre-trained-model.js'; +export * from './owl-vi-t-pre-trained-model.js'; +export * from './owlv2-pre-trained-model.js'; +export * from './pali-gemma-pre-trained-model.js'; +export * from './parakeet-pre-trained-model.js'; +export * from './patch-ts-mixer-pre-trained-model.js'; +export * from './patch-tst-pre-trained-model.js'; +export * from './phi-pre-trained-model.js'; +export * from './phi3-pre-trained-model.js'; +export * from './phi3-v-pre-trained-model.js'; +export * from './pvt-pre-trained-model.js'; +export * from './py-annote-pre-trained-model.js'; +export * from './qwen2-pre-trained-model.js'; +export * from './qwen2-vl-pre-trained-model.js'; +export * from './qwen3-pre-trained-model.js'; +export * from './res-net-pre-trained-model.js'; +export * from './rf-detr-pre-trained-model.js'; +export * from './ro-former-pre-trained-model.js'; +export * from './roberta-pre-trained-model.js'; +export * from './rt-detr-pre-trained-model.js'; +export * from './rt-detr-v2-pre-trained-model.js'; +export * from './sam-pre-trained-model.js'; +export * from './sam2-pre-trained-model.js'; +export * from './sapiens-pre-trained-model.js'; +export * from './segformer-pre-trained-model.js'; +export * from './siglip-pre-trained-model.js'; +export * from './smol-lm3-pre-trained-model.js'; +export * from './snac-pre-trained-model.js'; +export * from './speech-t5-pre-trained-model.js'; +export * from './squeeze-bert-pre-trained-model.js'; +export * from 
'./stable-lm-pre-trained-model.js'; +export * from './starcoder2-pre-trained-model.js'; +export * from './style-text-to-speech2-pre-trained-model.js'; +export * from './supertonic-pre-trained-model.js'; +export * from './swin-pre-trained-model.js'; +export * from './swin2-sr-pre-trained-model.js'; +export * from './t5-pre-trained-model.js'; +export * from './table-transformer-pre-trained-model.js'; +export * from './tr-ocr-pre-trained-model.js'; +export * from './ultravox-pre-trained-model.js'; +export * from './uni-speech-pre-trained-model.js'; +export * from './uni-speech-sat-pre-trained-model.js'; +export * from './vault-gemma-pre-trained-model.js'; +export * from './vi-t-pre-trained-model.js'; +export * from './vi-tmae-pre-trained-model.js'; +export * from './vi-tmsn-pre-trained-model.js'; +export * from './vision-encoder-decoder-model.js'; +export * from './vit-matte-pre-trained-model.js'; +export * from './vit-pose-pre-trained-model.js'; +export * from './vits-pre-trained-model.js'; +export * from './wav-lm-pre-trained-model.js'; +export * from './wav2-vec2-bert-pre-trained-model.js'; +export * from './wav2-vec2-pre-trained-model.js'; +export * from './we-speaker-res-net-pre-trained-model.js'; +export * from './whisper-pre-trained-model.js'; +export * from './xlm-pre-trained-model.js'; +export * from './xlm-roberta-pre-trained-model.js'; +export * from './yolos-pre-trained-model.js'; \ No newline at end of file diff --git a/src/models/pre-trained-models/jais-pre-trained-model.js b/src/models/pre-trained-models/jais-pre-trained-model.js new file mode 100644 index 000000000..cdef7e0b9 --- /dev/null +++ b/src/models/pre-trained-models/jais-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class JAISPreTrainedModel extends PreTrainedModel {} + +/** + * The bare JAIS Model transformer outputting raw hidden-states without any specific head on top. + */ +export class JAISModel extends JAISPreTrainedModel {} + +/** + * The JAIS Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). + */ +export class JAISLMHeadModel extends JAISPreTrainedModel {} diff --git a/src/models/pre-trained-models/jina-clip-pre-trained-model.js b/src/models/pre-trained-models/jina-clip-pre-trained-model.js new file mode 100644 index 000000000..6bdbfee37 --- /dev/null +++ b/src/models/pre-trained-models/jina-clip-pre-trained-model.js @@ -0,0 +1,65 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ones, full } from '../../utils/tensor.js'; + +export class JinaCLIPPreTrainedModel extends PreTrainedModel {} + +export class JinaCLIPModel extends JinaCLIPPreTrainedModel { + async forward(model_inputs) { + const missing_text_inputs = !model_inputs.input_ids; + const missing_image_inputs = !model_inputs.pixel_values; + + if (missing_text_inputs && missing_image_inputs) { + throw new Error('Either `input_ids` or `pixel_values` should be provided.'); + } + + // If either `input_ids` or `pixel_values` aren't passed, we need to create dummy input since the model requires a value to be specified. + if (missing_text_inputs) { + // NOTE: We cannot pass zero-dimension tensor as input for input_ids. + // Fortunately, the majority of time is spent in the vision encoder, so this shouldn't significantly impact performance. 
+ model_inputs.input_ids = ones([model_inputs.pixel_values.dims[0], 1]); + } + + if (missing_image_inputs) { + // NOTE: Since we create a zero-sized tensor, this does not increase computation time. + // @ts-ignore + const { image_size } = this.config.vision_config; + model_inputs.pixel_values = full([0, 3, image_size, image_size], 0.0); // (pass zero-dimension tensor) + } + + const { text_embeddings, image_embeddings, l2norm_text_embeddings, l2norm_image_embeddings } = + await super.forward(model_inputs); + + const result = {}; + if (!missing_text_inputs) { + result.text_embeddings = text_embeddings; + result.l2norm_text_embeddings = l2norm_text_embeddings; + } + if (!missing_image_inputs) { + result.image_embeddings = image_embeddings; + result.l2norm_image_embeddings = l2norm_image_embeddings; + } + return result; + } +} + +export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'vision_model', + }); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/lfm2-pre-trained-model.js b/src/models/pre-trained-models/lfm2-pre-trained-model.js new file mode 100644 index 000000000..6793cf220 --- /dev/null +++ b/src/models/pre-trained-models/lfm2-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Lfm2PreTrainedModel extends PreTrainedModel {} +export class Lfm2Model extends Lfm2PreTrainedModel {} +export class Lfm2ForCausalLM extends Lfm2PreTrainedModel {} diff --git a/src/models/pre-trained-models/llama-pre-trained-model.js b/src/models/pre-trained-models/llama-pre-trained-model.js new file mode 100644 index 000000000..c727221a8 --- /dev/null +++ b/src/models/pre-trained-models/llama-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare LLama Model outputting raw hidden-states without any specific head on top. + */ +export class LlamaPreTrainedModel extends PreTrainedModel {} + +/** + * The bare LLaMA Model outputting raw hidden-states without any specific head on top. 
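`LlamaForCausalLM` (declared just below) is most easily exercised through the `text-generation` pipeline, which accepts chat messages directly and applies the model's chat template. A sketch; the instruction-tuned checkpoint id and `dtype` setting are assumptions:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed instruction-tuned checkpoint with ONNX weights; any Llama-family model id works the same way
const generator = await pipeline('text-generation', 'onnx-community/Llama-3.2-1B-Instruct', { dtype: 'q4' });

const messages = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'user', content: 'Explain what a causal language model is in one sentence.' },
];

// Generate a reply and print the assistant's final message
const output = await generator(messages, { max_new_tokens: 64 });
console.log(output[0].generated_text.at(-1).content);
```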
+ */ +export class LlamaModel extends LlamaPreTrainedModel {} + +export class LlamaForCausalLM extends LlamaPreTrainedModel {} diff --git a/src/models/pre-trained-models/llama4-pre-trained-model.js b/src/models/pre-trained-models/llama4-pre-trained-model.js new file mode 100644 index 000000000..33c934569 --- /dev/null +++ b/src/models/pre-trained-models/llama4-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Llama4PreTrainedModel extends PreTrainedModel {} +export class Llama4ForCausalLM extends Llama4PreTrainedModel {} diff --git a/src/models/pre-trained-models/llava-pre-trained-model.js b/src/models/pre-trained-models/llava-pre-trained-model.js new file mode 100644 index 000000000..c535b4ccf --- /dev/null +++ b/src/models/pre-trained-models/llava-pre-trained-model.js @@ -0,0 +1,41 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { default_merge_input_ids_with_features, default_merge_input_ids_with_image_features } from '../utils.js'; + +export class LlavaPreTrainedModel extends PreTrainedModel { + forward_params = ['input_ids', 'attention_mask', 'pixel_values', 'position_ids', 'past_key_values']; +} + +/** + * The LLAVA model which consists of a vision backbone and a language model. + */ +export class LlavaForConditionalGeneration extends LlavaPreTrainedModel { + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_features({ + // @ts-ignore + image_token_id: this.config.image_token_index, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} + +export class LlavaOnevisionForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration + +export class Moondream1ForConditionalGeneration extends LlavaForConditionalGeneration {} // NOTE: extends LlavaForConditionalGeneration + +export class LlavaQwen2ForCausalLM extends LlavaPreTrainedModel { + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_index, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} diff --git a/src/models/pre-trained-models/long-t5-pre-trained-model.js b/src/models/pre-trained-models/long-t5-pre-trained-model.js new file mode 100644 index 000000000..3a70ef0b9 --- /dev/null +++ b/src/models/pre-trained-models/long-t5-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + */ +export class LongT5PreTrainedModel extends PreTrainedModel {} + +/** + * The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top. + */ +export class LongT5Model extends LongT5PreTrainedModel {} + +/** + * LONGT5 Model with a `language modeling` head on top. 
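`LongT5ForConditionalGeneration` is a standard encoder-decoder and can be driven through the `summarization` pipeline for long-document summarization. A sketch; the checkpoint id is an assumption:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX-converted LongT5 checkpoint fine-tuned for summarization
const summarizer = await pipeline('summarization', 'Xenova/long-t5-tglobal-base-16384-book-summary');

// LongT5's sparse attention makes very long inputs practical
const text = 'Very long document text goes here ...';
const output = await summarizer(text, { max_new_tokens: 100 });
console.log(output); // e.g. [{ summary_text: '...' }]
```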
+ */ +export class LongT5ForConditionalGeneration extends LongT5PreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/m-bart-pre-trained-model.js b/src/models/pre-trained-models/m-bart-pre-trained-model.js new file mode 100644 index 000000000..3ed4ad46d --- /dev/null +++ b/src/models/pre-trained-models/m-bart-pre-trained-model.js @@ -0,0 +1,31 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MBartPreTrainedModel extends PreTrainedModel {} + +/** + * The bare MBART Model outputting raw hidden-states without any specific head on top. + */ +export class MBartModel extends MBartPreTrainedModel {} + +/** + * The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models. + */ +export class MBartForConditionalGeneration extends MBartPreTrainedModel {} + +/** + * MBart model with a sequence classification/head on top (a linear layer on top of the pooled output). + */ +export class MBartForSequenceClassification extends MBartPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class MBartForCausalLM extends MBartPreTrainedModel {} diff --git a/src/models/pre-trained-models/m2-m100-pre-trained-model.js b/src/models/pre-trained-models/m2-m100-pre-trained-model.js new file mode 100644 index 000000000..378814433 --- /dev/null +++ b/src/models/pre-trained-models/m2-m100-pre-trained-model.js @@ -0,0 +1,7 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class M2M100PreTrainedModel extends PreTrainedModel {} + +export class M2M100Model extends M2M100PreTrainedModel {} + +export class M2M100ForConditionalGeneration extends M2M100PreTrainedModel {} diff --git a/src/models/pre-trained-models/marian-pre-trained-model.js b/src/models/pre-trained-models/marian-pre-trained-model.js new file mode 100644 index 000000000..5786cfc86 --- /dev/null +++ b/src/models/pre-trained-models/marian-pre-trained-model.js @@ -0,0 +1,7 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MarianPreTrainedModel extends PreTrainedModel {} + +export class MarianModel extends MarianPreTrainedModel {} + +export class MarianMTModel extends MarianPreTrainedModel {} diff --git a/src/models/pre-trained-models/mask-former-pre-trained-model.js b/src/models/pre-trained-models/mask-former-pre-trained-model.js new file mode 100644 index 000000000..190ea05b6 --- /dev/null +++ b/src/models/pre-trained-models/mask-former-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MaskFormerPreTrainedModel extends PreTrainedModel {} +export class MaskFormerModel extends MaskFormerPreTrainedModel {} +export class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel {} diff --git a/src/models/pre-trained-models/metric3-d-pre-trained-model.js b/src/models/pre-trained-models/metric3-d-pre-trained-model.js new file mode 100644 index 000000000..4062435f9 --- /dev/null +++ b/src/models/pre-trained-models/metric3-d-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Metric3DPreTrainedModel extends PreTrainedModel {} 
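The seq2seq translation classes above (`M2M100ForConditionalGeneration`, `MarianMTModel`) are usually driven through the `translation` pipeline. A sketch for M2M100, where the source and target languages are passed explicitly (Marian checkpoints instead encode the language pair in the model id); the checkpoint name is an assumption:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX conversion of the multilingual M2M100 (418M) checkpoint
const translator = await pipeline('translation', 'Xenova/m2m100_418M');

// M2M100 needs explicit source and target languages
const output = await translator('Hello, how are you?', {
  src_lang: 'en',
  tgt_lang: 'fr',
});
console.log(output); // e.g. [{ translation_text: 'Bonjour, comment allez-vous ?' }]
```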
+export class Metric3DForDepthEstimation extends Metric3DPreTrainedModel {} diff --git a/src/models/pre-trained-models/metric3-dv2-pre-trained-model.js b/src/models/pre-trained-models/metric3-dv2-pre-trained-model.js new file mode 100644 index 000000000..6ea0ce8a5 --- /dev/null +++ b/src/models/pre-trained-models/metric3-dv2-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Metric3Dv2PreTrainedModel extends PreTrainedModel {} +export class Metric3Dv2ForDepthEstimation extends Metric3Dv2PreTrainedModel {} diff --git a/src/models/pre-trained-models/mgpstr-pre-trained-model.js b/src/models/pre-trained-models/mgpstr-pre-trained-model.js new file mode 100644 index 000000000..d35d2b204 --- /dev/null +++ b/src/models/pre-trained-models/mgpstr-pre-trained-model.js @@ -0,0 +1,17 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MgpstrModelOutput } from '../output.js'; + +export class MgpstrPreTrainedModel extends PreTrainedModel {} + +/** + * MGP-STR Model transformer with three classification heads on top + * (three A^3 modules and three linear layer on top of the transformer encoder output) for scene text recognition (STR). + */ +export class MgpstrForSceneTextRecognition extends MgpstrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new MgpstrModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/mimi-pre-trained-model.js b/src/models/pre-trained-models/mimi-pre-trained-model.js new file mode 100644 index 000000000..c33dcd503 --- /dev/null +++ b/src/models/pre-trained-models/mimi-pre-trained-model.js @@ -0,0 +1,54 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { MimiEncoderOutput, MimiDecoderOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class MimiPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_values'; + forward_params = ['input_values']; +} + +/** + * The Mimi neural audio codec model. + */ +export class MimiModel extends MimiPreTrainedModel { + /** + * Encodes the input audio waveform into discrete codes. + * @param {Object} inputs Model inputs + * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). + * @returns {Promise} The output tensor of shape `(batch_size, num_codebooks, sequence_length)`. + */ + async encode(inputs) { + return new MimiEncoderOutput(await sessionRun(this.sessions['encoder_model'], inputs)); + } + + /** + * Decodes the given frames into an output audio waveform. + * @param {MimiEncoderOutput} inputs The encoded audio codes. + * @returns {Promise} The output tensor of shape `(batch_size, num_channels, sequence_length)`. + */ + async decode(inputs) { + return new MimiDecoderOutput(await sessionRun(this.sessions['decoder_model'], inputs)); + } +} + +export class MimiEncoderModel extends MimiPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 
'encoder_model', + }); + } +} +export class MimiDecoderModel extends MimiPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'decoder_model', + }); + } +} diff --git a/src/models/pre-trained-models/mistral-pre-trained-model.js b/src/models/pre-trained-models/mistral-pre-trained-model.js new file mode 100644 index 000000000..157137966 --- /dev/null +++ b/src/models/pre-trained-models/mistral-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Mistral Model outputting raw hidden-states without any specific head on top. + */ +export class MistralPreTrainedModel extends PreTrainedModel {} + +export class MistralModel extends MistralPreTrainedModel {} + +export class MistralForCausalLM extends MistralPreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-bert-pre-trained-model.js b/src/models/pre-trained-models/mobile-bert-pre-trained-model.js new file mode 100644 index 000000000..1a615980a --- /dev/null +++ b/src/models/pre-trained-models/mobile-bert-pre-trained-model.js @@ -0,0 +1,50 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput } from '../output.js'; + +export class MobileBertPreTrainedModel extends PreTrainedModel {} +export class MobileBertModel extends MobileBertPreTrainedModel {} + +/** + * MobileBertForMaskedLM is a class representing a MobileBERT model for masking task. + */ +export class MobileBertForMaskedLM extends MobileBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * MobileBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class MobileBertForSequenceClassification extends MobileBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * MobileBert Model with a span classification head on top for extractive question-answering tasks + */ +export class MobileBertForQuestionAnswering extends MobileBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/mobile-llm-pre-trained-model.js b/src/models/pre-trained-models/mobile-llm-pre-trained-model.js new file mode 100644 index 000000000..e1f20ee19 --- /dev/null +++ b/src/models/pre-trained-models/mobile-llm-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MobileLLMPreTrainedModel extends PreTrainedModel {} +export class MobileLLMModel extends MobileLLMPreTrainedModel {} +export class MobileLLMForCausalLM extends MobileLLMPreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v1-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v1-pre-trained-model.js new file mode 100644 index 000000000..c3f467dc4 --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v1-pre-trained-model.js @@ -0,0 +1,24 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV1PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV1 model outputting raw hidden-states without any specific head on top. + */ +export class MobileNetV1Model extends MobileNetV1PreTrainedModel {} + +/** + * MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV1ForImageClassification extends MobileNetV1PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class MobileNetV1ForSemanticSegmentation extends MobileNetV1PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v2-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v2-pre-trained-model.js new file mode 100644 index 000000000..9801a6c83 --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v2-pre-trained-model.js @@ -0,0 +1,23 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV2 model outputting raw hidden-states without any specific head on top. + */ +export class MobileNetV2Model extends MobileNetV2PreTrainedModel {} + +/** + * MobileNetV2 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV2ForImageClassification extends MobileNetV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class MobileNetV2ForSemanticSegmentation extends MobileNetV2PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v3-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v3-pre-trained-model.js new file mode 100644 index 000000000..c5564a4ac --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v3-pre-trained-model.js @@ -0,0 +1,23 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV3 model outputting raw hidden-states without any specific head on top. 
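The MobileNet (and MobileViT) classification heads in this group all return `SequenceClassifierOutput` logits and are typically used through the `image-classification` pipeline. An illustrative sketch; the checkpoint id and the printed labels are assumptions:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX-converted MobileNet image-classification checkpoint
const classifier = await pipeline('image-classification', 'Xenova/mobilenet_v2_1.0_224');

// Classify an image by URL
const url = 'http://images.cocodataset.org/val2017/000000039769.jpg';
const output = await classifier(url);
console.log(output); // e.g. [{ label: 'tabby, tabby cat', score: 0.3 }, ...]
```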
+ */ +export class MobileNetV3Model extends MobileNetV3PreTrainedModel {} + +/** + * MobileNetV3 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV3ForImageClassification extends MobileNetV3PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class MobileNetV3ForSemanticSegmentation extends MobileNetV3PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-net-v4-pre-trained-model.js b/src/models/pre-trained-models/mobile-net-v4-pre-trained-model.js new file mode 100644 index 000000000..fbb6584d4 --- /dev/null +++ b/src/models/pre-trained-models/mobile-net-v4-pre-trained-model.js @@ -0,0 +1,23 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileNetV4PreTrainedModel extends PreTrainedModel {} + +/** + * The bare MobileNetV4 model outputting raw hidden-states without any specific head on top. + */ +export class MobileNetV4Model extends MobileNetV4PreTrainedModel {} + +/** + * MobileNetV4 model with an image classification head on top (a linear layer on top of the pooled features), + * e.g. for ImageNet. + */ +export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class MobileNetV4ForSemanticSegmentation extends MobileNetV4PreTrainedModel {} diff --git a/src/models/pre-trained-models/mobile-vi-t-pre-trained-model.js b/src/models/pre-trained-models/mobile-vi-t-pre-trained-model.js new file mode 100644 index 000000000..20fc9465b --- /dev/null +++ b/src/models/pre-trained-models/mobile-vi-t-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileViTPreTrainedModel extends PreTrainedModel {} +export class MobileViTModel extends MobileViTPreTrainedModel {} +export class MobileViTForImageClassification extends MobileViTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +// TODO: MobileViTForSemanticSegmentation diff --git a/src/models/pre-trained-models/mobile-vi-tv2-pre-trained-model.js b/src/models/pre-trained-models/mobile-vi-tv2-pre-trained-model.js new file mode 100644 index 000000000..a13e3334c --- /dev/null +++ b/src/models/pre-trained-models/mobile-vi-tv2-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class MobileViTV2PreTrainedModel extends PreTrainedModel {} +export class MobileViTV2Model extends MobileViTV2PreTrainedModel {} +export class MobileViTV2ForImageClassification extends MobileViTV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +// TODO: MobileViTV2ForSemanticSegmentation diff --git a/src/models/pre-trained-models/modern-bert-decoder-pre-trained-model.js b/src/models/pre-trained-models/modern-bert-decoder-pre-trained-model.js new file mode 100644 index 000000000..117025583 --- /dev/null +++ 
b/src/models/pre-trained-models/modern-bert-decoder-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ModernBertDecoderPreTrainedModel extends PreTrainedModel {} +export class ModernBertDecoderModel extends ModernBertDecoderPreTrainedModel {} +export class ModernBertDecoderForCausalLM extends ModernBertDecoderPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/modern-bert-pre-trained-model.js b/src/models/pre-trained-models/modern-bert-pre-trained-model.js new file mode 100644 index 000000000..3387eeffb --- /dev/null +++ b/src/models/pre-trained-models/modern-bert-pre-trained-model.js @@ -0,0 +1,41 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, SequenceClassifierOutput, TokenClassifierOutput } from '../output.js'; + +export class ModernBertPreTrainedModel extends PreTrainedModel {} +export class ModernBertModel extends ModernBertPreTrainedModel {} + +export class ModernBertForMaskedLM extends ModernBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +export class ModernBertForSequenceClassification extends ModernBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class ModernBertForTokenClassification extends ModernBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/moonshine-pre-trained-model.js b/src/models/pre-trained-models/moonshine-pre-trained-model.js new file mode 100644 index 000000000..bad6b4b6d --- /dev/null +++ b/src/models/pre-trained-models/moonshine-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MoonshinePreTrainedModel extends PreTrainedModel { + requires_attention_mask = false; + main_input_name = 'input_values'; + forward_params = ['input_values', 'decoder_input_ids', 'past_key_values']; +} + +/** + * MoonshineModel class for training Moonshine models without a language model head. 
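`MoonshineForConditionalGeneration` is an encoder-decoder speech-to-text model and plugs into the same `automatic-speech-recognition` pipeline as Whisper. A sketch, assuming an ONNX Moonshine checkpoint (the model id is illustrative):

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX Moonshine checkpoint; Moonshine is a lightweight encoder-decoder ASR model
const transcriber = await pipeline('automatic-speech-recognition', 'onnx-community/moonshine-tiny-ONNX');

// Transcribe an audio file from a URL
const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
const { text } = await transcriber(url);
console.log(text);
```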
+ */ +export class MoonshineModel extends MoonshinePreTrainedModel {} + +export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel {} diff --git a/src/models/pre-trained-models/mp-net-pre-trained-model.js b/src/models/pre-trained-models/mp-net-pre-trained-model.js new file mode 100644 index 000000000..278f3b922 --- /dev/null +++ b/src/models/pre-trained-models/mp-net-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class MPNetPreTrainedModel extends PreTrainedModel {} + +/** + * The bare MPNet Model transformer outputting raw hidden-states without any specific head on top. + */ +export class MPNetModel extends MPNetPreTrainedModel {} + +/** + * MPNetForMaskedLM is a class representing a MPNet model for masked language modeling. + */ +export class MPNetForMaskedLM extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * MPNetForSequenceClassification is a class representing a MPNet model for sequence classification. + */ +export class MPNetForSequenceClassification extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * MPNetForTokenClassification is a class representing a MPNet model for token classification. + */ +export class MPNetForTokenClassification extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * MPNetForQuestionAnswering is a class representing a MPNet model for question answering. + */ +export class MPNetForQuestionAnswering extends MPNetPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/mpt-pre-trained-model.js b/src/models/pre-trained-models/mpt-pre-trained-model.js new file mode 100644 index 000000000..0e1d5504c --- /dev/null +++ b/src/models/pre-trained-models/mpt-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MptPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Mpt Model transformer outputting raw hidden-states without any specific head on top. + */ +export class MptModel extends MptPreTrainedModel {} + +/** + * The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). 
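The `MPNetModel` defined earlier in this diff is most commonly used for sentence embeddings via the `feature-extraction` pipeline with mean pooling and normalization. A sketch, assuming an ONNX conversion of a sentence-transformers MPNet checkpoint:

```javascript
import { pipeline } from '@huggingface/transformers';

// Assumed ONNX conversion of a sentence-transformers MPNet model
const extractor = await pipeline('feature-extraction', 'Xenova/all-mpnet-base-v2');

// Mean-pool and L2-normalize to get one embedding per sentence
const sentences = ['That is a happy person', 'Today is a sunny day'];
const embeddings = await extractor(sentences, { pooling: 'mean', normalize: true });
console.log(embeddings.dims); // e.g. [2, 768]
```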
+ */ +export class MptForCausalLM extends MptPreTrainedModel {} diff --git a/src/models/pre-trained-models/mt5-pre-trained-model.js b/src/models/pre-trained-models/mt5-pre-trained-model.js new file mode 100644 index 000000000..ea3b9cea8 --- /dev/null +++ b/src/models/pre-trained-models/mt5-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class MT5PreTrainedModel extends PreTrainedModel {} + +export class MT5Model extends MT5PreTrainedModel {} + +/** + * A class representing a conditional sequence-to-sequence model based on the MT5 architecture. + */ +export class MT5ForConditionalGeneration extends MT5PreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/multi-modality-pre-trained-model.js b/src/models/pre-trained-models/multi-modality-pre-trained-model.js new file mode 100644 index 000000000..c5685b8d1 --- /dev/null +++ b/src/models/pre-trained-models/multi-modality-pre-trained-model.js @@ -0,0 +1,113 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { pick } from '../../utils/core.js'; +import { decoderForward } from '../utils.js'; +import { RawImage } from '../../utils/image.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class MultiModalityPreTrainedModel extends PreTrainedModel {} +export class MultiModalityCausalLM extends MultiModalityPreTrainedModel { + forward_params = [ + // prepare_inputs_embeds + 'input_ids', + 'pixel_values', + 'images_seq_mask', + 'images_emb_mask', + + // language_model + 'attention_mask', + 'position_ids', + 'past_key_values', + ]; + + /** + * @param {ConstructorParameters} args + */ + constructor(...args) { + super(...args); + + // State-based approach to switch out which heads to use during generation + this._generation_mode = 'text'; + } + + async forward(model_inputs) { + const mode = this._generation_mode ?? 'text'; + + // TODO support re-using PKVs for input_ids.dims[1] !== 1 + // if (model_inputs.past_key_values) { + // // && model_inputs.input_ids.dims[1] === 1 + // } + + let output_1; + if (mode === 'text' || !model_inputs.past_key_values) { + const session = this.sessions['prepare_inputs_embeds']; + const prep_inputs = pick(model_inputs, session.inputNames); + output_1 = await sessionRun(session, prep_inputs); + } else { + const session = this.sessions['gen_img_embeds']; + const prep_inputs = pick( + { + image_ids: model_inputs.input_ids, + }, + session.inputNames, + ); + output_1 = await sessionRun(session, prep_inputs); + } + + const input_2 = { ...model_inputs, ...output_1 }; + const output_2 = await decoderForward(this, input_2); + + const head = this.sessions[mode === 'text' ? 'lm_head' : 'gen_head']; + if (!head) { + throw new Error(`Unable to find "${head}" generation head`); + } + + const output_3 = await sessionRun(head, pick(output_2, head.inputNames)); + + return { + ...output_1, + ...output_2, + ...output_3, + }; + } + + /** + * @param {import('../../generation/parameters.js').GenerationFunctionParameters} options + */ + async generate(options) { + this._generation_mode = 'text'; + return super.generate(options); + } + + /** + * @param {import('../../generation/parameters.js').GenerationFunctionParameters} options + */ + async generate_images(options) { + this._generation_mode = 'image'; + + const start_num_tokens = (options.inputs ?? 
options[this.main_input_name]).dims[1]; + const all_tokens = await super.generate(options); + + const generated_tokens = /** @type {Tensor} */ (all_tokens).slice(null, [start_num_tokens, null]); + + const image_decode = this.sessions['image_decode']; + const { decoded_image } = await sessionRun(image_decode, { + generated_tokens, + }); + + // Equivalent to `np.clip((dec + 1) / 2 * 255, 0, 255)` + const clamped = decoded_image + .add_(1) + .mul_(255 / 2) + .clamp_(0, 255) + .to('uint8'); + + // Return as a list of images + const images = []; + for (const tensor of clamped) { + const img = RawImage.fromTensor(tensor); + images.push(img); + } + return images; + } +} diff --git a/src/models/pre-trained-models/musicgen-pre-trained-model.js b/src/models/pre-trained-models/musicgen-pre-trained-model.js new file mode 100644 index 000000000..d00a30c00 --- /dev/null +++ b/src/models/pre-trained-models/musicgen-pre-trained-model.js @@ -0,0 +1,139 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { Tensor } from '../../utils/tensor.js'; +import { ModelOutput } from '../output.js'; + +export class MusicgenPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Musicgen decoder model outputting raw hidden-states without any specific head on top. + */ +export class MusicgenModel extends MusicgenPreTrainedModel {} + +/** + * The MusicGen decoder model with a language modelling head on top. + */ +export class MusicgenForCausalLM extends MusicgenPreTrainedModel {} + +/** + * The composite MusicGen model with a text encoder, audio encoder and Musicgen decoder, + * for music generation tasks with one or both of text and audio prompts. + * + * **Example:** Generate music from text with `Xenova/musicgen-small`. + * ```javascript + * import { AutoTokenizer, MusicgenForConditionalGeneration } from '@huggingface/transformers'; + * + * // Load tokenizer and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/musicgen-small'); + * const model = await MusicgenForConditionalGeneration.from_pretrained( + * 'Xenova/musicgen-small', { dtype: 'fp32' } + * ); + * + * // Prepare text input + * const prompt = '80s pop track with bassy drums and synth'; + * const inputs = tokenizer(prompt); + * + * // Generate audio + * const audio_values = await model.generate({ + * ...inputs, + * max_new_tokens: 512, + * do_sample: true, + * guidance_scale: 3, + * }); + * + * // (Optional) Write the output to a WAV file + * import wavefile from 'wavefile'; + * import fs from 'fs'; + * + * const wav = new wavefile.WaveFile(); + * wav.fromScratch(1, model.config.audio_encoder.sampling_rate, '32f', audio_values.data); + * fs.writeFileSync('musicgen_out.wav', wav.toBuffer()); + * ``` + */ +export class MusicgenForConditionalGeneration extends PreTrainedModel { + // NOTE: not MusicgenPreTrainedModel + forward_params = [ + 'input_ids', + 'attention_mask', + 'encoder_outputs', + 'decoder_input_ids', + 'decoder_attention_mask', + 'past_key_values', + ]; + + /** + * Apply the pattern mask to the final ids, + * then revert the pattern delay mask by filtering the pad token id in a single step. + * @param {Tensor} outputs The output tensor from the model. + * @returns {Tensor} The filtered output tensor. 
+ */ + _apply_and_filter_by_delay_pattern_mask(outputs) { + const [bs_x_codebooks, seqLength] = outputs.dims; + // @ts-expect-error TS2339 + const num_codebooks = this.config.decoder.num_codebooks; + const upperBound = seqLength - num_codebooks; + + let newDataSize = 0; + for (let i = 0; i < outputs.size; ++i) { + // @ts-expect-error TS2339 + if (outputs.data[i] === this.config.decoder.pad_token_id) { + continue; + } + + const row = i % seqLength; + const col = Math.floor(i / seqLength) % num_codebooks; + + const diff = row - col; + if (diff > 0 && diff <= upperBound) { + outputs.data[newDataSize++] = outputs.data[i]; + } + } + + const batch_size = Math.floor(bs_x_codebooks / num_codebooks); + const inferred = newDataSize / (batch_size * num_codebooks); + // TODO: assert `inferred` is an integer + return new Tensor(outputs.type, outputs.data.slice(0, newDataSize), [batch_size, num_codebooks, inferred]); + } + + prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { + // apply the delay pattern mask + let clonedInputIds = structuredClone(input_ids); + for (let i = 0; i < clonedInputIds.length; ++i) { + for (let j = 0; j < clonedInputIds[i].length; ++j) { + // @ts-expect-error TS2339 + if (i % this.config.decoder.num_codebooks >= j) { + // @ts-expect-error TS2339 + clonedInputIds[i][j] = BigInt(this.config.decoder.pad_token_id); + } + } + } + // for classifier free guidance we need to replicate the decoder args across the batch dim + // (we'll split these before sampling) + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + // [batch, seqLength] -> [2 * batch, seqLength] + clonedInputIds = clonedInputIds.concat(clonedInputIds); + } + + const prepped = super.prepare_inputs_for_generation(clonedInputIds, model_inputs, generation_config); + return prepped; + } + + /** + * Generates sequences of token ids for models with a language modeling head. + * @param {import('../../generation/parameters.js').GenerationFunctionParameters} options + * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. 
+ */ + async generate(options) { + const output_ids = await super.generate(options); + + // apply the pattern mask to the final ids + // tensor: int64[1,batch_size,4,chunk_length] + const audio_codes = this._apply_and_filter_by_delay_pattern_mask(/** @type {Tensor} */ (output_ids)).unsqueeze_( + 0, + ); // append the frame dimension back to the audio codes + + const { audio_values } = await sessionRun(this.sessions['encodec_decode'], { audio_codes }); + + return audio_values; + } +} diff --git a/src/models/pre-trained-models/nano-chat-pre-trained-model.js b/src/models/pre-trained-models/nano-chat-pre-trained-model.js new file mode 100644 index 000000000..11935915e --- /dev/null +++ b/src/models/pre-trained-models/nano-chat-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class NanoChatPreTrainedModel extends PreTrainedModel {} +export class NanoChatModel extends NanoChatPreTrainedModel {} +export class NanoChatForCausalLM extends NanoChatPreTrainedModel {} diff --git a/src/models/pre-trained-models/neo-bert-pre-trained-model.js b/src/models/pre-trained-models/neo-bert-pre-trained-model.js new file mode 100644 index 000000000..5c352ff86 --- /dev/null +++ b/src/models/pre-trained-models/neo-bert-pre-trained-model.js @@ -0,0 +1,58 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + QuestionAnsweringModelOutput, +} from '../output.js'; + +export class NeoBertPreTrainedModel extends PreTrainedModel {} +export class NeoBertModel extends NeoBertPreTrainedModel {} + +export class NeoBertForMaskedLM extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +export class NeoBertForSequenceClassification extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +export class NeoBertForTokenClassification extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +export class NeoBertForQuestionAnswering extends NeoBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. 
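+ *
+ * **Example:** Extractive question answering (a minimal sketch; the checkpoint id below is a
+ * hypothetical placeholder for a converted NeoBERT question-answering model).
+ * ```javascript
+ * import { AutoTokenizer, NeoBertForQuestionAnswering } from '@huggingface/transformers';
+ *
+ * // NOTE: hypothetical checkpoint id, shown for illustration only
+ * const model_id = 'onnx-community/NeoBERT-squad';
+ * const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+ * const model = await NeoBertForQuestionAnswering.from_pretrained(model_id);
+ *
+ * // Encode the question together with its context
+ * const inputs = tokenizer('Who wrote the play?', { text_pair: 'The play was written by Shakespeare.' });
+ *
+ * // The output contains start/end logits over the input tokens
+ * const { start_logits, end_logits } = await model(inputs);
+ * ```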
+ */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/nomic-bert-pre-trained-model.js b/src/models/pre-trained-models/nomic-bert-pre-trained-model.js new file mode 100644 index 000000000..fc6b8b17c --- /dev/null +++ b/src/models/pre-trained-models/nomic-bert-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class NomicBertPreTrainedModel extends PreTrainedModel {} +export class NomicBertModel extends NomicBertPreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/olmo-pre-trained-model.js b/src/models/pre-trained-models/olmo-pre-trained-model.js new file mode 100644 index 000000000..15c656d77 --- /dev/null +++ b/src/models/pre-trained-models/olmo-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OlmoPreTrainedModel extends PreTrainedModel {} +export class OlmoModel extends OlmoPreTrainedModel {} +export class OlmoForCausalLM extends OlmoPreTrainedModel {} diff --git a/src/models/pre-trained-models/olmo2-pre-trained-model.js b/src/models/pre-trained-models/olmo2-pre-trained-model.js new file mode 100644 index 000000000..53a88c7b9 --- /dev/null +++ b/src/models/pre-trained-models/olmo2-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Olmo2PreTrainedModel extends PreTrainedModel {} +export class Olmo2Model extends Olmo2PreTrainedModel {} +export class Olmo2ForCausalLM extends Olmo2PreTrainedModel {} diff --git a/src/models/pre-trained-models/olmo3-pre-trained-model.js b/src/models/pre-trained-models/olmo3-pre-trained-model.js new file mode 100644 index 000000000..77c98cb35 --- /dev/null +++ b/src/models/pre-trained-models/olmo3-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Olmo3PreTrainedModel extends PreTrainedModel {} +export class Olmo3Model extends Olmo3PreTrainedModel {} +export class Olmo3ForCausalLM extends Olmo3PreTrainedModel {} diff --git a/src/models/pre-trained-models/open-elm-pre-trained-model.js b/src/models/pre-trained-models/open-elm-pre-trained-model.js new file mode 100644 index 000000000..9c76a0342 --- /dev/null +++ b/src/models/pre-trained-models/open-elm-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OpenELMPreTrainedModel extends PreTrainedModel {} +export class OpenELMModel extends OpenELMPreTrainedModel {} + +export class OpenELMForCausalLM extends OpenELMPreTrainedModel {} diff --git a/src/models/pre-trained-models/opt-pre-trained-model.js b/src/models/pre-trained-models/opt-pre-trained-model.js new file mode 100644 index 000000000..a529a51bd --- /dev/null +++ b/src/models/pre-trained-models/opt-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OPTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare OPT Model outputting raw hidden-states without any specific head on top. + */ +export class OPTModel extends OPTPreTrainedModel {} + +/** + * The OPT Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). 
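+ *
+ * **Example:** Text generation with `OPTForCausalLM` (a minimal sketch; `Xenova/opt-125m` is
+ * assumed to be an available ONNX export; substitute any converted OPT checkpoint).
+ * ```javascript
+ * import { AutoTokenizer, OPTForCausalLM } from '@huggingface/transformers';
+ *
+ * const model_id = 'Xenova/opt-125m'; // assumed checkpoint id
+ * const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+ * const model = await OPTForCausalLM.from_pretrained(model_id);
+ *
+ * // Tokenize a prompt and generate a continuation
+ * const inputs = tokenizer('A long time ago,');
+ * const output_ids = await model.generate({ ...inputs, max_new_tokens: 20 });
+ * console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true }));
+ * ```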
+ */ +export class OPTForCausalLM extends OPTPreTrainedModel {} diff --git a/src/models/pre-trained-models/owl-vi-t-pre-trained-model.js b/src/models/pre-trained-models/owl-vi-t-pre-trained-model.js new file mode 100644 index 000000000..a74da309d --- /dev/null +++ b/src/models/pre-trained-models/owl-vi-t-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class OwlViTPreTrainedModel extends PreTrainedModel {} +export class OwlViTModel extends OwlViTPreTrainedModel {} +export class OwlViTForObjectDetection extends OwlViTPreTrainedModel {} diff --git a/src/models/pre-trained-models/owlv2-pre-trained-model.js b/src/models/pre-trained-models/owlv2-pre-trained-model.js new file mode 100644 index 000000000..6e720ef74 --- /dev/null +++ b/src/models/pre-trained-models/owlv2-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Owlv2PreTrainedModel extends PreTrainedModel {} +export class Owlv2Model extends Owlv2PreTrainedModel {} +export class Owlv2ForObjectDetection extends Owlv2PreTrainedModel {} diff --git a/src/models/pre-trained-models/pali-gemma-pre-trained-model.js b/src/models/pre-trained-models/pali-gemma-pre-trained-model.js new file mode 100644 index 000000000..7745b08a0 --- /dev/null +++ b/src/models/pre-trained-models/pali-gemma-pre-trained-model.js @@ -0,0 +1,27 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { default_merge_input_ids_with_image_features } from '../utils.js'; + +export class PaliGemmaPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + // 'inputs_embeds', + 'attention_mask', + 'pixel_values', + 'position_ids', + 'past_key_values', + ]; +} + +export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel { + _merge_input_ids_with_image_features(kwargs) { + const vision_hidden_size = kwargs.image_features.dims.at(-1); + const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size); + + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_index, + ...kwargs, + image_features: reshaped_image_hidden_states, + }); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/parakeet-pre-trained-model.js b/src/models/pre-trained-models/parakeet-pre-trained-model.js new file mode 100644 index 000000000..99245700a --- /dev/null +++ b/src/models/pre-trained-models/parakeet-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class ParakeetPreTrainedModel extends PreTrainedModel {} +export class ParakeetForCTC extends ParakeetPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. 
Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/patch-ts-mixer-pre-trained-model.js b/src/models/pre-trained-models/patch-ts-mixer-pre-trained-model.js new file mode 100644 index 000000000..c04dfd3ca --- /dev/null +++ b/src/models/pre-trained-models/patch-ts-mixer-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class PatchTSMixerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare PatchTSMixer Model outputting raw hidden-states without any specific head. + */ +export class PatchTSMixerModel extends PatchTSMixerPreTrainedModel {} + +/** + * The PatchTSMixer for prediction model. + */ +export class PatchTSMixerForPrediction extends PatchTSMixerPreTrainedModel {} diff --git a/src/models/pre-trained-models/patch-tst-pre-trained-model.js b/src/models/pre-trained-models/patch-tst-pre-trained-model.js new file mode 100644 index 000000000..ba24a1b9f --- /dev/null +++ b/src/models/pre-trained-models/patch-tst-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class PatchTSTPreTrainedModel extends PreTrainedModel {} + +/** + * The bare PatchTST Model outputting raw hidden-states without any specific head. + */ +export class PatchTSTModel extends PatchTSTPreTrainedModel {} + +/** + * The PatchTST for prediction model. + */ +export class PatchTSTForPrediction extends PatchTSTPreTrainedModel {} diff --git a/src/models/pre-trained-models/phi-pre-trained-model.js b/src/models/pre-trained-models/phi-pre-trained-model.js new file mode 100644 index 000000000..979e4c8ef --- /dev/null +++ b/src/models/pre-trained-models/phi-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class PhiPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Phi Model outputting raw hidden-states without any specific head on top. + */ +export class PhiModel extends PhiPreTrainedModel {} + +export class PhiForCausalLM extends PhiPreTrainedModel {} diff --git a/src/models/pre-trained-models/phi3-pre-trained-model.js b/src/models/pre-trained-models/phi3-pre-trained-model.js new file mode 100644 index 000000000..5bc47915c --- /dev/null +++ b/src/models/pre-trained-models/phi3-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Phi3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Phi3 Model outputting raw hidden-states without any specific head on top. 
+ */ +export class Phi3Model extends Phi3PreTrainedModel {} + +export class Phi3ForCausalLM extends Phi3PreTrainedModel {} diff --git a/src/models/pre-trained-models/phi3-v-pre-trained-model.js b/src/models/pre-trained-models/phi3-v-pre-trained-model.js new file mode 100644 index 000000000..9c5331a42 --- /dev/null +++ b/src/models/pre-trained-models/phi3-v-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { decoderForward } from '../utils.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Phi3VPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'position_ids', + 'pixel_values', + 'image_sizes', + 'past_key_values', + ]; +} +export class Phi3VForCausalLM extends Phi3VPreTrainedModel { + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + pixel_values = null, + image_sizes = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // TODO: needed? + ...kwargs + }) { + if (!inputs_embeds) { + let image_features; + if (pixel_values && input_ids.dims[1] !== 1) { + if (!image_sizes) { + throw new Error('`image_sizes` must be provided when `pixel_values` is provided.'); + } + + // Encode the image + ({ image_features } = await sessionRun(this.sessions['vision_encoder'], { + pixel_values, + image_sizes, + })); + } else { + const hidden_size = this.config.normalized_config.hidden_size; + image_features = new Tensor('float32', [], [0, hidden_size]); + } + + ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], { + input_ids, + image_features, + })); + } + + const outputs = await decoderForward( + this, + { + inputs_embeds, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, + false, + ); + return outputs; + } +} diff --git a/src/models/pre-trained-models/pvt-pre-trained-model.js b/src/models/pre-trained-models/pvt-pre-trained-model.js new file mode 100644 index 000000000..70ea887fb --- /dev/null +++ b/src/models/pre-trained-models/pvt-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class PvtPreTrainedModel extends PreTrainedModel {} +export class PvtModel extends PvtPreTrainedModel {} +export class PvtForImageClassification extends PvtPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/py-annote-pre-trained-model.js b/src/models/pre-trained-models/py-annote-pre-trained-model.js new file mode 100644 index 000000000..f7e02c169 --- /dev/null +++ b/src/models/pre-trained-models/py-annote-pre-trained-model.js @@ -0,0 +1,77 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { TokenClassifierOutput } from '../output.js'; + +export class PyAnnotePreTrainedModel extends PreTrainedModel {} + +/** + * The bare PyAnnote Model transformer outputting raw hidden-states without any specific head on top. + */ +export class PyAnnoteModel extends PyAnnotePreTrainedModel {} + +/** + * PyAnnote Model with a frame classification head on top for tasks like Speaker Diarization. 
+ * + * **Example:** Load and run a `PyAnnoteForAudioFrameClassification` for speaker diarization. + * + * ```javascript + * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; + * + * // Load model and processor + * const model_id = 'onnx-community/pyannote-segmentation-3.0'; + * const model = await AutoModelForAudioFrameClassification.from_pretrained(model_id); + * const processor = await AutoProcessor.from_pretrained(model_id); + * + * // Read and preprocess audio + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/mlk.wav'; + * const audio = await read_audio(url, processor.feature_extractor.config.sampling_rate); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const { logits } = await model(inputs); + * // { + * // logits: Tensor { + * // dims: [ 1, 767, 7 ], // [batch_size, num_frames, num_classes] + * // type: 'float32', + * // data: Float32Array(5369) [ ... ], + * // size: 5369 + * // } + * // } + * + * const result = processor.post_process_speaker_diarization(logits, audio.length); + * // [ + * // [ + * // { id: 0, start: 0, end: 1.0512535626298245, confidence: 0.8220156481664611 }, + * // { id: 2, start: 1.0512535626298245, end: 2.3398869619825127, confidence: 0.9008811707860472 }, + * // ... + * // ] + * // ] + * + * // Display result + * console.table(result[0], ['start', 'end', 'id', 'confidence']); + * // ┌─────────┬────────────────────┬────────────────────┬────┬─────────────────────┐ + * // │ (index) │ start │ end │ id │ confidence │ + * // ├─────────┼────────────────────┼────────────────────┼────┼─────────────────────┤ + * // │ 0 │ 0 │ 1.0512535626298245 │ 0 │ 0.8220156481664611 │ + * // │ 1 │ 1.0512535626298245 │ 2.3398869619825127 │ 2 │ 0.9008811707860472 │ + * // │ 2 │ 2.3398869619825127 │ 3.5946089560890773 │ 0 │ 0.7521651315796233 │ + * // │ 3 │ 3.5946089560890773 │ 4.578039708226655 │ 2 │ 0.8491978128022479 │ + * // │ 4 │ 4.578039708226655 │ 4.594995410849717 │ 0 │ 0.2935352600416393 │ + * // │ 5 │ 4.594995410849717 │ 6.121008646925269 │ 3 │ 0.6788051309866024 │ + * // │ 6 │ 6.121008646925269 │ 6.256654267909762 │ 0 │ 0.37125512393851134 │ + * // │ 7 │ 6.256654267909762 │ 8.630452635138397 │ 2 │ 0.7467035186353542 │ + * // │ 8 │ 8.630452635138397 │ 10.088643060721703 │ 0 │ 0.7689364814666032 │ + * // │ 9 │ 10.088643060721703 │ 12.58113134631177 │ 2 │ 0.9123324509131324 │ + * // │ 10 │ 12.58113134631177 │ 13.005023911888312 │ 0 │ 0.4828358177572041 │ + * // └─────────┴────────────────────┴────────────────────┴────┴─────────────────────┘ + * ``` + */ +export class PyAnnoteForAudioFrameClassification extends PyAnnotePreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/qwen2-pre-trained-model.js b/src/models/pre-trained-models/qwen2-pre-trained-model.js new file mode 100644 index 000000000..23cd85995 --- /dev/null +++ b/src/models/pre-trained-models/qwen2-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Qwen2 Model outputting raw hidden-states without any specific head on top. 
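+ *
+ * **Example:** Text generation with the Qwen2 family via the `pipeline` helper (a minimal
+ * sketch; the checkpoint id below is assumed to be an available ONNX export and may need to be
+ * swapped for one you have converted).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // NOTE: assumed checkpoint id, shown for illustration only
+ * const generator = await pipeline('text-generation', 'onnx-community/Qwen2.5-0.5B-Instruct');
+ * const output = await generator('Write a haiku about the sea.', { max_new_tokens: 64 });
+ * console.log(output[0].generated_text);
+ * ```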
+ */ +export class Qwen2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Qwen2 Model outputting raw hidden-states without any specific head on top. + */ +export class Qwen2Model extends Qwen2PreTrainedModel {} + +export class Qwen2ForCausalLM extends Qwen2PreTrainedModel {} diff --git a/src/models/pre-trained-models/qwen2-vl-pre-trained-model.js b/src/models/pre-trained-models/qwen2-vl-pre-trained-model.js new file mode 100644 index 000000000..92b5d256a --- /dev/null +++ b/src/models/pre-trained-models/qwen2-vl-pre-trained-model.js @@ -0,0 +1,251 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { stack, Tensor, ones_like, zeros } from '../../utils/tensor.js'; +import { max } from '../../utils/maths.js'; +import { cumsum_masked_fill, default_merge_input_ids_with_image_features } from '../utils.js'; + +export class Qwen2VLPreTrainedModel extends PreTrainedModel { + forward_params = [ + // Text inputs + 'input_ids', + 'attention_mask', + 'position_ids', + 'past_key_values', + + // Vision inputs + 'pixel_values', + 'image_grid_thw', + ]; +} +export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel { + /** + * Calculate the 3D rope index based on image and video's temporal, height and width in LLM. + * + * Explanation: + * Each embedding sequence contains vision embedding and text embedding or just contains text embedding. + * + * For a pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. + * Examples: + * input_ids: [T T T T T], here T is for text. + * temporal position_ids: [0, 1, 2, 3, 4] + * height position_ids: [0, 1, 2, 3, 4] + * width position_ids: [0, 1, 2, 3, 4] + * + * For a vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part + * and 1D rotary position embedding for the text part. + * Examples: + * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches. + * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision. + * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + * text temporal position_ids: [3, 4, 5, 6, 7] + * text height position_ids: [3, 4, 5, 6, 7] + * text width position_ids: [3, 4, 5, 6, 7] + * Here we calculate the text start position_ids as the max vision position_ids plus 1. + * + * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`. + * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`. + * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`. + * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`: + * - 1 for tokens that are **not masked**, + * - 0 for tokens that are **masked**. + * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with: + * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`. + * - mrope_position_deltas: Tensor of shape `(batch_size)`.
+ */ + get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) { + // @ts-ignore + const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config; + const spatial_merge_size = vision_config.spatial_merge_size ?? 2; + + const mrope_position_deltas = []; + if (image_grid_thw || video_grid_thw) { + let total_input_ids = input_ids.tolist(); + if (!attention_mask) { + attention_mask = ones_like(input_ids); + } + + const attention_mask_list = attention_mask.tolist(); + const position_ids_list = Array.from({ length: 3 }, (_) => + Array.from({ length: input_ids.dims[0] }, (_) => Array.from({ length: input_ids.dims[1] }, (_) => 1)), + ); + + const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : []; + const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : []; + + let image_index = 0; + let video_index = 0; + for (let i = 0; i < total_input_ids.length; ++i) { + const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1); + + const vision_start_indices = ids.reduce((acc, x, idx) => { + if (x == vision_start_token_id) acc.push(idx); + return acc; + }, []); + + const vision_tokens = vision_start_indices.map((x) => ids[x + 1]); + const image_nums = vision_tokens.filter((x) => x == image_token_id).length; + const video_nums = vision_tokens.filter((x) => x == video_token_id).length; + + /** @type {number[][]} */ + let llm_pos_ids_list = []; + let st = 0; + let remain_images = image_nums; + let remain_videos = video_nums; + for (let j = 0; j < vision_tokens.length; ++j) { + const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id); + const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id); + + const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1; + + const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1; + + let ed; + let t, h, w; + if (ed_image < ed_video) { + [t, h, w] = image_grid_thw_list[image_index]; + ++image_index; + --remain_images; + ed = ed_image; + } else { + [t, h, w] = video_grid_thw_list[video_index]; + ++video_index; + --remain_videos; + ed = ed_video; + } + + const [llm_grid_t, llm_grid_h, llm_grid_w] = [ + Number(t), + Math.floor(Number(h) / spatial_merge_size), + Math.floor(Number(w) / spatial_merge_size), + ]; + const text_len = ed - st; + const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0; + + llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); + + const offset = text_len + st_idx; + const grid_size = llm_grid_t * llm_grid_h * llm_grid_w; + const t_index = Array.from( + { length: grid_size }, + (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)), + ); + const h_index = Array.from( + { length: grid_size }, + (_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h), + ); + const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w)); + + llm_pos_ids_list.push([t_index, h_index, w_index].flat()); + + st = ed + grid_size; + } + + if (st < ids.length) { + const st_idx = llm_pos_ids_list.length > 0 ? 
max(llm_pos_ids_list.at(-1))[0] + 1 : 0; + const text_len = ids.length - st; + + llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len))); + } + + // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len), + // meaning to perform concatenation along dim=1, we can do the following: + const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0); + /** @type {number[]} */ + const llm_positions = new Array(num_items); + let index = 0; + for (let x = 0; x < 3; ++x) { + for (let y = 0; y < llm_pos_ids_list.length; ++y) { + const val = llm_pos_ids_list[y]; + const text_len = val.length / 3; + for (let z = x * text_len; z < (x + 1) * text_len; ++z) { + llm_positions[index++] = val[z]; + } + } + } + + let count = 0; + const attn_mask = attention_mask_list[i]; + for (let y = 0; y < attn_mask.length; ++y) { + if (attn_mask[y] == 1) { + for (let x = 0; x < 3; ++x) { + position_ids_list[x][i][y] = llm_positions[(x * num_items) / 3 + count]; + } + ++count; + } + } + + const max_llm_positions = max(llm_positions)[0]; + mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length); + } + + return [ + new Tensor('int64', position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]), + new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), + ]; + } else { + // Text-only + if (attention_mask) { + const { data, dims } = cumsum_masked_fill(attention_mask); + + const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]); + /** @type {bigint[]} */ + const mrope_position_deltas = Array.from( + { length: dims[0] }, + (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]), + ); + + return [ + new Tensor('int64', position_ids, [3, ...dims]), + new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]), + ]; + } else { + const [batch_size, seq_length] = input_ids.dims; + const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) => + BigInt(Math.floor((i % seq_length) / batch_size)), + ); + + return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])]; + } + } + } + + async encode_image({ pixel_values, image_grid_thw }) { + const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, grid_thw: image_grid_thw })) + .image_features; + return features; + } + + _merge_input_ids_with_image_features(kwargs) { + return default_merge_input_ids_with_image_features({ + // @ts-ignore + image_token_id: this.config.image_token_id, + ...kwargs, + }); + } + + prepare_inputs_for_generation(input_ids, model_inputs, generation_config) { + // Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if (model_inputs.attention_mask && !model_inputs.position_ids) { + // Calculate position_ids and rope_deltas + if (!model_inputs.past_key_values) { + [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index( + model_inputs.input_ids, + model_inputs.image_grid_thw, + model_inputs.video_grid_thw, + model_inputs.attention_mask, + ); + } else { + model_inputs.pixel_values = null; + // model_inputs.pixel_values_videos = null; + + const delta = BigInt(Object.values(model_inputs.past_key_values)[0].dims.at(-2)); + const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x); + model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0); + } + } + + return 
model_inputs; + } +} diff --git a/src/models/pre-trained-models/qwen3-pre-trained-model.js b/src/models/pre-trained-models/qwen3-pre-trained-model.js new file mode 100644 index 000000000..bcef352b5 --- /dev/null +++ b/src/models/pre-trained-models/qwen3-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Qwen3 Model outputting raw hidden-states without any specific head on top. + */ +export class Qwen3PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Qwen3 Model outputting raw hidden-states without any specific head on top. + */ +export class Qwen3Model extends Qwen3PreTrainedModel {} + +export class Qwen3ForCausalLM extends Qwen3PreTrainedModel {} diff --git a/src/models/pre-trained-models/res-net-pre-trained-model.js b/src/models/pre-trained-models/res-net-pre-trained-model.js new file mode 100644 index 000000000..9bb37a428 --- /dev/null +++ b/src/models/pre-trained-models/res-net-pre-trained-model.js @@ -0,0 +1,24 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + */ +export class ResNetPreTrainedModel extends PreTrainedModel {} + +/** + * The bare ResNet model outputting raw features without any specific head on top. + */ +export class ResNetModel extends ResNetPreTrainedModel {} + +/** + * ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for ImageNet. + */ +export class ResNetForImageClassification extends ResNetPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/rf-detr-pre-trained-model.js b/src/models/pre-trained-models/rf-detr-pre-trained-model.js new file mode 100644 index 000000000..9a7d79f4b --- /dev/null +++ b/src/models/pre-trained-models/rf-detr-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { RTDetrObjectDetectionOutput } from './rt-detr-pre-trained-model.js'; + +export class RFDetrPreTrainedModel extends PreTrainedModel {} +export class RFDetrModel extends RFDetrPreTrainedModel {} +export class RFDetrForObjectDetection extends RFDetrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RFDetrObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class RFDetrObjectDetectionOutput extends RTDetrObjectDetectionOutput {} diff --git a/src/models/pre-trained-models/ro-former-pre-trained-model.js b/src/models/pre-trained-models/ro-former-pre-trained-model.js new file mode 100644 index 000000000..a9f41d6e1 --- /dev/null +++ b/src/models/pre-trained-models/ro-former-pre-trained-model.js @@ -0,0 +1,77 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + MaskedLMOutput, +} from '../output.js'; + +export class RoFormerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare RoFormer Model transformer outputting raw hidden-states without any specific head on top. + */ +export class RoFormerModel extends RoFormerPreTrainedModel {} + +/** + * RoFormer Model with a `language modeling` head on top. 
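+ *
+ * **Example:** Masked token prediction via the `fill-mask` pipeline (a minimal sketch; the model
+ * id is a hypothetical placeholder, and the mask token depends on the checkpoint's tokenizer).
+ * ```javascript
+ * import { pipeline } from '@huggingface/transformers';
+ *
+ * // NOTE: hypothetical checkpoint id, shown for illustration only
+ * const unmasker = await pipeline('fill-mask', 'onnx-community/roformer-base-ONNX');
+ * const output = await unmasker('The capital of France is [MASK].');
+ * // [{ sequence: '...', token_str: '...', score: ... }, ...]
+ * ```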
+ */ +export class RoFormerForMaskedLM extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for masked language modeling. + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * RoFormer Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class RoFormerForSequenceClassification extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RoFormer Model with a token classification head on top (a linear layer on top of the hidden-states output) + * e.g. for Named-Entity-Recognition (NER) tasks. + */ +export class RoFormerForTokenClassification extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RoFormer Model with a span classification head on top for extractive question-answering tasks like SQuAD + * (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + */ +export class RoFormerForQuestionAnswering extends RoFormerPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for question answering. + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} +// TODO: Add RoFormerForCausalLM and RoFormerForMultipleChoice \ No newline at end of file diff --git a/src/models/pre-trained-models/roberta-pre-trained-model.js b/src/models/pre-trained-models/roberta-pre-trained-model.js new file mode 100644 index 000000000..982bc1b5c --- /dev/null +++ b/src/models/pre-trained-models/roberta-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class RobertaPreTrainedModel extends PreTrainedModel {} +export class RobertaModel extends RobertaPreTrainedModel {} + +/** + * RobertaForMaskedLM class for performing masked language modeling on Roberta models. + */ +export class RobertaForMaskedLM extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * RobertaForSequenceClassification class for performing sequence classification on Roberta models. + */ +export class RobertaForSequenceClassification extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RobertaForTokenClassification class for performing token classification on Roberta models. + */ +export class RobertaForTokenClassification extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * RobertaForQuestionAnswering class for performing question answering on Roberta models. + */ +export class RobertaForQuestionAnswering extends RobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} \ No newline at end of file diff --git a/src/models/pre-trained-models/rt-detr-pre-trained-model.js b/src/models/pre-trained-models/rt-detr-pre-trained-model.js new file mode 100644 index 000000000..fa39c4678 --- /dev/null +++ b/src/models/pre-trained-models/rt-detr-pre-trained-model.js @@ -0,0 +1,28 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ModelOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class RTDetrPreTrainedModel extends PreTrainedModel {} +export class RTDetrModel extends RTDetrPreTrainedModel {} +export class RTDetrForObjectDetection extends RTDetrPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RTDetrObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class RTDetrObjectDetectionOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. + * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). 
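+ *
+ * For reference, a normalized `(center_x, center_y, width, height)` box can be converted to
+ * absolute corner coordinates as follows (a small sketch, independent of any post-processing
+ * helpers the library provides):
+ * ```javascript
+ * const toCorners = ([cx, cy, w, h], imgWidth, imgHeight) => [
+ *   (cx - w / 2) * imgWidth,  // x_min
+ *   (cy - h / 2) * imgHeight, // y_min
+ *   (cx + w / 2) * imgWidth,  // x_max
+ *   (cy + h / 2) * imgHeight, // y_max
+ * ];
+ * ```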
+ */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } +} diff --git a/src/models/pre-trained-models/rt-detr-v2-pre-trained-model.js b/src/models/pre-trained-models/rt-detr-v2-pre-trained-model.js new file mode 100644 index 000000000..02a9398e4 --- /dev/null +++ b/src/models/pre-trained-models/rt-detr-v2-pre-trained-model.js @@ -0,0 +1,15 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { RTDetrObjectDetectionOutput } from './rt-detr-pre-trained-model.js'; + +export class RTDetrV2PreTrainedModel extends PreTrainedModel {} +export class RTDetrV2Model extends RTDetrV2PreTrainedModel {} +export class RTDetrV2ForObjectDetection extends RTDetrV2PreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new RTDetrV2ObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class RTDetrV2ObjectDetectionOutput extends RTDetrObjectDetectionOutput {} diff --git a/src/models/pre-trained-models/sam-pre-trained-model.js b/src/models/pre-trained-models/sam-pre-trained-model.js new file mode 100644 index 000000000..3b4d9d0cf --- /dev/null +++ b/src/models/pre-trained-models/sam-pre-trained-model.js @@ -0,0 +1,130 @@ +import { encoderForward } from '../utils.js'; +import { ones } from '../../utils/tensor.js'; +import { sessionRun } from '../session.js'; +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SamImageSegmentationOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class SamPreTrainedModel extends PreTrainedModel {} + +/** + * Segment Anything Model (SAM) for generating segmentation masks, given an input image + * and optional 2D location and bounding boxes. + * + * **Example:** Perform mask generation w/ `Xenova/sam-vit-base`. + * ```javascript + * import { SamModel, AutoProcessor, RawImage } from '@huggingface/transformers'; + * + * const model = await SamModel.from_pretrained('Xenova/sam-vit-base'); + * const processor = await AutoProcessor.from_pretrained('Xenova/sam-vit-base'); + * + * const img_url = 'https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png'; + * const raw_image = await RawImage.read(img_url); + * const input_points = [[[450, 600]]] // 2D localization of a window + * + * const inputs = await processor(raw_image, { input_points }); + * const outputs = await model(inputs); + * + * const masks = await processor.post_process_masks(outputs.pred_masks, inputs.original_sizes, inputs.reshaped_input_sizes); + * // [ + * // Tensor { + * // dims: [ 1, 3, 1764, 2646 ], + * // type: 'bool', + * // data: Uint8Array(14002632) [ ... ], + * // size: 14002632 + * // } + * // ] + * const scores = outputs.iou_scores; + * // Tensor { + * // dims: [ 1, 1, 3 ], + * // type: 'float32', + * // data: Float32Array(3) [ + * // 0.8892380595207214, + * // 0.9311248064041138, + * // 0.983696699142456 + * // ], + * // size: 3 + * // } + * ``` + */ +export class SamModel extends SamPreTrainedModel { + /** + * Compute image embeddings and positional image embeddings, given the pixel values of an image. + * @param {Object} model_inputs Object containing the model inputs. + * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `SamProcessor`. + * @returns {Promise<{ image_embeddings: Tensor, image_positional_embeddings: Tensor }>} The image embeddings and positional image embeddings. 
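+ *
+ * **Example:** Reuse image embeddings across several prompts (a minimal sketch that builds on
+ * the class-level example above; computing the embeddings once avoids re-running the vision
+ * encoder for every new point).
+ * ```javascript
+ * // `model`, `processor` and `raw_image` as in the example above
+ * const inputs = await processor(raw_image, { input_points: [[[450, 600]]] });
+ * const embeddings = await model.get_image_embeddings(inputs);
+ *
+ * // Subsequent calls can skip the vision encoder by passing the cached embeddings
+ * const outputs = await model({ ...inputs, ...embeddings });
+ * ```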
+ */ + async get_image_embeddings({ pixel_values }) { + // in: + // - pixel_values: tensor.float32[batch_size,3,1024,1024] + // + // out: + // - image_embeddings: tensor.float32[batch_size,256,64,64] + // - image_positional_embeddings: tensor.float32[batch_size,256,64,64] + return await encoderForward(this, { pixel_values }); + } + + /** + * @typedef {Object} SamModelInputs Object containing the model inputs. + * @property {Tensor} pixel_values Pixel values as a Tensor with shape `(batch_size, num_channels, height, width)`. + * These can be obtained using a `SamProcessor`. + * @property {Tensor} [input_points] Input 2D spatial points with shape `(batch_size, num_points, 2)`. + * This is used by the prompt encoder to encode the prompt. + * @property {Tensor} [input_labels] Input labels for the points, as a Tensor of shape `(batch_size, point_batch_size, num_points)`. + * This is used by the prompt encoder to encode the prompt. There are 4 types of labels: + * - `1`: the point is a point that contains the object of interest + * - `0`: the point is a point that does not contain the object of interest + * - `-1`: the point corresponds to the background + * - `-10`: the point is a padding point, thus should be ignored by the prompt encoder + * @property {Tensor} [input_boxes] Input bounding boxes with shape `(batch_size, num_boxes, 4)`. + * @property {Tensor} [image_embeddings] Image embeddings used by the mask decoder. + * @property {Tensor} [image_positional_embeddings] Image positional embeddings used by the mask decoder. + */ + + /** + * @param {SamModelInputs} model_inputs Object containing the model inputs. + * @returns {Promise} The output of the model. + */ + async forward(model_inputs) { + if (!model_inputs.image_embeddings || !model_inputs.image_positional_embeddings) { + // Compute the image embeddings if they are missing + model_inputs = { + ...model_inputs, + ...(await this.get_image_embeddings(model_inputs)), + }; + } else { + model_inputs = { ...model_inputs }; + } + + // Set default input labels if they are missing + model_inputs.input_labels ??= ones(model_inputs.input_points.dims.slice(0, -1)); + + const decoder_inputs = { + image_embeddings: model_inputs.image_embeddings, + image_positional_embeddings: model_inputs.image_positional_embeddings, + }; + if (model_inputs.input_points) { + decoder_inputs.input_points = model_inputs.input_points; + } + if (model_inputs.input_labels) { + decoder_inputs.input_labels = model_inputs.input_labels; + } + if (model_inputs.input_boxes) { + decoder_inputs.input_boxes = model_inputs.input_boxes; + } + + // Returns: + // - iou_scores: tensor.float32[batch_size,point_batch_size,3] + // - pred_masks: tensor.float32[batch_size,point_batch_size,3,256,256] + return await sessionRun(this.sessions['prompt_encoder_mask_decoder'], decoder_inputs); + } + + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + async _call(model_inputs) { + return new SamImageSegmentationOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/sam2-pre-trained-model.js b/src/models/pre-trained-models/sam2-pre-trained-model.js new file mode 100644 index 000000000..e32cca1d9 --- /dev/null +++ b/src/models/pre-trained-models/sam2-pre-trained-model.js @@ -0,0 +1,81 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { encoderForward } from '../utils.js'; +import { sessionRun } from '../session.js'; +import { 
Sam2ImageSegmentationOutput } from '../output.js'; +import { ones, full } from '../../utils/tensor.js'; +import { pick } from '../../utils/core.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Sam2PreTrainedModel extends PreTrainedModel {} +export class Sam2Model extends Sam2PreTrainedModel { + /** + * Compute image embeddings and positional image embeddings, given the pixel values of an image. + * @param {Object} model_inputs Object containing the model inputs. + * @param {Tensor} model_inputs.pixel_values Pixel values obtained using a `Sam2Processor`. + * @returns {Promise>} The image embeddings. + */ + async get_image_embeddings({ pixel_values }) { + // in: + // - pixel_values: tensor.float32[batch_size,3,1024,1024] + // + // out: + // - image_embeddings.0: tensor.float32[batch_size,32,256,256] + // - image_embeddings.1: tensor.float32[batch_size,64,128,128] + // - image_embeddings.2: tensor.float32[batch_size,256,64,64] + return await encoderForward(this, { pixel_values }); + } + + async forward(model_inputs) { + // @ts-expect-error ts(2339) + const { num_feature_levels } = this.config.vision_config; + const image_embeddings_name = Array.from({ length: num_feature_levels }, (_, i) => `image_embeddings.${i}`); + + if (image_embeddings_name.some((name) => !model_inputs[name])) { + // Compute the image embeddings if they are missing + model_inputs = { + ...model_inputs, + ...(await this.get_image_embeddings(model_inputs)), + }; + } else { + model_inputs = { ...model_inputs }; + } + + if (model_inputs.input_points) { + if (model_inputs.input_boxes && model_inputs.input_boxes.dims[1] !== 1) { + throw new Error( + 'When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.', + ); + } + const shape = model_inputs.input_points.dims; + model_inputs.input_labels ??= ones(shape.slice(0, -1)); + model_inputs.input_boxes ??= full([shape[0], 0, 4], 0.0); + } else if (model_inputs.input_boxes) { + // only boxes + const shape = model_inputs.input_boxes.dims; + model_inputs.input_labels = full([shape[0], shape[1], 0], -1n); + model_inputs.input_points = full([shape[0], 1, 0, 2], 0.0); + } else { + throw new Error('At least one of `input_points` or `input_boxes` must be provided.'); + } + + const prompt_encoder_mask_decoder_session = this.sessions['prompt_encoder_mask_decoder']; + const decoder_inputs = pick(model_inputs, prompt_encoder_mask_decoder_session.inputNames); + + // Returns: + // - iou_scores: tensor.float32[batch_size,num_boxes_or_points,3] + // - pred_masks: tensor.float32[batch_size,num_boxes_or_points,3,256,256] + // - object_score_logits: tensor.float32[batch_size,num_boxes_or_points,1] + return await sessionRun(prompt_encoder_mask_decoder_session, decoder_inputs); + } + + /** + * Runs the model with the provided inputs + * @param {Object} model_inputs Model inputs + * @returns {Promise} Object containing segmentation outputs + */ + async _call(model_inputs) { + return new Sam2ImageSegmentationOutput(await super._call(model_inputs)); + } +} +export class EdgeTamModel extends Sam2Model {} // NOTE: extends Sam2Model +export class Sam3TrackerModel extends Sam2Model {} // NOTE: extends Sam2Model \ No newline at end of file diff --git a/src/models/pre-trained-models/sapiens-pre-trained-model.js b/src/models/pre-trained-models/sapiens-pre-trained-model.js new file mode 100644 index 000000000..8b9c24256 --- /dev/null +++ b/src/models/pre-trained-models/sapiens-pre-trained-model.js @@ -0,0 +1,6 @@ +import { PreTrainedModel } from 
'../pre-trained-model.js'; + +export class SapiensPreTrainedModel extends PreTrainedModel {} +export class SapiensForSemanticSegmentation extends SapiensPreTrainedModel {} +export class SapiensForDepthEstimation extends SapiensPreTrainedModel {} +export class SapiensForNormalEstimation extends SapiensPreTrainedModel {} diff --git a/src/models/pre-trained-models/segformer-pre-trained-model.js b/src/models/pre-trained-models/segformer-pre-trained-model.js new file mode 100644 index 000000000..e8f28c211 --- /dev/null +++ b/src/models/pre-trained-models/segformer-pre-trained-model.js @@ -0,0 +1,18 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class SegformerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top. + */ +export class SegformerModel extends SegformerPreTrainedModel {} + +/** + * SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden states) e.g. for ImageNet. + */ +export class SegformerForImageClassification extends SegformerPreTrainedModel {} + +/** + * SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes. + */ +export class SegformerForSemanticSegmentation extends SegformerPreTrainedModel {} diff --git a/src/models/pre-trained-models/siglip-pre-trained-model.js b/src/models/pre-trained-models/siglip-pre-trained-model.js new file mode 100644 index 000000000..81c26dcef --- /dev/null +++ b/src/models/pre-trained-models/siglip-pre-trained-model.js @@ -0,0 +1,123 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CLIPPreTrainedModel } from './clip-pre-trained-model.js'; + +export class SiglipPreTrainedModel extends PreTrainedModel {} + +/** + * SigLIP Text and Vision Model with a projection layers on top + * + * **Example:** Perform zero-shot image classification with a `SiglipModel`. + * + * ```javascript + * import { AutoTokenizer, AutoProcessor, SiglipModel, RawImage } from '@huggingface/transformers'; + * + * // Load tokenizer, processor, and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); + * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); + * const model = await SiglipModel.from_pretrained('Xenova/siglip-base-patch16-224'); + * + * // Run tokenization + * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; + * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); + * + * // Read image and run processor + * const image = await RawImage.read('http://images.cocodataset.org/val2017/000000039769.jpg'); + * const image_inputs = await processor(image); + * + * // Run model with both text and pixel inputs + * const output = await model({ ...text_inputs, ...image_inputs }); + * // { + * // logits_per_image: Tensor { + * // dims: [ 1, 2 ], + * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], + * // }, + * // logits_per_text: Tensor { + * // dims: [ 2, 1 ], + * // data: Float32Array(2) [ -1.6019744873046875, -10.720091819763184 ], + * // }, + * // text_embeds: Tensor { + * // dims: [ 2, 768 ], + * // data: Float32Array(1536) [ ... ], + * // }, + * // image_embeds: Tensor { + * // dims: [ 1, 768 ], + * // data: Float32Array(768) [ ... 
], + * // } + * // } + * ``` + */ +export class SiglipModel extends SiglipPreTrainedModel {} + +/** + * The text model from SigLIP without any head or projection on top. + * + * **Example:** Compute text embeddings with `SiglipTextModel`. + * + * ```javascript + * import { AutoTokenizer, SiglipTextModel } from '@huggingface/transformers'; + * + * // Load tokenizer and text model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/siglip-base-patch16-224'); + * const text_model = await SiglipTextModel.from_pretrained('Xenova/siglip-base-patch16-224'); + * + * // Run tokenization + * const texts = ['a photo of 2 cats', 'a photo of 2 dogs']; + * const text_inputs = tokenizer(texts, { padding: 'max_length', truncation: true }); + * + * // Compute embeddings + * const { pooler_output } = await text_model(text_inputs); + * // Tensor { + * // dims: [ 2, 768 ], + * // type: 'float32', + * // data: Float32Array(1536) [ ... ], + * // size: 1536 + * // } + * ``` + */ +export class SiglipTextModel extends SiglipPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'text_model', + }); + } +} + +/** + * The vision model from SigLIP without any head or projection on top. + * + * **Example:** Compute vision embeddings with `SiglipVisionModel`. + * + * ```javascript + * import { AutoProcessor, SiglipVisionModel, RawImage} from '@huggingface/transformers'; + * + * // Load processor and vision model + * const processor = await AutoProcessor.from_pretrained('Xenova/siglip-base-patch16-224'); + * const vision_model = await SiglipVisionModel.from_pretrained('Xenova/siglip-base-patch16-224'); + * + * // Read image and run processor + * const image = await RawImage.read('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/football-match.jpg'); + * const image_inputs = await processor(image); + * + * // Compute embeddings + * const { pooler_output } = await vision_model(image_inputs); + * // Tensor { + * // dims: [ 1, 768 ], + * // type: 'float32', + * // data: Float32Array(768) [ ... ], + * // size: 768 + * // } + * ``` + */ +export class SiglipVisionModel extends CLIPPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 
'vision_model', + }); + } +} diff --git a/src/models/pre-trained-models/smol-lm3-pre-trained-model.js b/src/models/pre-trained-models/smol-lm3-pre-trained-model.js new file mode 100644 index 000000000..77c02063c --- /dev/null +++ b/src/models/pre-trained-models/smol-lm3-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class SmolLM3PreTrainedModel extends PreTrainedModel {} +export class SmolLM3Model extends SmolLM3PreTrainedModel {} +export class SmolLM3ForCausalLM extends SmolLM3PreTrainedModel {} diff --git a/src/models/pre-trained-models/snac-pre-trained-model.js b/src/models/pre-trained-models/snac-pre-trained-model.js new file mode 100644 index 000000000..fb1c84035 --- /dev/null +++ b/src/models/pre-trained-models/snac-pre-trained-model.js @@ -0,0 +1,53 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class SnacPreTrainedModel extends PreTrainedModel { + main_input_name = 'input_values'; + forward_params = ['input_values']; +} + +/** + * The SNAC (Multi-Scale Neural Audio Codec) model. + */ +export class SnacModel extends SnacPreTrainedModel { + /** + * Encodes the input audio waveform into discrete codes. + * @param {Object} inputs Model inputs + * @param {Tensor} [inputs.input_values] Float values of the input audio waveform, of shape `(batch_size, channels, sequence_length)`). + * @returns {Promise>} The output tensors of shape `(batch_size, num_codebooks, sequence_length)`. + */ + async encode(inputs) { + return await sessionRun(this.sessions['encoder_model'], inputs); + } + + /** + * Decodes the given frames into an output audio waveform. + * @param {Record} inputs The encoded audio codes. + * @returns {Promise<{audio_values: Tensor}>} The output tensor of shape `(batch_size, num_channels, sequence_length)`. + */ + async decode(inputs) { + return await sessionRun(this.sessions['decoder_model'], inputs); + } +} + +export class SnacEncoderModel extends SnacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'encoder_model', + }); + } +} +export class SnacDecoderModel extends SnacPreTrainedModel { + /** @type {typeof PreTrainedModel.from_pretrained} */ + static async from_pretrained(pretrained_model_name_or_path, options = {}) { + return super.from_pretrained(pretrained_model_name_or_path, { + ...options, + // Update default model file name if not provided + model_file_name: options.model_file_name ?? 'decoder_model', + }); + } +} diff --git a/src/models/pre-trained-models/speech-t5-pre-trained-model.js b/src/models/pre-trained-models/speech-t5-pre-trained-model.js new file mode 100644 index 000000000..2f0a051a7 --- /dev/null +++ b/src/models/pre-trained-models/speech-t5-pre-trained-model.js @@ -0,0 +1,165 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { sessionRun } from '../session.js'; +import { Tensor, boolTensor, cat } from '../../utils/tensor.js'; +import { encoderForward } from '../utils.js'; + +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
+ */ +export class SpeechT5PreTrainedModel extends PreTrainedModel {} + +/** + * The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets. + */ +export class SpeechT5Model extends SpeechT5PreTrainedModel {} + +/** + * SpeechT5 Model with a speech encoder and a text decoder. + * + * **Example:** Generate speech from text with `SpeechT5ForSpeechToText`. + * ```javascript + * import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@huggingface/transformers'; + * + * // Load the tokenizer and processor + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts'); + * const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts'); + * + * // Load the models + * // NOTE: We use the full-precision versions as they are more accurate + * const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { dtype: 'fp32' }); + * const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { dtype: 'fp32' }); + * + * // Load speaker embeddings from URL + * const speaker_embeddings_data = new Float32Array( + * await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer() + * ); + * const speaker_embeddings = new Tensor( + * 'float32', + * speaker_embeddings_data, + * [1, speaker_embeddings_data.length] + * ) + * + * // Run tokenization + * const { input_ids } = tokenizer('Hello, my dog is cute'); + * + * // Generate waveform + * const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder }); + * console.log(waveform) + * // Tensor { + * // dims: [ 26112 ], + * // type: 'float32', + * // size: 26112, + * // data: Float32Array(26112) [ -0.00043630177970044315, -0.00018082228780258447, ... ], + * // } + * ``` + */ +export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel {} + +/** + * SpeechT5 Model with a text encoder and a speech decoder. + */ +export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { + /** + * @typedef {Object} SpeechOutput + * @property {Tensor} [spectrogram] The predicted log-mel spectrogram of shape + * `(output_sequence_length, config.num_mel_bins)`. Returned when no `vocoder` is provided + * @property {Tensor} [waveform] The predicted waveform of shape `(num_frames,)`. Returned when a `vocoder` is provided. + * @property {Tensor} [cross_attentions] The outputs of the decoder's cross-attention layers of shape + * `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, input_sequence_length)`. returned when `output_cross_attentions` is `true`. + */ + + /** + * Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a speech waveform using a vocoder. + * @param {Tensor} input_values Indices of input sequence tokens in the vocabulary. + * @param {Tensor} speaker_embeddings Tensor containing the speaker embeddings. + * @param {Object} options Optional parameters for generating speech. + * @param {number} [options.threshold=0.5] The generated sequence ends when the predicted stop token probability exceeds this value. + * @param {number} [options.minlenratio=0.0] Used to calculate the minimum required length for the output sequence. + * @param {number} [options.maxlenratio=20.0] Used to calculate the maximum allowed length for the output sequence. 
+ * @param {Object} [options.vocoder=null] The vocoder that converts the mel spectrogram into a speech waveform. If `null`, the output is the mel spectrogram. + * @param {boolean} [options.output_cross_attentions=false] Whether or not to return the attentions tensors of the decoder's cross-attention layers. + * @returns {Promise} A promise which resolves to an object containing the spectrogram, waveform, and cross-attention tensors. + */ + async generate_speech( + input_values, + speaker_embeddings, + { + threshold = 0.5, + minlenratio = 0.0, + maxlenratio = 20.0, + vocoder = null, + // output_cross_attentions = false, // TODO add + } = {}, + ) { + const model_inputs = { + input_ids: input_values, + }; + + const { encoder_outputs, encoder_attention_mask } = await encoderForward(this, model_inputs); + + // @ts-expect-error TS2339 + const r = encoder_outputs.dims[1] / this.config.reduction_factor; + const maxlen = Math.floor(r * maxlenratio); + const minlen = Math.floor(r * minlenratio); + + // @ts-expect-error TS2339 + const num_mel_bins = this.config.num_mel_bins; + + let spectrogramParts = []; + let past_key_values = null; + let decoder_outputs = null; + let idx = 0; + + while (true) { + ++idx; + + const use_cache_branch = boolTensor(!!decoder_outputs); + let output_sequence; + if (decoder_outputs) { + output_sequence = decoder_outputs.output_sequence_out; + } else { + output_sequence = new Tensor('float32', new Float32Array(num_mel_bins), [1, 1, num_mel_bins]); + } + let decoderFeeds = { + use_cache_branch, + output_sequence, + encoder_attention_mask: encoder_attention_mask, + speaker_embeddings: speaker_embeddings, + encoder_hidden_states: encoder_outputs, + }; + + this.addPastKeyValues(decoderFeeds, past_key_values); + decoder_outputs = await sessionRun(this.sessions['decoder_model_merged'], decoderFeeds); + past_key_values = this.getPastKeyValues(decoder_outputs, past_key_values); + + const { prob, spectrum } = decoder_outputs; + spectrogramParts.push(spectrum); + + if ( + idx >= minlen && + // Finished when stop token or maximum length is reached. + (Array.from(prob.data).filter((p) => p >= threshold).length > 0 || idx >= maxlen) + ) { + break; + } + } + + const spectrogram = cat(spectrogramParts); + const { waveform } = await sessionRun(vocoder.sessions['model'], { spectrogram }); + + return { + spectrogram, + waveform, + // cross_attentions: null, // TODO add + }; + } +} + +/** + * HiFi-GAN vocoder. + * + * See [SpeechT5ForSpeechToText](./models#module_models.SpeechT5ForSpeechToText) for example usage. + */ +export class SpeechT5HifiGan extends PreTrainedModel { + main_input_name = 'spectrogram'; +} diff --git a/src/models/pre-trained-models/squeeze-bert-pre-trained-model.js b/src/models/pre-trained-models/squeeze-bert-pre-trained-model.js new file mode 100644 index 000000000..9c9b19f72 --- /dev/null +++ b/src/models/pre-trained-models/squeeze-bert-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput } from '../output.js'; + +export class SqueezeBertPreTrainedModel extends PreTrainedModel {} +export class SqueezeBertModel extends SqueezeBertPreTrainedModel {} +export class SqueezeBertForMaskedLM extends SqueezeBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} +export class SqueezeBertForSequenceClassification extends SqueezeBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class SqueezeBertForQuestionAnswering extends SqueezeBertPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/stable-lm-pre-trained-model.js b/src/models/pre-trained-models/stable-lm-pre-trained-model.js new file mode 100644 index 000000000..9f7876955 --- /dev/null +++ b/src/models/pre-trained-models/stable-lm-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class StableLmPreTrainedModel extends PreTrainedModel {} + +/** + * The bare StableLm Model transformer outputting raw hidden-states without any specific head on top. + */ +export class StableLmModel extends StableLmPreTrainedModel {} + +/** + * StableLm Model with a `language modeling` head on top for Causal Language Modeling (with past). + */ +export class StableLmForCausalLM extends StableLmPreTrainedModel {} diff --git a/src/models/pre-trained-models/starcoder2-pre-trained-model.js b/src/models/pre-trained-models/starcoder2-pre-trained-model.js new file mode 100644 index 000000000..4814e056a --- /dev/null +++ b/src/models/pre-trained-models/starcoder2-pre-trained-model.js @@ -0,0 +1,10 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * The bare Starcoder2 Model outputting raw hidden-states without any specific head on top. 
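+ *
+ * A minimal text-generation sketch with the `Starcoder2ForCausalLM` class exported below.
+ * The checkpoint id is a placeholder — substitute any compatible ONNX export.
+ *
+ * ```javascript
+ * import { AutoTokenizer, Starcoder2ForCausalLM } from '@huggingface/transformers';
+ *
+ * // Load tokenizer and model (placeholder checkpoint id)
+ * const model_id = 'onnx-community/starcoder2-3b';
+ * const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+ * const model = await Starcoder2ForCausalLM.from_pretrained(model_id);
+ *
+ * // Tokenize a code prompt and generate a completion
+ * const inputs = tokenizer('def fibonacci(n):');
+ * const output_ids = await model.generate({ ...inputs, max_new_tokens: 32 });
+ * console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true })[0]);
+ * ```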
+ */ +export class Starcoder2PreTrainedModel extends PreTrainedModel {} + +export class Starcoder2Model extends Starcoder2PreTrainedModel {} + +export class Starcoder2ForCausalLM extends Starcoder2PreTrainedModel {} diff --git a/src/models/pre-trained-models/style-text-to-speech2-pre-trained-model.js b/src/models/pre-trained-models/style-text-to-speech2-pre-trained-model.js new file mode 100644 index 000000000..21006e035 --- /dev/null +++ b/src/models/pre-trained-models/style-text-to-speech2-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel {} +export class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel {} diff --git a/src/models/pre-trained-models/supertonic-pre-trained-model.js b/src/models/pre-trained-models/supertonic-pre-trained-model.js new file mode 100644 index 000000000..69686ba59 --- /dev/null +++ b/src/models/pre-trained-models/supertonic-pre-trained-model.js @@ -0,0 +1,59 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ones, full, randn } from '../../utils/tensor.js'; +import { sessionRun } from '../session.js'; + +export class SupertonicPreTrainedModel extends PreTrainedModel {} +export class SupertonicForConditionalGeneration extends SupertonicPreTrainedModel { + async generate_speech({ + // Required inputs + input_ids, + attention_mask, + style, + + // Optional inputs + num_inference_steps = 5, + speed = 1.05, + }) { + // @ts-expect-error TS2339 + const { sampling_rate, chunk_compress_factor, base_chunk_size, latent_dim } = this.config; + + // 1. Text Encoder + const { last_hidden_state, durations } = await sessionRun(this.sessions['text_encoder'], { + input_ids, + attention_mask, + style, + }); + durations.div_(speed); // Apply speed factor to duration + + // 2. Latent Denoiser + const wav_len_max = durations.max().item() * sampling_rate; + const chunk_size = base_chunk_size * chunk_compress_factor; + const latent_len = Math.floor((wav_len_max + chunk_size - 1) / chunk_size); + const batch_size = input_ids.dims[0]; + const latent_mask = ones([batch_size, latent_len]); + const num_steps = full([batch_size], num_inference_steps); + + let noisy_latents = randn([batch_size, latent_dim * chunk_compress_factor, latent_len]); + for (let step = 0; step < num_inference_steps; ++step) { + const timestep = full([batch_size], step); + ({ denoised_latents: noisy_latents } = await sessionRun(this.sessions['latent_denoiser'], { + style, + noisy_latents, + latent_mask, + encoder_outputs: last_hidden_state, + attention_mask, + timestep, + num_inference_steps: num_steps, + })); + } + + // 3. 
Voice Decoder + const { waveform } = await sessionRun(this.sessions['voice_decoder'], { + latents: noisy_latents, + }); + return { + waveform, + durations, + }; + } +} diff --git a/src/models/pre-trained-models/swin-pre-trained-model.js b/src/models/pre-trained-models/swin-pre-trained-model.js new file mode 100644 index 000000000..082f81f89 --- /dev/null +++ b/src/models/pre-trained-models/swin-pre-trained-model.js @@ -0,0 +1,14 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class SwinPreTrainedModel extends PreTrainedModel {} +export class SwinModel extends SwinPreTrainedModel {} +export class SwinForImageClassification extends SwinPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} +export class SwinForSemanticSegmentation extends SwinPreTrainedModel {} diff --git a/src/models/pre-trained-models/swin2-sr-pre-trained-model.js b/src/models/pre-trained-models/swin2-sr-pre-trained-model.js new file mode 100644 index 000000000..ac1494bdb --- /dev/null +++ b/src/models/pre-trained-models/swin2-sr-pre-trained-model.js @@ -0,0 +1,42 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class Swin2SRPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Swin2SRModel extends Swin2SRPreTrainedModel {} + +/** + * Swin2SR Model transformer with an upsampler head on top for image super resolution and restoration. + * + * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64`. + * + * ```javascript + * import { AutoProcessor, Swin2SRForImageSuperResolution, RawImage } from '@huggingface/transformers'; + * + * // Load processor and model + * const model_id = 'Xenova/swin2SR-classical-sr-x2-64'; + * const processor = await AutoProcessor.from_pretrained(model_id); + * const model = await Swin2SRForImageSuperResolution.from_pretrained(model_id); + * + * // Prepare model inputs + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/butterfly.jpg'; + * const image = await RawImage.fromURL(url); + * const inputs = await processor(image); + * + * // Run model + * const outputs = await model(inputs); + * + * // Convert Tensor to RawImage + * const output = outputs.reconstruction.squeeze().clamp_(0, 1).mul_(255).round_().to('uint8'); + * const outputImage = RawImage.fromTensor(output); + * // RawImage { + * // data: Uint8Array(786432) [ 41, 31, 24, ... ], + * // width: 512, + * // height: 512, + * // channels: 3 + * // } + * ``` + */ +export class Swin2SRForImageSuperResolution extends Swin2SRPreTrainedModel {} diff --git a/src/models/pre-trained-models/t5-pre-trained-model.js b/src/models/pre-trained-models/t5-pre-trained-model.js new file mode 100644 index 000000000..96118b833 --- /dev/null +++ b/src/models/pre-trained-models/t5-pre-trained-model.js @@ -0,0 +1,19 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class T5PreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'attention_mask', + 'encoder_outputs', + 'decoder_input_ids', + 'decoder_attention_mask', + 'past_key_values', + ]; +} + +export class T5Model extends T5PreTrainedModel {} + +/** + * T5Model is a class representing a T5 model for conditional generation. 
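+ *
+ * A minimal text-to-text generation sketch with `T5ForConditionalGeneration`, assuming the
+ * `Xenova/flan-t5-small` checkpoint is available on the Hub.
+ *
+ * ```javascript
+ * import { AutoTokenizer, T5ForConditionalGeneration } from '@huggingface/transformers';
+ *
+ * // Load tokenizer and model (assumed checkpoint)
+ * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/flan-t5-small');
+ * const model = await T5ForConditionalGeneration.from_pretrained('Xenova/flan-t5-small');
+ *
+ * // Run tokenization and generate
+ * const inputs = tokenizer('translate English to German: I love transformers.');
+ * const output_ids = await model.generate({ ...inputs, max_new_tokens: 40 });
+ * console.log(tokenizer.batch_decode(output_ids, { skip_special_tokens: true })[0]);
+ * ```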
+ */ +export class T5ForConditionalGeneration extends T5PreTrainedModel {} \ No newline at end of file diff --git a/src/models/pre-trained-models/table-transformer-pre-trained-model.js b/src/models/pre-trained-models/table-transformer-pre-trained-model.js new file mode 100644 index 000000000..dfcb9ad9b --- /dev/null +++ b/src/models/pre-trained-models/table-transformer-pre-trained-model.js @@ -0,0 +1,24 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { DetrObjectDetectionOutput } from './detr-pre-trained-model.js'; + +export class TableTransformerPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) + * outputting raw hidden-states without any specific head on top. + */ +export class TableTransformerModel extends TableTransformerPreTrainedModel {} + +/** + * Table Transformer Model (consisting of a backbone and encoder-decoder Transformer) + * with object detection heads on top, for tasks such as COCO detection. + */ +export class TableTransformerForObjectDetection extends TableTransformerPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new TableTransformerObjectDetectionOutput(await super._call(model_inputs)); + } +} +export class TableTransformerObjectDetectionOutput extends DetrObjectDetectionOutput {} diff --git a/src/models/pre-trained-models/tr-ocr-pre-trained-model.js b/src/models/pre-trained-models/tr-ocr-pre-trained-model.js new file mode 100644 index 000000000..9ecaa7c6f --- /dev/null +++ b/src/models/pre-trained-models/tr-ocr-pre-trained-model.js @@ -0,0 +1,8 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class TrOCRPreTrainedModel extends PreTrainedModel {} + +/** + * The TrOCR Decoder with a language modeling head. + */ +export class TrOCRForCausalLM extends TrOCRPreTrainedModel {} diff --git a/src/models/pre-trained-models/ultravox-pre-trained-model.js b/src/models/pre-trained-models/ultravox-pre-trained-model.js new file mode 100644 index 000000000..1c3f6022d --- /dev/null +++ b/src/models/pre-trained-models/ultravox-pre-trained-model.js @@ -0,0 +1,22 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { default_merge_input_ids_with_audio_features } from '../utils.js'; + +export class UltravoxPreTrainedModel extends PreTrainedModel { + forward_params = ['input_ids', 'attention_mask', 'position_ids', 'audio_values', 'past_key_values']; +} + +export class UltravoxModel extends UltravoxPreTrainedModel { + _merge_input_ids_with_audio_features(kwargs) { + const audio_hidden_size = kwargs.audio_features.dims.at(-1); + const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size); + + return default_merge_input_ids_with_audio_features({ + // @ts-ignore + audio_token_id: this.config.ignore_index ?? 
this.config.audio_token_id, + ...kwargs, + audio_features: reshaped_audio_features, + }); + } +} + +export class VoxtralForConditionalGeneration extends UltravoxModel {} diff --git a/src/models/pre-trained-models/uni-speech-pre-trained-model.js b/src/models/pre-trained-models/uni-speech-pre-trained-model.js new file mode 100644 index 000000000..2d26b9c90 --- /dev/null +++ b/src/models/pre-trained-models/uni-speech-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class UniSpeechPreTrainedModel extends PreTrainedModel {} + +/** + * The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top. + */ +export class UniSpeechModel extends UniSpeechPreTrainedModel {} + +/** + * UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class UniSpeechForCTC extends UniSpeechPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class UniSpeechForSequenceClassification extends UniSpeechPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/uni-speech-sat-pre-trained-model.js b/src/models/pre-trained-models/uni-speech-sat-pre-trained-model.js new file mode 100644 index 000000000..a66b6f2a4 --- /dev/null +++ b/src/models/pre-trained-models/uni-speech-sat-pre-trained-model.js @@ -0,0 +1,52 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class UniSpeechSatPreTrainedModel extends PreTrainedModel {} + +/** + * The bare UniSpeechSat Model transformer outputting raw hidden-states without any specific head on top. + */ +export class UniSpeechSatModel extends UniSpeechSatPreTrainedModel {} + +/** + * UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class UniSpeechSatForCTC extends UniSpeechSatPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeechSat Model with a sequence classification head on top (a linear layer over the pooled output). 
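+ *
+ * A minimal audio-classification sketch with `UniSpeechSatForSequenceClassification`.
+ * The checkpoint id is a placeholder — substitute any compatible ONNX export.
+ *
+ * ```javascript
+ * import { AutoProcessor, UniSpeechSatForSequenceClassification, read_audio } from '@huggingface/transformers';
+ *
+ * // Read and preprocess audio (placeholder checkpoint id)
+ * const model_id = 'onnx-community/unispeech-sat-base-plus-sid';
+ * const processor = await AutoProcessor.from_pretrained(model_id);
+ * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000);
+ * const inputs = await processor(audio);
+ *
+ * // Run classification
+ * const model = await UniSpeechSatForSequenceClassification.from_pretrained(model_id);
+ * const { logits } = await model(inputs);
+ * ```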
+ */ +export class UniSpeechSatForSequenceClassification extends UniSpeechSatPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * UniSpeechSat Model with a frame classification head on top for tasks like Speaker Diarization. + */ +export class UniSpeechSatForAudioFrameClassification extends UniSpeechSatPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vault-gemma-pre-trained-model.js b/src/models/pre-trained-models/vault-gemma-pre-trained-model.js new file mode 100644 index 000000000..fb2223a2b --- /dev/null +++ b/src/models/pre-trained-models/vault-gemma-pre-trained-model.js @@ -0,0 +1,5 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class VaultGemmaPreTrainedModel extends PreTrainedModel {} +export class VaultGemmaModel extends VaultGemmaPreTrainedModel {} +export class VaultGemmaForCausalLM extends VaultGemmaPreTrainedModel {} diff --git a/src/models/pre-trained-models/vi-t-pre-trained-model.js b/src/models/pre-trained-models/vi-t-pre-trained-model.js new file mode 100644 index 000000000..2f9589c59 --- /dev/null +++ b/src/models/pre-trained-models/vi-t-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ViTPreTrainedModel extends PreTrainedModel {} +export class ViTModel extends ViTPreTrainedModel {} +export class ViTForImageClassification extends ViTPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vi-tmae-pre-trained-model.js b/src/models/pre-trained-models/vi-tmae-pre-trained-model.js new file mode 100644 index 000000000..c25e7b910 --- /dev/null +++ b/src/models/pre-trained-models/vi-tmae-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class ViTMAEPreTrainedModel extends PreTrainedModel {} +export class ViTMAEModel extends ViTMAEPreTrainedModel {} diff --git a/src/models/pre-trained-models/vi-tmsn-pre-trained-model.js b/src/models/pre-trained-models/vi-tmsn-pre-trained-model.js new file mode 100644 index 000000000..47f577064 --- /dev/null +++ b/src/models/pre-trained-models/vi-tmsn-pre-trained-model.js @@ -0,0 +1,13 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { SequenceClassifierOutput } from '../output.js'; + +export class ViTMSNPreTrainedModel extends PreTrainedModel {} +export class ViTMSNModel extends ViTMSNPreTrainedModel {} +export class ViTMSNForImageClassification extends ViTMSNPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vision-encoder-decoder-model.js b/src/models/pre-trained-models/vision-encoder-decoder-model.js new file mode 
100644 index 000000000..6066dd24a --- /dev/null +++ b/src/models/pre-trained-models/vision-encoder-decoder-model.js @@ -0,0 +1,17 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +/** + * Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks + */ +export class VisionEncoderDecoderModel extends PreTrainedModel { + main_input_name = 'pixel_values'; + forward_params = [ + // Encoder inputs + 'pixel_values', + + // Decoder inpputs + 'decoder_input_ids', + 'encoder_hidden_states', + 'past_key_values', + ]; +} diff --git a/src/models/pre-trained-models/vit-matte-pre-trained-model.js b/src/models/pre-trained-models/vit-matte-pre-trained-model.js new file mode 100644 index 000000000..b8b351c41 --- /dev/null +++ b/src/models/pre-trained-models/vit-matte-pre-trained-model.js @@ -0,0 +1,64 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ImageMattingOutput } from '../output.js'; + +export class VitMattePreTrainedModel extends PreTrainedModel {} + +/** + * ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes. + * + * **Example:** Perform image matting with a `VitMatteForImageMatting` model. + * ```javascript + * import { AutoProcessor, VitMatteForImageMatting, RawImage } from '@huggingface/transformers'; + * + * // Load processor and model + * const processor = await AutoProcessor.from_pretrained('Xenova/vitmatte-small-distinctions-646'); + * const model = await VitMatteForImageMatting.from_pretrained('Xenova/vitmatte-small-distinctions-646'); + * + * // Load image and trimap + * const image = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_image.png'); + * const trimap = await RawImage.fromURL('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/vitmatte_trimap.png'); + * + * // Prepare image + trimap for the model + * const inputs = await processor(image, trimap); + * + * // Predict alpha matte + * const { alphas } = await model(inputs); + * // Tensor { + * // dims: [ 1, 1, 640, 960 ], + * // type: 'float32', + * // size: 614400, + * // data: Float32Array(614400) [ 0.9894027709960938, 0.9970508813858032, ... 
] + * // } + * ``` + * + * You can visualize the alpha matte as follows: + * ```javascript + * import { Tensor, cat } from '@huggingface/transformers'; + * + * // Visualize predicted alpha matte + * const imageTensor = image.toTensor(); + * + * // Convert float (0-1) alpha matte to uint8 (0-255) + * const alphaChannel = alphas + * .squeeze(0) + * .mul_(255) + * .clamp_(0, 255) + * .round_() + * .to('uint8'); + * + * // Concatenate original image with predicted alpha + * const imageData = cat([imageTensor, alphaChannel], 0); + * + * // Save output image + * const outputImage = RawImage.fromTensor(imageData); + * outputImage.save('output.png'); + * ``` + */ +export class VitMatteForImageMatting extends VitMattePreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new ImageMattingOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/vit-pose-pre-trained-model.js b/src/models/pre-trained-models/vit-pose-pre-trained-model.js new file mode 100644 index 000000000..1c9d2faf9 --- /dev/null +++ b/src/models/pre-trained-models/vit-pose-pre-trained-model.js @@ -0,0 +1,8 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class VitPosePreTrainedModel extends PreTrainedModel {} + +/** + * The VitPose model with a pose estimation head on top. + */ +export class VitPoseForPoseEstimation extends VitPosePreTrainedModel {} diff --git a/src/models/pre-trained-models/vits-pre-trained-model.js b/src/models/pre-trained-models/vits-pre-trained-model.js new file mode 100644 index 000000000..e7a558d0e --- /dev/null +++ b/src/models/pre-trained-models/vits-pre-trained-model.js @@ -0,0 +1,39 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { VitsModelOutput } from '../output.js'; + +export class VitsPreTrainedModel extends PreTrainedModel {} + +/** + * The complete VITS model, for text-to-speech synthesis. + * + * **Example:** Generate speech from text with `VitsModel`. + * ```javascript + * import { AutoTokenizer, VitsModel } from '@huggingface/transformers'; + * + * // Load the tokenizer and model + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/mms-tts-eng'); + * const model = await VitsModel.from_pretrained('Xenova/mms-tts-eng'); + * + * // Run tokenization + * const inputs = tokenizer('I love transformers'); + * + * // Generate waveform + * const { waveform } = await model(inputs); + * // Tensor { + * // dims: [ 1, 35328 ], + * // type: 'float32', + * // data: Float32Array(35328) [ ... ], + * // size: 35328, + * // } + * ``` + */ +export class VitsModel extends VitsPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} The outputs for the VITS model. + */ + async _call(model_inputs) { + return new VitsModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/wav-lm-pre-trained-model.js b/src/models/pre-trained-models/wav-lm-pre-trained-model.js new file mode 100644 index 000000000..46b2d881c --- /dev/null +++ b/src/models/pre-trained-models/wav-lm-pre-trained-model.js @@ -0,0 +1,155 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput, XVectorOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +/** + * An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
+ */ +export class WavLMPreTrainedModel extends PreTrainedModel {} + +/** + * The bare WavLM Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Load and run a `WavLMModel` for feature extraction. + * + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base'); + * const audio = await read_audio('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav', 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/wavlm-base'); + * const output = await model(inputs); + * // { + * // last_hidden_state: Tensor { + * // dims: [ 1, 549, 768 ], + * // type: 'float32', + * // data: Float32Array(421632) [-0.349443256855011, -0.39341306686401367, 0.022836603224277496, ...], + * // size: 421632 + * // } + * // } + * ``` + */ +export class WavLMModel extends WavLMPreTrainedModel {} + +/** + * WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class WavLMForCTC extends WavLMPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * WavLM Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class WavLMForSequenceClassification extends WavLMPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification. + * + * **Example:** Extract speaker embeddings with `WavLMForXVector`. + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sv'); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; + * const audio = await read_audio(url, 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/wavlm-base-plus-sv'); + * const outputs = await model(inputs); + * // { + * // logits: Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [0.5847219228744507, ...], + * // size: 512 + * // }, + * // embeddings: Tensor { + * // dims: [ 1, 512 ], + * // type: 'float32', + * // data: Float32Array(512) [-0.09079201519489288, ...], + * // size: 512 + * // } + * // } + * ``` + */ +export class WavLMForXVector extends WavLMPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits and speaker embeddings. 
+ */ + async _call(model_inputs) { + return new XVectorOutput(await super._call(model_inputs)); + } +} + +/** + * WavLM Model with a frame classification head on top for tasks like Speaker Diarization. + * + * **Example:** Perform speaker diarization with `WavLMForAudioFrameClassification`. + * ```javascript + * import { AutoProcessor, AutoModelForAudioFrameClassification, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/wavlm-base-plus-sd'); + * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav'; + * const audio = await read_audio(url, 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModelForAudioFrameClassification.from_pretrained('Xenova/wavlm-base-plus-sd'); + * const { logits } = await model(inputs); + * // { + * // logits: Tensor { + * // dims: [ 1, 549, 2 ], // [batch_size, num_frames, num_speakers] + * // type: 'float32', + * // data: Float32Array(1098) [-3.5301010608673096, ...], + * // size: 1098 + * // } + * // } + * + * const labels = logits[0].sigmoid().tolist().map( + * frames => frames.map(speaker => speaker > 0.5 ? 1 : 0) + * ); + * console.log(labels); // labels is a one-hot array of shape (num_frames, num_speakers) + * // [ + * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], + * // [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], + * // [0, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], + * // ... + * // ] + * ``` + */ +export class WavLMForAudioFrameClassification extends WavLMPreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/wav2-vec2-bert-pre-trained-model.js b/src/models/pre-trained-models/wav2-vec2-bert-pre-trained-model.js new file mode 100644 index 000000000..4719694c3 --- /dev/null +++ b/src/models/pre-trained-models/wav2-vec2-bert-pre-trained-model.js @@ -0,0 +1,38 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Wav2Vec2BertPreTrainedModel extends PreTrainedModel {} + +/** + * The bare Wav2Vec2Bert Model transformer outputting raw hidden-states without any specific head on top. + */ +export class Wav2Vec2BertModel extends Wav2Vec2BertPreTrainedModel {} + +/** + * Wav2Vec2Bert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC). + */ +export class Wav2Vec2BertForCTC extends Wav2Vec2BertPreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_features Float values of input mel-spectrogram. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +/** + * Wav2Vec2Bert Model with a sequence classification head on top (a linear layer over the pooled output). + */ +export class Wav2Vec2BertForSequenceClassification extends Wav2Vec2BertPreTrainedModel { + /** + * Calls the model on new inputs. 
+ * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/wav2-vec2-pre-trained-model.js b/src/models/pre-trained-models/wav2-vec2-pre-trained-model.js new file mode 100644 index 000000000..d175a579b --- /dev/null +++ b/src/models/pre-trained-models/wav2-vec2-pre-trained-model.js @@ -0,0 +1,69 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class Wav2Vec2PreTrainedModel extends PreTrainedModel {} + +/** + * The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top. + * + * **Example:** Load and run a `Wav2Vec2Model` for feature extraction. + * + * ```javascript + * import { AutoProcessor, AutoModel, read_audio } from '@huggingface/transformers'; + * + * // Read and preprocess audio + * const processor = await AutoProcessor.from_pretrained('Xenova/mms-300m'); + * const audio = await read_audio('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac', 16000); + * const inputs = await processor(audio); + * + * // Run model with inputs + * const model = await AutoModel.from_pretrained('Xenova/mms-300m'); + * const output = await model(inputs); + * // { + * // last_hidden_state: Tensor { + * // dims: [ 1, 1144, 1024 ], + * // type: 'float32', + * // data: Float32Array(1171456) [ ... ], + * // size: 1171456 + * // } + * // } + * ``` + */ +export class Wav2Vec2Model extends Wav2Vec2PreTrainedModel {} + +export class Wav2Vec2ForCTC extends Wav2Vec2PreTrainedModel { + /** + * @param {Object} model_inputs + * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform. + * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1] + */ + async _call(model_inputs) { + return new CausalLMOutput(await super._call(model_inputs)); + } +} + +export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization. + */ +export class Wav2Vec2ForAudioFrameClassification extends Wav2Vec2PreTrainedModel { + /** + * Calls the model on new inputs. + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for sequence classification. 
+ */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/we-speaker-res-net-pre-trained-model.js b/src/models/pre-trained-models/we-speaker-res-net-pre-trained-model.js new file mode 100644 index 000000000..8290eaf8c --- /dev/null +++ b/src/models/pre-trained-models/we-speaker-res-net-pre-trained-model.js @@ -0,0 +1,4 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; + +export class WeSpeakerResNetPreTrainedModel extends PreTrainedModel {} +export class WeSpeakerResNetModel extends WeSpeakerResNetPreTrainedModel {} diff --git a/src/models/pre-trained-models/whisper-pre-trained-model.js b/src/models/pre-trained-models/whisper-pre-trained-model.js new file mode 100644 index 000000000..96f280812 --- /dev/null +++ b/src/models/pre-trained-models/whisper-pre-trained-model.js @@ -0,0 +1,293 @@ +import { cat, mean, Tensor, stack, std_mean } from '../../utils/tensor.js'; +import { PreTrainedModel } from '../pre-trained-model.js'; +import { WhisperGenerationConfig } from '../model-processors/whisper/generation_whisper.js'; +import { whisper_language_to_code } from '../model-processors/whisper/common_whisper.js'; +import { + LogitsProcessorList, + SuppressTokensAtBeginLogitsProcessor, + WhisperTimeStampLogitsProcessor, +} from '../../generation/logits_process.js'; +import { medianFilter, dynamic_time_warping } from '../../utils/maths.js'; +import { mergeArrays } from '../../utils/core.js'; +import { ModelOutput } from '../output.js'; + +export class WhisperPreTrainedModel extends PreTrainedModel { + requires_attention_mask = false; + main_input_name = 'input_features'; + forward_params = [ + 'input_features', + 'attention_mask', + 'decoder_input_ids', + 'decoder_attention_mask', + 'past_key_values', + ]; +} + +/** + * WhisperModel class for training Whisper models without a language model head. + */ +export class WhisperModel extends WhisperPreTrainedModel {} + +/** + * WhisperForConditionalGeneration class for generating conditional outputs from Whisper models. + */ +export class WhisperForConditionalGeneration extends WhisperPreTrainedModel { + _prepare_generation_config(generation_config, kwargs) { + return /** @type {WhisperGenerationConfig} */ ( + super._prepare_generation_config(generation_config, kwargs, WhisperGenerationConfig) + ); + } + + /** + * + * @param {WhisperGenerationConfig} generation_config + */ + _retrieve_init_tokens(generation_config) { + // prefix tokens are of the form: + // - Multilingual: <|startoftranscript|> <|lang_id|> <|task|> [<|notimestamps|>] + // - English-only: <|startoftranscript|> [<|notimestamps|>] + + // 1. Handle <|startoftranscript|> token + const init_tokens = [generation_config.decoder_start_token_id]; + + // 2. Handle <|lang_id|> and <|task> tokens + let language = generation_config.language; + const task = generation_config.task; + if (generation_config.is_multilingual) { + if (!language) { + // TODO: Implement language detection + console.warn('No language specified - defaulting to English (en).'); + language = 'en'; + } + + // Add language token + const language_code = whisper_language_to_code(language); + const language_token = `<|${language_code}|>`; + init_tokens.push(generation_config.lang_to_id[language_token]); + + // Add task token + // NOTE: Defaults to 'transcribe' if no task is specified + init_tokens.push(generation_config.task_to_id[task ?? 
'transcribe']); + } else if (language || task) { + throw new Error( + 'Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.', + ); + } + + // 3. Handle <|notimestamps|> token + if ( + !generation_config.return_timestamps && + generation_config.no_timestamps_token_id && + init_tokens.at(-1) !== generation_config.no_timestamps_token_id + ) { + init_tokens.push(generation_config.no_timestamps_token_id); + } else if ( + generation_config.return_timestamps && + init_tokens.at(-1) === generation_config.no_timestamps_token_id + ) { + console.warn( + '<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`.', + ); + init_tokens.pop(); + } + + // let's make sure we don't pass `null` tokens as prompt tokens + return init_tokens.filter((token) => token != null); + } + + /** + * Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids. + * @param {import('../model-processors/whisper/generation_whisper.js').WhisperGenerationFunctionParameters} options + * @returns {Promise} The output of the model, which can contain the generated token ids, attentions, and scores. + */ + async generate({ + inputs = null, + generation_config = null, + logits_processor = null, + stopping_criteria = null, + + // Whisper-specific options (passed to kwargs) + // prompt_ids = null, + // language = null, + // task = null, + + ...kwargs + }) { + generation_config = this._prepare_generation_config(generation_config, kwargs); + + const init_tokens = kwargs.decoder_input_ids ?? this._retrieve_init_tokens(generation_config); + + if (generation_config.return_timestamps) { + logits_processor ??= new LogitsProcessorList(); + logits_processor.push(new WhisperTimeStampLogitsProcessor(generation_config, init_tokens)); + } + + if (generation_config.begin_suppress_tokens) { + logits_processor ??= new LogitsProcessorList(); + logits_processor.push( + new SuppressTokensAtBeginLogitsProcessor(generation_config.begin_suppress_tokens, init_tokens.length), + ); + } + + if (generation_config.return_token_timestamps) { + if (!generation_config.alignment_heads) { + throw new Error( + 'Model generation config has no `alignment_heads`, token-level timestamps not available. ' + + 'See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.', + ); + } + + if (generation_config.task === 'translate') { + console.warn("Token-level timestamps may not be reliable for task 'translate'."); + } + + generation_config.output_attentions = true; + generation_config.return_dict_in_generate = true; + } + + const outputs = await super.generate({ + inputs, + generation_config, + logits_processor, + decoder_input_ids: init_tokens, + ...kwargs, + }); + + if (generation_config.return_token_timestamps) { + outputs['token_timestamps'] = this._extract_token_timestamps( + // @ts-expect-error TS2345 + outputs, + generation_config.alignment_heads, + generation_config.num_frames, + ); + } + + return outputs; + } + + /** + * Calculates token-level timestamps using the encoder-decoder cross-attentions and + * dynamic time-warping (DTW) to map each output token to a position in the input audio. + * If `num_frames` is specified, the encoder-decoder cross-attentions will be cropped before applying DTW. 
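+ *
+ * A minimal usage sketch (assuming a Whisper checkpoint whose generation config defines
+ * `alignment_heads`): this method is normally invoked internally by `generate` when
+ * `return_token_timestamps` is enabled.
+ * ```javascript
+ * const outputs = await model.generate({
+ *   ...inputs, // processor output containing `input_features`
+ *   return_token_timestamps: true,
+ * });
+ * // outputs.token_timestamps holds per-token times (in seconds) for each generated sequence
+ * ```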
+ * @param {Object} generate_outputs Outputs generated by the model + * @param {Tensor[][]} generate_outputs.cross_attentions The cross attentions output by the model + * @param {Tensor} generate_outputs.sequences The sequences output by the model + * @param {number[][]} alignment_heads Alignment heads of the model + * @param {number} [num_frames=null] Number of frames in the input audio. + * @param {number} [time_precision=0.02] Precision of the timestamps in seconds + * @returns {Tensor} tensor containing the timestamps in seconds for each predicted token + */ + _extract_token_timestamps(generate_outputs, alignment_heads, num_frames = null, time_precision = 0.02) { + if (!generate_outputs.cross_attentions) { + throw new Error( + 'Model outputs must contain cross attentions to extract timestamps. ' + + 'This is most likely because the model was not exported with `output_attentions=True`.', + ); + } + if (num_frames == null) { + console.warn( + '`num_frames` has not been set, meaning the entire audio will be analyzed. ' + + 'This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).', + ); + } + + // @ts-expect-error TS2339 + let median_filter_width = this.config.median_filter_width; + if (median_filter_width === undefined) { + console.warn('Model config has no `median_filter_width`, using default value of 7.'); + median_filter_width = 7; + } + + // TODO: Improve batch processing + const batch = generate_outputs.cross_attentions; + // Create a list with `decoder_layers` elements, each a tensor of shape + // (batch size, attention_heads, output length, input length). + const cross_attentions = Array.from( + // @ts-expect-error TS2339 + { length: this.config.decoder_layers }, + // Concatenate the cross attentions for each layer across sequence length dimension. + (_, i) => + cat( + batch.map((x) => x[i]), + 2, + ), + ); + + const weights = stack( + alignment_heads.map(([l, h]) => { + if (l >= cross_attentions.length) { + throw new Error( + `Layer index ${l} is out of bounds for cross attentions (length ${cross_attentions.length}).`, + ); + } + return num_frames + ? cross_attentions[l].slice(null, h, null, [0, num_frames]) + : cross_attentions[l].slice(null, h); + }), + ).transpose(1, 0, 2, 3); + + const [std, calculatedMean] = std_mean(weights, -2, 0, true); + + // Normalize and smoothen the weights. + const smoothedWeights = weights.clone(); // [1, 8, seqLength, 1500] + + for (let a = 0; a < smoothedWeights.dims[0]; ++a) { + const aTensor = smoothedWeights[a]; // [8, seqLength, 1500] + + for (let b = 0; b < aTensor.dims[0]; ++b) { + const bTensor = aTensor[b]; // [seqLength, 1500] + + const stdTensorData = std[a][b][0].data; // [1500] + const meanTensorData = calculatedMean[a][b][0].data; // [1500] + + for (let c = 0; c < bTensor.dims[0]; ++c) { + let cTensorData = bTensor[c].data; // [1500] + for (let d = 0; d < cTensorData.length; ++d) { + cTensorData[d] = (cTensorData[d] - meanTensorData[d]) / stdTensorData[d]; + } + + // Apply median filter. + cTensorData.set(medianFilter(cTensorData, median_filter_width)); + } + } + } + + // Average the different cross-attention heads. + const batchedMatrices = [mean(smoothedWeights, 1)]; + + const timestampsShape = generate_outputs.sequences.dims; + + const timestamps = new Tensor( + 'float32', + new Float32Array(timestampsShape[0] * timestampsShape[1]), + timestampsShape, + ); + + // Perform dynamic time warping on each element of the batch. 
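+    // For each sequence, DTW aligns output token indices (text_indices) with input audio
+    // frame indices (time_indices) over the negated attention matrix. Wherever the token
+    // index advances (a "jump"), the paired frame marks the start of a new token, and
+    // multiplying that frame index by `time_precision` gives the token's start time in seconds.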
+ for (let batch_idx = 0; batch_idx < timestampsShape[0]; ++batch_idx) { + // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions + // as the python implementation + const matrix = batchedMatrices[batch_idx].neg().squeeze_(0); + const [text_indices, time_indices] = dynamic_time_warping(matrix.tolist()); + + const diffs = Array.from( + { length: text_indices.length - 1 }, + (v, i) => text_indices[i + 1] - text_indices[i], + ); + const jumps = mergeArrays([1], diffs).map((x) => !!x); // convert to boolean + + const jump_times = []; + for (let i = 0; i < jumps.length; ++i) { + if (jumps[i]) { + // NOTE: No point in rounding here, since we set to Float32Array later + jump_times.push(time_indices[i] * time_precision); + } + } + timestamps[batch_idx].data.set(jump_times, 1); + } + + return timestamps; + } +} + +export class LiteWhisperForConditionalGeneration extends WhisperForConditionalGeneration {} diff --git a/src/models/pre-trained-models/xlm-pre-trained-model.js b/src/models/pre-trained-models/xlm-pre-trained-model.js new file mode 100644 index 000000000..e477cf130 --- /dev/null +++ b/src/models/pre-trained-models/xlm-pre-trained-model.js @@ -0,0 +1,74 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class XLMPreTrainedModel extends PreTrainedModel {} + +/** + * The bare XLM Model transformer outputting raw hidden-states without any specific head on top. + */ +export class XLMModel extends XLMPreTrainedModel {} + +/** + * The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). + */ +export class XLMWithLMHeadModel extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) + */ +export class XLMForSequenceClassification extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) + */ +export class XLMForTokenClassification extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLM Model with a span classification head on top for extractive question-answering tasks + */ +export class XLMForQuestionAnswering extends XLMPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. 
+ * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/xlm-roberta-pre-trained-model.js b/src/models/pre-trained-models/xlm-roberta-pre-trained-model.js new file mode 100644 index 000000000..72eb14911 --- /dev/null +++ b/src/models/pre-trained-models/xlm-roberta-pre-trained-model.js @@ -0,0 +1,70 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +} from '../output.js'; + +export class XLMRobertaPreTrainedModel extends PreTrainedModel {} +export class XLMRobertaModel extends XLMRobertaPreTrainedModel {} + +/** + * XLMRobertaForMaskedLM class for performing masked language modeling on XLMRoberta models. + */ +export class XLMRobertaForMaskedLM extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new MaskedLMOutput(await super._call(model_inputs)); + } +} + +/** + * XLMRobertaForSequenceClassification class for performing sequence classification on XLMRoberta models. + */ +export class XLMRobertaForSequenceClassification extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new SequenceClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLMRobertaForTokenClassification class for performing token classification on XLMRoberta models. + */ +export class XLMRobertaForTokenClassification extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} An object containing the model's output logits for token classification. + */ + async _call(model_inputs) { + return new TokenClassifierOutput(await super._call(model_inputs)); + } +} + +/** + * XLMRobertaForQuestionAnswering class for performing question answering on XLMRoberta models. + */ +export class XLMRobertaForQuestionAnswering extends XLMRobertaPreTrainedModel { + /** + * Calls the model on new inputs. + * + * @param {Object} model_inputs The inputs to the model. + * @returns {Promise} returned object + */ + async _call(model_inputs) { + return new QuestionAnsweringModelOutput(await super._call(model_inputs)); + } +} diff --git a/src/models/pre-trained-models/yolos-pre-trained-model.js b/src/models/pre-trained-models/yolos-pre-trained-model.js new file mode 100644 index 000000000..6423a795a --- /dev/null +++ b/src/models/pre-trained-models/yolos-pre-trained-model.js @@ -0,0 +1,28 @@ +import { PreTrainedModel } from '../pre-trained-model.js'; +import { ModelOutput } from '../output.js'; +import { Tensor } from '../../utils/tensor.js'; + +export class YolosPreTrainedModel extends PreTrainedModel {} +export class YolosModel extends YolosPreTrainedModel {} +export class YolosForObjectDetection extends YolosPreTrainedModel { + /** + * @param {any} model_inputs + */ + async _call(model_inputs) { + return new YolosObjectDetectionOutput(await super._call(model_inputs)); + } +} + +export class YolosObjectDetectionOutput extends ModelOutput { + /** + * @param {Object} output The output of the model. 
+ * @param {Tensor} output.logits Classification logits (including no-object) for all queries. + * @param {Tensor} output.pred_boxes Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). + * These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). + */ + constructor({ logits, pred_boxes }) { + super(); + this.logits = logits; + this.pred_boxes = pred_boxes; + } +} diff --git a/src/models/processors.js b/src/models/processors.js index 0348315ad..942710a23 100644 --- a/src/models/processors.js +++ b/src/models/processors.js @@ -1,24 +1,24 @@ -export * from './chatterbox/processing_chatterbox.js'; -export * from './florence2/processing_florence2.js'; -export * from './gemma3n/processing_gemma3n.js'; -export * from './grounding_dino/processing_grounding_dino.js'; -export * from './idefics3/processing_idefics3.js'; -export * from './janus/processing_janus.js'; -export * from './jina_clip/processing_jina_clip.js'; -export * from './llava/processing_llava.js'; -export * from './mgp_str/processing_mgp_str.js'; -export * from './moonshine/processing_moonshine.js'; -export * from './owlvit/processing_owlvit.js'; -export * from './phi3_v/processing_phi3_v.js'; -export * from './paligemma/processing_paligemma.js'; -export * from './pyannote/processing_pyannote.js'; -export * from './qwen2_vl/processing_qwen2_vl.js'; -export * from './sam/processing_sam.js'; -export * from './sam2/processing_sam2.js'; -export * from './smolvlm/processing_smolvlm.js'; -export * from './speecht5/processing_speecht5.js'; -export * from './ultravox/processing_ultravox.js'; -export * from './voxtral/processing_voxtral.js'; -export * from './wav2vec2/processing_wav2vec2.js'; -export * from './wav2vec2_with_lm/processing_wav2vec2_with_lm.js'; -export * from './whisper/processing_whisper.js'; +export * from './model-processors/chatterbox/processing_chatterbox.js'; +export * from './model-processors/florence2/processing_florence2.js'; +export * from './model-processors/gemma3n/processing_gemma3n.js'; +export * from './model-processors/grounding_dino/processing_grounding_dino.js'; +export * from './model-processors/idefics3/processing_idefics3.js'; +export * from './model-processors/janus/processing_janus.js'; +export * from './model-processors/jina_clip/processing_jina_clip.js'; +export * from './model-processors/llava/processing_llava.js'; +export * from './model-processors/mgp_str/processing_mgp_str.js'; +export * from './model-processors/moonshine/processing_moonshine.js'; +export * from './model-processors/owlvit/processing_owlvit.js'; +export * from './model-processors/phi3_v/processing_phi3_v.js'; +export * from './model-processors/paligemma/processing_paligemma.js'; +export * from './model-processors/pyannote/processing_pyannote.js'; +export * from './model-processors/qwen2_vl/processing_qwen2_vl.js'; +export * from './model-processors/sam/processing_sam.js'; +export * from './model-processors/sam2/processing_sam2.js'; +export * from './model-processors/smolvlm/processing_smolvlm.js'; +export * from './model-processors/speecht5/processing_speecht5.js'; +export * from './model-processors/ultravox/processing_ultravox.js'; +export * from './model-processors/voxtral/processing_voxtral.js'; +export * from './model-processors/wav2vec2/processing_wav2vec2.js'; +export * from './model-processors/wav2vec2_with_lm/processing_wav2vec2_with_lm.js'; +export * from './model-processors/whisper/processing_whisper.js'; 
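Note on the re-export change above: processors.js remains the public barrel, so only the internal layout moves under model-processors/. A minimal consumer sketch (the class name and relative path are illustrative, not taken from this diff):

// Downstream code keeps importing through the barrel, unaffected by the directory move.
import { WhisperProcessor } from './models/processors.js';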
diff --git a/src/models/pvt/image_processing_pvt.js b/src/models/pvt/image_processing_pvt.js deleted file mode 100644 index 702af349e..000000000 --- a/src/models/pvt/image_processing_pvt.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class PvtImageProcessor extends ImageProcessor {} diff --git a/src/models/registry.js b/src/models/registry.js new file mode 100644 index 000000000..c6fccfc7e --- /dev/null +++ b/src/models/registry.js @@ -0,0 +1,993 @@ +import { + MODEL_TYPES, + MODEL_TYPE_MAPPING, + MODEL_NAME_TO_CLASS_MAPPING, + MODEL_CLASS_TO_NAME_MAPPING, + PreTrainedModel, +} from './pre-trained-model.js'; + +import { + ASTModel, + ASTForAudioClassification, + AlbertModel, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + AlbertForMaskedLM, + ApertusModel, + ApertusForCausalLM, + ArceeModel, + ArceeForCausalLM, + BartModel, + BartForConditionalGeneration, + BartForSequenceClassification, + BeitModel, + BeitForImageClassification, + BertModel, + BertForMaskedLM, + BertForSequenceClassification, + BertForTokenClassification, + BertForQuestionAnswering, + BlenderbotModel, + BlenderbotForConditionalGeneration, + BlenderbotSmallModel, + BlenderbotSmallForConditionalGeneration, + BloomModel, + BloomForCausalLM, + CLIPModel, + CLIPTextModelWithProjection, + CLIPVisionModelWithProjection, + CLIPSegModel, + CLIPSegForImageSegmentation, + CamembertModel, + CamembertForMaskedLM, + CamembertForSequenceClassification, + CamembertForTokenClassification, + CamembertForQuestionAnswering, + ChatterboxModel, + ChineseCLIPModel, + ClapModel, + ClapTextModelWithProjection, + ClapAudioModelWithProjection, + CodeGenModel, + CodeGenForCausalLM, + CohereModel, + CohereForCausalLM, + ConvBertModel, + ConvBertForMaskedLM, + ConvBertForSequenceClassification, + ConvBertForTokenClassification, + ConvBertForQuestionAnswering, + ConvNextModel, + ConvNextForImageClassification, + ConvNextV2Model, + ConvNextV2ForImageClassification, + DFineModel, + DFineForObjectDetection, + DINOv3ConvNextModel, + DINOv3ViTModel, + DPTModel, + DPTForDepthEstimation, + DacModel, + DacEncoderModel, + DacDecoderModel, + DebertaModel, + DebertaForMaskedLM, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaForQuestionAnswering, + DebertaV2Model, + DebertaV2ForMaskedLM, + DebertaV2ForSequenceClassification, + DebertaV2ForTokenClassification, + DebertaV2ForQuestionAnswering, + DecisionTransformerModel, + DeiTModel, + DeiTForImageClassification, + DepthAnythingForDepthEstimation, + DepthProForDepthEstimation, + DetrModel, + DetrForObjectDetection, + DetrForSegmentation, + Dinov2Model, + Dinov2ForImageClassification, + Dinov2WithRegistersModel, + Dinov2WithRegistersForImageClassification, + DistilBertModel, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + DistilBertForQuestionAnswering, + DistilBertForMaskedLM, + DonutSwinModel, + EfficientNetModel, + EfficientNetForImageClassification, + ElectraModel, + ElectraForMaskedLM, + ElectraForSequenceClassification, + ElectraForTokenClassification, + ElectraForQuestionAnswering, + Ernie4_5_Model, + Ernie4_5_ForCausalLM, + EsmModel, + EsmForMaskedLM, + EsmForSequenceClassification, + EsmForTokenClassification, + ExaoneModel, + ExaoneForCausalLM, + FalconModel, + FalconForCausalLM, + FastViTModel, + FastViTForImageClassification, + Florence2ForConditionalGeneration, + GLPNModel, + GLPNForDepthEstimation, + GPT2Model, + GPT2LMHeadModel, + GPTBigCodeModel, + 
GPTBigCodeForCausalLM, + GPTJModel, + GPTJForCausalLM, + GPTNeoModel, + GPTNeoForCausalLM, + GPTNeoXModel, + GPTNeoXForCausalLM, + Gemma2Model, + Gemma2ForCausalLM, + Gemma3Model, + Gemma3ForCausalLM, + Gemma3nForConditionalGeneration, + GemmaModel, + GemmaForCausalLM, + GlmModel, + GlmForCausalLM, + GptOssModel, + GptOssForCausalLM, + GraniteMoeHybridModel, + GraniteMoeHybridForCausalLM, + GraniteModel, + GraniteForCausalLM, + GroundingDinoForObjectDetection, + GroupViTModel, + HeliumModel, + HeliumForCausalLM, + HieraModel, + HieraForImageClassification, + HubertModel, + HubertForCTC, + HubertForSequenceClassification, + IJepaModel, + IJepaForImageClassification, + Idefics3ForConditionalGeneration, + SmolVLMForConditionalGeneration, + JAISModel, + JAISLMHeadModel, + JinaCLIPModel, + JinaCLIPTextModel, + JinaCLIPVisionModel, + Lfm2Model, + Lfm2ForCausalLM, + Llama4ForCausalLM, + LlamaModel, + LlamaForCausalLM, + LlavaForConditionalGeneration, + LlavaOnevisionForConditionalGeneration, + Moondream1ForConditionalGeneration, + LlavaQwen2ForCausalLM, + LongT5Model, + LongT5ForConditionalGeneration, + M2M100Model, + M2M100ForConditionalGeneration, + MBartModel, + MBartForConditionalGeneration, + MBartForSequenceClassification, + MBartForCausalLM, + MPNetModel, + MPNetForMaskedLM, + MPNetForSequenceClassification, + MPNetForTokenClassification, + MPNetForQuestionAnswering, + MT5Model, + MT5ForConditionalGeneration, + MarianModel, + MarianMTModel, + MaskFormerModel, + MaskFormerForInstanceSegmentation, + Metric3DForDepthEstimation, + Metric3Dv2ForDepthEstimation, + MgpstrForSceneTextRecognition, + MimiModel, + MimiEncoderModel, + MimiDecoderModel, + MistralModel, + MistralForCausalLM, + MobileBertModel, + MobileBertForMaskedLM, + MobileBertForSequenceClassification, + MobileBertForQuestionAnswering, + MobileLLMModel, + MobileLLMForCausalLM, + MobileNetV1Model, + MobileNetV1ForImageClassification, + MobileNetV1ForSemanticSegmentation, + MobileNetV2Model, + MobileNetV2ForImageClassification, + MobileNetV2ForSemanticSegmentation, + MobileNetV3Model, + MobileNetV3ForImageClassification, + MobileNetV3ForSemanticSegmentation, + MobileNetV4Model, + MobileNetV4ForImageClassification, + MobileNetV4ForSemanticSegmentation, + MobileViTModel, + MobileViTForImageClassification, + MobileViTV2Model, + MobileViTV2ForImageClassification, + ModernBertDecoderModel, + ModernBertDecoderForCausalLM, + ModernBertModel, + ModernBertForMaskedLM, + ModernBertForSequenceClassification, + ModernBertForTokenClassification, + MoonshineForConditionalGeneration, + MptModel, + MptForCausalLM, + MultiModalityCausalLM, + MusicgenForConditionalGeneration, + NanoChatModel, + NanoChatForCausalLM, + NeoBertModel, + NeoBertForMaskedLM, + NeoBertForSequenceClassification, + NeoBertForTokenClassification, + NeoBertForQuestionAnswering, + NomicBertModel, + OPTModel, + OPTForCausalLM, + Olmo2Model, + Olmo2ForCausalLM, + Olmo3Model, + Olmo3ForCausalLM, + OlmoModel, + OlmoForCausalLM, + OpenELMModel, + OpenELMForCausalLM, + OwlViTModel, + OwlViTForObjectDetection, + Owlv2Model, + Owlv2ForObjectDetection, + PaliGemmaForConditionalGeneration, + ParakeetForCTC, + PatchTSMixerModel, + PatchTSMixerForPrediction, + PatchTSTModel, + PatchTSTForPrediction, + Phi3Model, + Phi3ForCausalLM, + Phi3VForCausalLM, + PhiModel, + PhiForCausalLM, + PvtModel, + PvtForImageClassification, + PyAnnoteModel, + PyAnnoteForAudioFrameClassification, + Qwen2Model, + Qwen2ForCausalLM, + Qwen2VLForConditionalGeneration, + Qwen3Model, + Qwen3ForCausalLM, + RFDetrModel, + 
RFDetrForObjectDetection, + RTDetrModel, + RTDetrForObjectDetection, + RTDetrV2Model, + RTDetrV2ForObjectDetection, + ResNetModel, + ResNetForImageClassification, + RoFormerModel, + RoFormerForMaskedLM, + RoFormerForSequenceClassification, + RoFormerForTokenClassification, + RoFormerForQuestionAnswering, + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForTokenClassification, + RobertaForQuestionAnswering, + Sam2Model, + EdgeTamModel, + Sam3TrackerModel, + SamModel, + SapiensForSemanticSegmentation, + SapiensForDepthEstimation, + SapiensForNormalEstimation, + SegformerForImageClassification, + SegformerForSemanticSegmentation, + SiglipModel, + SiglipTextModel, + SiglipVisionModel, + SmolLM3Model, + SmolLM3ForCausalLM, + SnacModel, + SnacEncoderModel, + SnacDecoderModel, + SpeechT5ForSpeechToText, + SpeechT5ForTextToSpeech, + SpeechT5HifiGan, + SqueezeBertModel, + SqueezeBertForMaskedLM, + SqueezeBertForSequenceClassification, + SqueezeBertForQuestionAnswering, + StableLmModel, + StableLmForCausalLM, + Starcoder2Model, + Starcoder2ForCausalLM, + StyleTextToSpeech2Model, + SupertonicForConditionalGeneration, + Swin2SRModel, + Swin2SRForImageSuperResolution, + SwinModel, + SwinForImageClassification, + SwinForSemanticSegmentation, + T5Model, + T5ForConditionalGeneration, + TableTransformerModel, + TableTransformerForObjectDetection, + TrOCRForCausalLM, + UltravoxModel, + VoxtralForConditionalGeneration, + UniSpeechModel, + UniSpeechForCTC, + UniSpeechForSequenceClassification, + UniSpeechSatModel, + UniSpeechSatForCTC, + UniSpeechSatForSequenceClassification, + UniSpeechSatForAudioFrameClassification, + VaultGemmaModel, + VaultGemmaForCausalLM, + ViTMAEModel, + ViTMSNModel, + ViTMSNForImageClassification, + ViTModel, + ViTForImageClassification, + VisionEncoderDecoderModel, + VitMatteForImageMatting, + VitPoseForPoseEstimation, + VitsModel, + Wav2Vec2BertModel, + Wav2Vec2BertForCTC, + Wav2Vec2BertForSequenceClassification, + Wav2Vec2Model, + Wav2Vec2ForCTC, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForAudioFrameClassification, + WavLMModel, + WavLMForCTC, + WavLMForSequenceClassification, + WavLMForXVector, + WavLMForAudioFrameClassification, + WeSpeakerResNetModel, + WhisperModel, + WhisperForConditionalGeneration, + LiteWhisperForConditionalGeneration, + XLMModel, + XLMWithLMHeadModel, + XLMForSequenceClassification, + XLMForTokenClassification, + XLMForQuestionAnswering, + XLMRobertaModel, + XLMRobertaForMaskedLM, + XLMRobertaForSequenceClassification, + XLMRobertaForTokenClassification, + XLMRobertaForQuestionAnswering, + YolosModel, + YolosForObjectDetection, +} from './pre-trained-models/index.js'; + +const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([ + ['bert', ['BertModel', BertModel]], + ['neobert', ['NeoBertModel', NeoBertModel]], + ['modernbert', ['ModernBertModel', ModernBertModel]], + ['nomic_bert', ['NomicBertModel', NomicBertModel]], + ['roformer', ['RoFormerModel', RoFormerModel]], + ['electra', ['ElectraModel', ElectraModel]], + ['esm', ['EsmModel', EsmModel]], + ['convbert', ['ConvBertModel', ConvBertModel]], + ['camembert', ['CamembertModel', CamembertModel]], + ['deberta', ['DebertaModel', DebertaModel]], + ['deberta-v2', ['DebertaV2Model', DebertaV2Model]], + ['mpnet', ['MPNetModel', MPNetModel]], + ['albert', ['AlbertModel', AlbertModel]], + ['distilbert', ['DistilBertModel', DistilBertModel]], + ['roberta', ['RobertaModel', RobertaModel]], + ['xlm', ['XLMModel', XLMModel]], + ['xlm-roberta', ['XLMRobertaModel', XLMRobertaModel]], + 
['clap', ['ClapModel', ClapModel]], + ['clip', ['CLIPModel', CLIPModel]], + ['clipseg', ['CLIPSegModel', CLIPSegModel]], + ['chinese_clip', ['ChineseCLIPModel', ChineseCLIPModel]], + ['siglip', ['SiglipModel', SiglipModel]], + ['jina_clip', ['JinaCLIPModel', JinaCLIPModel]], + ['mobilebert', ['MobileBertModel', MobileBertModel]], + ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]], + ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]], + ['wav2vec2-bert', ['Wav2Vec2BertModel', Wav2Vec2BertModel]], + ['unispeech', ['UniSpeechModel', UniSpeechModel]], + ['unispeech-sat', ['UniSpeechSatModel', UniSpeechSatModel]], + ['hubert', ['HubertModel', HubertModel]], + ['wavlm', ['WavLMModel', WavLMModel]], + ['audio-spectrogram-transformer', ['ASTModel', ASTModel]], + ['vits', ['VitsModel', VitsModel]], + ['pyannote', ['PyAnnoteModel', PyAnnoteModel]], + ['wespeaker-resnet', ['WeSpeakerResNetModel', WeSpeakerResNetModel]], + + ['detr', ['DetrModel', DetrModel]], + ['rt_detr', ['RTDetrModel', RTDetrModel]], + ['rt_detr_v2', ['RTDetrV2Model', RTDetrV2Model]], + ['rf_detr', ['RFDetrModel', RFDetrModel]], + ['d_fine', ['DFineModel', DFineModel]], + ['table-transformer', ['TableTransformerModel', TableTransformerModel]], + ['vit', ['ViTModel', ViTModel]], + ['ijepa', ['IJepaModel', IJepaModel]], + ['pvt', ['PvtModel', PvtModel]], + ['vit_msn', ['ViTMSNModel', ViTMSNModel]], + ['vit_mae', ['ViTMAEModel', ViTMAEModel]], + ['groupvit', ['GroupViTModel', GroupViTModel]], + ['fastvit', ['FastViTModel', FastViTModel]], + ['mobilevit', ['MobileViTModel', MobileViTModel]], + ['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]], + ['owlvit', ['OwlViTModel', OwlViTModel]], + ['owlv2', ['Owlv2Model', Owlv2Model]], + ['beit', ['BeitModel', BeitModel]], + ['deit', ['DeiTModel', DeiTModel]], + ['hiera', ['HieraModel', HieraModel]], + ['convnext', ['ConvNextModel', ConvNextModel]], + ['convnextv2', ['ConvNextV2Model', ConvNextV2Model]], + ['dinov2', ['Dinov2Model', Dinov2Model]], + ['dinov2_with_registers', ['Dinov2WithRegistersModel', Dinov2WithRegistersModel]], + ['dinov3_vit', ['DINOv3ViTModel', DINOv3ViTModel]], + ['dinov3_convnext', ['DINOv3ConvNextModel', DINOv3ConvNextModel]], + ['resnet', ['ResNetModel', ResNetModel]], + ['swin', ['SwinModel', SwinModel]], + ['swin2sr', ['Swin2SRModel', Swin2SRModel]], + ['donut-swin', ['DonutSwinModel', DonutSwinModel]], + ['yolos', ['YolosModel', YolosModel]], + ['dpt', ['DPTModel', DPTModel]], + ['glpn', ['GLPNModel', GLPNModel]], + + ['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]], + ['efficientnet', ['EfficientNetModel', EfficientNetModel]], + + ['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]], + ['patchtst', ['PatchTSTForPrediction', PatchTSTModel]], + ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerModel]], + + ['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]], + ['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]], + ['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]], + ['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]], + + ['maskformer', ['MaskFormerModel', MaskFormerModel]], + ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]], + + ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]], +]); + +const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([ + ['t5', ['T5Model', T5Model]], + ['longt5', ['LongT5Model', LongT5Model]], + ['mt5', ['MT5Model', MT5Model]], + ['bart', ['BartModel', BartModel]], + ['mbart', ['MBartModel', MBartModel]], + ['marian', 
['MarianModel', MarianModel]], + ['whisper', ['WhisperModel', WhisperModel]], + ['m2m_100', ['M2M100Model', M2M100Model]], + ['blenderbot', ['BlenderbotModel', BlenderbotModel]], + ['blenderbot-small', ['BlenderbotSmallModel', BlenderbotSmallModel]], +]); + +const MODEL_MAPPING_NAMES_AUTO_ENCODER = new Map([ + ['mimi', ['MimiModel', MimiModel]], + ['dac', ['DacModel', DacModel]], + ['snac', ['SnacModel', SnacModel]], +]); + +const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([ + ['bloom', ['BloomModel', BloomModel]], + ['jais', ['JAISModel', JAISModel]], + ['gpt2', ['GPT2Model', GPT2Model]], + ['gpt_oss', ['GptOssModel', GptOssModel]], + ['gptj', ['GPTJModel', GPTJModel]], + ['gpt_bigcode', ['GPTBigCodeModel', GPTBigCodeModel]], + ['gpt_neo', ['GPTNeoModel', GPTNeoModel]], + ['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]], + ['codegen', ['CodeGenModel', CodeGenModel]], + ['llama', ['LlamaModel', LlamaModel]], + ['apertus', ['ApertusModel', ApertusModel]], + ['nanochat', ['NanoChatModel', NanoChatModel]], + ['arcee', ['ArceeModel', ArceeModel]], + ['lfm2', ['Lfm2Model', Lfm2Model]], + ['smollm3', ['SmolLM3Model', SmolLM3Model]], + ['exaone', ['ExaoneModel', ExaoneModel]], + ['olmo', ['OlmoModel', OlmoModel]], + ['olmo2', ['Olmo2Model', Olmo2Model]], + ['olmo3', ['Olmo3Model', Olmo3Model]], + ['mobilellm', ['MobileLLMModel', MobileLLMModel]], + ['granite', ['GraniteModel', GraniteModel]], + ['granitemoehybrid', ['GraniteMoeHybridModel', GraniteMoeHybridModel]], + ['cohere', ['CohereModel', CohereModel]], + ['gemma', ['GemmaModel', GemmaModel]], + ['gemma2', ['Gemma2Model', Gemma2Model]], + ['vaultgemma', ['VaultGemmaModel', VaultGemmaModel]], + ['gemma3_text', ['Gemma3Model', Gemma3Model]], + ['helium', ['HeliumModel', HeliumModel]], + ['glm', ['GlmModel', GlmModel]], + ['openelm', ['OpenELMModel', OpenELMModel]], + ['qwen2', ['Qwen2Model', Qwen2Model]], + ['qwen3', ['Qwen3Model', Qwen3Model]], + ['phi', ['PhiModel', PhiModel]], + ['phi3', ['Phi3Model', Phi3Model]], + ['mpt', ['MptModel', MptModel]], + ['opt', ['OPTModel', OPTModel]], + ['mistral', ['MistralModel', MistralModel]], + ['ernie4_5', ['Ernie4_5_Model', Ernie4_5_Model]], + ['starcoder2', ['Starcoder2Model', Starcoder2Model]], + ['falcon', ['FalconModel', FalconModel]], + ['stablelm', ['StableLmModel', StableLmModel]], + ['modernbert-decoder', ['ModernBertDecoderModel', ModernBertDecoderModel]], +]); + +export const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([ + ['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]], + ['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]], + ['lite-whisper', ['LiteWhisperForConditionalGeneration', LiteWhisperForConditionalGeneration]], + ['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]], +]); + +const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([ + ['speecht5', ['SpeechT5ForTextToSpeech', SpeechT5ForTextToSpeech]], +]); + +const MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = new Map([ + ['vits', ['VitsModel', VitsModel]], + ['musicgen', ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration]], + ['supertonic', ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration]], +]); + +const MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['bert', ['BertForSequenceClassification', BertForSequenceClassification]], + ['neobert', ['NeoBertForSequenceClassification', NeoBertForSequenceClassification]], + ['modernbert', ['ModernBertForSequenceClassification', 
ModernBertForSequenceClassification]], + ['roformer', ['RoFormerForSequenceClassification', RoFormerForSequenceClassification]], + ['electra', ['ElectraForSequenceClassification', ElectraForSequenceClassification]], + ['esm', ['EsmForSequenceClassification', EsmForSequenceClassification]], + ['convbert', ['ConvBertForSequenceClassification', ConvBertForSequenceClassification]], + ['camembert', ['CamembertForSequenceClassification', CamembertForSequenceClassification]], + ['deberta', ['DebertaForSequenceClassification', DebertaForSequenceClassification]], + ['deberta-v2', ['DebertaV2ForSequenceClassification', DebertaV2ForSequenceClassification]], + ['mpnet', ['MPNetForSequenceClassification', MPNetForSequenceClassification]], + ['albert', ['AlbertForSequenceClassification', AlbertForSequenceClassification]], + ['distilbert', ['DistilBertForSequenceClassification', DistilBertForSequenceClassification]], + ['roberta', ['RobertaForSequenceClassification', RobertaForSequenceClassification]], + ['xlm', ['XLMForSequenceClassification', XLMForSequenceClassification]], + ['xlm-roberta', ['XLMRobertaForSequenceClassification', XLMRobertaForSequenceClassification]], + ['bart', ['BartForSequenceClassification', BartForSequenceClassification]], + ['mbart', ['MBartForSequenceClassification', MBartForSequenceClassification]], + ['mobilebert', ['MobileBertForSequenceClassification', MobileBertForSequenceClassification]], + ['squeezebert', ['SqueezeBertForSequenceClassification', SqueezeBertForSequenceClassification]], +]); + +const MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['bert', ['BertForTokenClassification', BertForTokenClassification]], + ['neobert', ['NeoBertForTokenClassification', NeoBertForTokenClassification]], + ['modernbert', ['ModernBertForTokenClassification', ModernBertForTokenClassification]], + ['roformer', ['RoFormerForTokenClassification', RoFormerForTokenClassification]], + ['electra', ['ElectraForTokenClassification', ElectraForTokenClassification]], + ['esm', ['EsmForTokenClassification', EsmForTokenClassification]], + ['convbert', ['ConvBertForTokenClassification', ConvBertForTokenClassification]], + ['camembert', ['CamembertForTokenClassification', CamembertForTokenClassification]], + ['deberta', ['DebertaForTokenClassification', DebertaForTokenClassification]], + ['deberta-v2', ['DebertaV2ForTokenClassification', DebertaV2ForTokenClassification]], + ['mpnet', ['MPNetForTokenClassification', MPNetForTokenClassification]], + ['distilbert', ['DistilBertForTokenClassification', DistilBertForTokenClassification]], + ['roberta', ['RobertaForTokenClassification', RobertaForTokenClassification]], + ['xlm', ['XLMForTokenClassification', XLMForTokenClassification]], + ['xlm-roberta', ['XLMRobertaForTokenClassification', XLMRobertaForTokenClassification]], +]); + +export const MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = new Map([ + ['t5', ['T5ForConditionalGeneration', T5ForConditionalGeneration]], + ['longt5', ['LongT5ForConditionalGeneration', LongT5ForConditionalGeneration]], + ['mt5', ['MT5ForConditionalGeneration', MT5ForConditionalGeneration]], + ['bart', ['BartForConditionalGeneration', BartForConditionalGeneration]], + ['mbart', ['MBartForConditionalGeneration', MBartForConditionalGeneration]], + ['marian', ['MarianMTModel', MarianMTModel]], + ['m2m_100', ['M2M100ForConditionalGeneration', M2M100ForConditionalGeneration]], + ['blenderbot', ['BlenderbotForConditionalGeneration', BlenderbotForConditionalGeneration]], + ['blenderbot-small', 
['BlenderbotSmallForConditionalGeneration', BlenderbotSmallForConditionalGeneration]], +]); + +export const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([ + ['bloom', ['BloomForCausalLM', BloomForCausalLM]], + ['gpt2', ['GPT2LMHeadModel', GPT2LMHeadModel]], + ['gpt_oss', ['GptOssForCausalLM', GptOssForCausalLM]], + ['jais', ['JAISLMHeadModel', JAISLMHeadModel]], + ['gptj', ['GPTJForCausalLM', GPTJForCausalLM]], + ['gpt_bigcode', ['GPTBigCodeForCausalLM', GPTBigCodeForCausalLM]], + ['gpt_neo', ['GPTNeoForCausalLM', GPTNeoForCausalLM]], + ['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]], + ['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]], + ['llama', ['LlamaForCausalLM', LlamaForCausalLM]], + ['nanochat', ['NanoChatForCausalLM', NanoChatForCausalLM]], + ['apertus', ['ApertusForCausalLM', ApertusForCausalLM]], + ['llama4_text', ['Llama4ForCausalLM', Llama4ForCausalLM]], + ['arcee', ['ArceeForCausalLM', ArceeForCausalLM]], + ['lfm2', ['Lfm2ForCausalLM', Lfm2ForCausalLM]], + ['smollm3', ['SmolLM3ForCausalLM', SmolLM3ForCausalLM]], + ['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]], + ['olmo', ['OlmoForCausalLM', OlmoForCausalLM]], + ['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]], + ['olmo3', ['Olmo3ForCausalLM', Olmo3ForCausalLM]], + ['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]], + ['granite', ['GraniteForCausalLM', GraniteForCausalLM]], + ['granitemoehybrid', ['GraniteMoeHybridForCausalLM', GraniteMoeHybridForCausalLM]], + ['cohere', ['CohereForCausalLM', CohereForCausalLM]], + ['gemma', ['GemmaForCausalLM', GemmaForCausalLM]], + ['gemma2', ['Gemma2ForCausalLM', Gemma2ForCausalLM]], + ['vaultgemma', ['VaultGemmaForCausalLM', VaultGemmaForCausalLM]], + ['gemma3_text', ['Gemma3ForCausalLM', Gemma3ForCausalLM]], + ['helium', ['HeliumForCausalLM', HeliumForCausalLM]], + ['glm', ['GlmForCausalLM', GlmForCausalLM]], + ['openelm', ['OpenELMForCausalLM', OpenELMForCausalLM]], + ['qwen2', ['Qwen2ForCausalLM', Qwen2ForCausalLM]], + ['qwen3', ['Qwen3ForCausalLM', Qwen3ForCausalLM]], + ['phi', ['PhiForCausalLM', PhiForCausalLM]], + ['phi3', ['Phi3ForCausalLM', Phi3ForCausalLM]], + ['mpt', ['MptForCausalLM', MptForCausalLM]], + ['opt', ['OPTForCausalLM', OPTForCausalLM]], + ['mbart', ['MBartForCausalLM', MBartForCausalLM]], + ['mistral', ['MistralForCausalLM', MistralForCausalLM]], + ['ernie4_5', ['Ernie4_5_ForCausalLM', Ernie4_5_ForCausalLM]], + ['starcoder2', ['Starcoder2ForCausalLM', Starcoder2ForCausalLM]], + ['falcon', ['FalconForCausalLM', FalconForCausalLM]], + ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]], + ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]], + ['modernbert-decoder', ['ModernBertDecoderForCausalLM', ModernBertDecoderForCausalLM]], + + // Also image-text-to-text + ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]], +]); + +const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([ + ['multi_modality', ['MultiModalityCausalLM', MultiModalityCausalLM]], +]); + +const MODEL_FOR_MASKED_LM_MAPPING_NAMES = new Map([ + ['bert', ['BertForMaskedLM', BertForMaskedLM]], + ['neobert', ['NeoBertForMaskedLM', NeoBertForMaskedLM]], + ['modernbert', ['ModernBertForMaskedLM', ModernBertForMaskedLM]], + ['roformer', ['RoFormerForMaskedLM', RoFormerForMaskedLM]], + ['electra', ['ElectraForMaskedLM', ElectraForMaskedLM]], + ['esm', ['EsmForMaskedLM', EsmForMaskedLM]], + ['convbert', ['ConvBertForMaskedLM', ConvBertForMaskedLM]], + ['camembert', ['CamembertForMaskedLM', CamembertForMaskedLM]], + ['deberta', ['DebertaForMaskedLM', DebertaForMaskedLM]], + 
['deberta-v2', ['DebertaV2ForMaskedLM', DebertaV2ForMaskedLM]], + ['mpnet', ['MPNetForMaskedLM', MPNetForMaskedLM]], + ['albert', ['AlbertForMaskedLM', AlbertForMaskedLM]], + ['distilbert', ['DistilBertForMaskedLM', DistilBertForMaskedLM]], + ['roberta', ['RobertaForMaskedLM', RobertaForMaskedLM]], + ['xlm', ['XLMWithLMHeadModel', XLMWithLMHeadModel]], + ['xlm-roberta', ['XLMRobertaForMaskedLM', XLMRobertaForMaskedLM]], + ['mobilebert', ['MobileBertForMaskedLM', MobileBertForMaskedLM]], + ['squeezebert', ['SqueezeBertForMaskedLM', SqueezeBertForMaskedLM]], +]); + +const MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ + ['bert', ['BertForQuestionAnswering', BertForQuestionAnswering]], + ['neobert', ['NeoBertForQuestionAnswering', NeoBertForQuestionAnswering]], + ['roformer', ['RoFormerForQuestionAnswering', RoFormerForQuestionAnswering]], + ['electra', ['ElectraForQuestionAnswering', ElectraForQuestionAnswering]], + ['convbert', ['ConvBertForQuestionAnswering', ConvBertForQuestionAnswering]], + ['camembert', ['CamembertForQuestionAnswering', CamembertForQuestionAnswering]], + ['deberta', ['DebertaForQuestionAnswering', DebertaForQuestionAnswering]], + ['deberta-v2', ['DebertaV2ForQuestionAnswering', DebertaV2ForQuestionAnswering]], + ['mpnet', ['MPNetForQuestionAnswering', MPNetForQuestionAnswering]], + ['albert', ['AlbertForQuestionAnswering', AlbertForQuestionAnswering]], + ['distilbert', ['DistilBertForQuestionAnswering', DistilBertForQuestionAnswering]], + ['roberta', ['RobertaForQuestionAnswering', RobertaForQuestionAnswering]], + ['xlm', ['XLMForQuestionAnswering', XLMForQuestionAnswering]], + ['xlm-roberta', ['XLMRobertaForQuestionAnswering', XLMRobertaForQuestionAnswering]], + ['mobilebert', ['MobileBertForQuestionAnswering', MobileBertForQuestionAnswering]], + ['squeezebert', ['SqueezeBertForQuestionAnswering', SqueezeBertForQuestionAnswering]], +]); + +export const MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = new Map([ + ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], + ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], + ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], +]); + +const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ + ['llava', ['LlavaForConditionalGeneration', LlavaForConditionalGeneration]], + ['llava_onevision', ['LlavaOnevisionForConditionalGeneration', LlavaOnevisionForConditionalGeneration]], + ['moondream1', ['Moondream1ForConditionalGeneration', Moondream1ForConditionalGeneration]], + ['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]], + ['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]], + ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]], + ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]], + ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]], + ['llava_qwen2', ['LlavaQwen2ForCausalLM', LlavaQwen2ForCausalLM]], + ['gemma3n', ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration]], +]); + +const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([ + ['ultravox', ['UltravoxModel', UltravoxModel]], + ['voxtral', ['VoxtralForConditionalGeneration', VoxtralForConditionalGeneration]], +]); + +const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([ + ['vision-encoder-decoder', ['VisionEncoderDecoderModel', VisionEncoderDecoderModel]], +]); + +const 
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['vit', ['ViTForImageClassification', ViTForImageClassification]], + ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]], + ['pvt', ['PvtForImageClassification', PvtForImageClassification]], + ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]], + ['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]], + ['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]], + ['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]], + ['beit', ['BeitForImageClassification', BeitForImageClassification]], + ['deit', ['DeiTForImageClassification', DeiTForImageClassification]], + ['hiera', ['HieraForImageClassification', HieraForImageClassification]], + ['convnext', ['ConvNextForImageClassification', ConvNextForImageClassification]], + ['convnextv2', ['ConvNextV2ForImageClassification', ConvNextV2ForImageClassification]], + ['dinov2', ['Dinov2ForImageClassification', Dinov2ForImageClassification]], + ['dinov2_with_registers', ['Dinov2WithRegistersForImageClassification', Dinov2WithRegistersForImageClassification]], + ['resnet', ['ResNetForImageClassification', ResNetForImageClassification]], + ['swin', ['SwinForImageClassification', SwinForImageClassification]], + ['segformer', ['SegformerForImageClassification', SegformerForImageClassification]], + ['efficientnet', ['EfficientNetForImageClassification', EfficientNetForImageClassification]], + ['mobilenet_v1', ['MobileNetV1ForImageClassification', MobileNetV1ForImageClassification]], + ['mobilenet_v2', ['MobileNetV2ForImageClassification', MobileNetV2ForImageClassification]], + ['mobilenet_v3', ['MobileNetV3ForImageClassification', MobileNetV3ForImageClassification]], + ['mobilenet_v4', ['MobileNetV4ForImageClassification', MobileNetV4ForImageClassification]], +]); + +const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([ + ['detr', ['DetrForObjectDetection', DetrForObjectDetection]], + ['rt_detr', ['RTDetrForObjectDetection', RTDetrForObjectDetection]], + ['rt_detr_v2', ['RTDetrV2ForObjectDetection', RTDetrV2ForObjectDetection]], + ['rf_detr', ['RFDetrForObjectDetection', RFDetrForObjectDetection]], + ['d_fine', ['DFineForObjectDetection', DFineForObjectDetection]], + ['table-transformer', ['TableTransformerForObjectDetection', TableTransformerForObjectDetection]], + ['yolos', ['YolosForObjectDetection', YolosForObjectDetection]], +]); + +const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([ + ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]], + ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]], + ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]], +]); + +const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([ + // TODO: Do not add new models here + ['detr', ['DetrForSegmentation', DetrForSegmentation]], + ['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]], +]); + +const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([ + ['segformer', ['SegformerForSemanticSegmentation', SegformerForSemanticSegmentation]], + ['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]], + + ['swin', ['SwinForSemanticSegmentation', SwinForSemanticSegmentation]], + ['mobilenet_v1', ['MobileNetV1ForSemanticSegmentation', MobileNetV1ForSemanticSegmentation]], + ['mobilenet_v2', ['MobileNetV2ForSemanticSegmentation', 
MobileNetV2ForSemanticSegmentation]], + ['mobilenet_v3', ['MobileNetV3ForSemanticSegmentation', MobileNetV3ForSemanticSegmentation]], + ['mobilenet_v4', ['MobileNetV4ForSemanticSegmentation', MobileNetV4ForSemanticSegmentation]], +]); + +const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([ + ['detr', ['DetrForSegmentation', DetrForSegmentation]], + ['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]], +]); + +const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([ + ['sam', ['SamModel', SamModel]], + ['sam2', ['Sam2Model', Sam2Model]], + ['edgetam', ['EdgeTamModel', EdgeTamModel]], + ['sam3_tracker', ['Sam3TrackerModel', Sam3TrackerModel]], +]); + +const MODEL_FOR_CTC_MAPPING_NAMES = new Map([ + ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]], + ['wav2vec2-bert', ['Wav2Vec2BertForCTC', Wav2Vec2BertForCTC]], + ['unispeech', ['UniSpeechForCTC', UniSpeechForCTC]], + ['unispeech-sat', ['UniSpeechSatForCTC', UniSpeechSatForCTC]], + ['wavlm', ['WavLMForCTC', WavLMForCTC]], + ['hubert', ['HubertForCTC', HubertForCTC]], + ['parakeet_ctc', ['ParakeetForCTC', ParakeetForCTC]], +]); + +const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['wav2vec2', ['Wav2Vec2ForSequenceClassification', Wav2Vec2ForSequenceClassification]], + ['wav2vec2-bert', ['Wav2Vec2BertForSequenceClassification', Wav2Vec2BertForSequenceClassification]], + ['unispeech', ['UniSpeechForSequenceClassification', UniSpeechForSequenceClassification]], + ['unispeech-sat', ['UniSpeechSatForSequenceClassification', UniSpeechSatForSequenceClassification]], + ['wavlm', ['WavLMForSequenceClassification', WavLMForSequenceClassification]], + ['hubert', ['HubertForSequenceClassification', HubertForSequenceClassification]], + ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]], +]); + +const MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = new Map([['wavlm', ['WavLMForXVector', WavLMForXVector]]]); + +const MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = new Map([ + ['unispeech-sat', ['UniSpeechSatForAudioFrameClassification', UniSpeechSatForAudioFrameClassification]], + ['wavlm', ['WavLMForAudioFrameClassification', WavLMForAudioFrameClassification]], + ['wav2vec2', ['Wav2Vec2ForAudioFrameClassification', Wav2Vec2ForAudioFrameClassification]], + ['pyannote', ['PyAnnoteForAudioFrameClassification', PyAnnoteForAudioFrameClassification]], +]); + +const MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([ + ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]], +]); + +const MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = new Map([ + ['patchtst', ['PatchTSTForPrediction', PatchTSTForPrediction]], + ['patchtsmixer', ['PatchTSMixerForPrediction', PatchTSMixerForPrediction]], +]); + +const MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = new Map([ + ['swin2sr', ['Swin2SRForImageSuperResolution', Swin2SRForImageSuperResolution]], +]); + +const MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = new Map([ + ['dpt', ['DPTForDepthEstimation', DPTForDepthEstimation]], + ['depth_anything', ['DepthAnythingForDepthEstimation', DepthAnythingForDepthEstimation]], + ['glpn', ['GLPNForDepthEstimation', GLPNForDepthEstimation]], + ['sapiens', ['SapiensForDepthEstimation', SapiensForDepthEstimation]], + ['depth_pro', ['DepthProForDepthEstimation', DepthProForDepthEstimation]], + ['metric3d', ['Metric3DForDepthEstimation', Metric3DForDepthEstimation]], + ['metric3dv2', ['Metric3Dv2ForDepthEstimation', Metric3Dv2ForDepthEstimation]], +]); + +const 
MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES = new Map([ + ['sapiens', ['SapiensForNormalEstimation', SapiensForNormalEstimation]], +]); + +const MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES = new Map([ + ['vitpose', ['VitPoseForPoseEstimation', VitPoseForPoseEstimation]], +]); + +// NOTE: This is custom to Transformers.js, and is necessary because certain models +// (e.g., CLIP) are split into vision and text components +const MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = new Map([ + ['clip', ['CLIPVisionModelWithProjection', CLIPVisionModelWithProjection]], + ['siglip', ['SiglipVisionModel', SiglipVisionModel]], + ['jina_clip', ['JinaCLIPVisionModel', JinaCLIPVisionModel]], +]); + +const MODEL_CLASS_TYPE_MAPPING = [ + // MODEL_MAPPING_NAMES: + [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES.EncoderOnly], + [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES.EncoderDecoder], + [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES.DecoderOnly], + [MODEL_MAPPING_NAMES_AUTO_ENCODER, MODEL_TYPES.AutoEncoder], + + [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES.DecoderOnly], + [MODEL_FOR_MULTIMODALITY_MAPPING_NAMES, MODEL_TYPES.MultiModality], + [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES.Vision2Seq], + [MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText], + [MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.AudioTextToText], + [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES.MaskGeneration], + [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq], + [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], + + // Custom: + [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly], +]; + +for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) { + // @ts-ignore + for (const [name, model] of mappings.values()) { + MODEL_TYPE_MAPPING.set(name, type); + MODEL_CLASS_TO_NAME_MAPPING.set(model, name); + MODEL_NAME_TO_CLASS_MAPPING.set(name, model); + } +} + +const 
CUSTOM_MAPPING = [ + // OVERRIDE: + // TODO: Refactor to allow class to specify model + ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen], + ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V], + + ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly], + ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly], + ['JinaCLIPTextModel', JinaCLIPTextModel, MODEL_TYPES.EncoderOnly], + ['ClapTextModelWithProjection', ClapTextModelWithProjection, MODEL_TYPES.EncoderOnly], + ['ClapAudioModelWithProjection', ClapAudioModelWithProjection, MODEL_TYPES.EncoderOnly], + + ['DacEncoderModel', DacEncoderModel, MODEL_TYPES.EncoderOnly], + ['DacDecoderModel', DacDecoderModel, MODEL_TYPES.EncoderOnly], + ['MimiEncoderModel', MimiEncoderModel, MODEL_TYPES.EncoderOnly], + ['MimiDecoderModel', MimiDecoderModel, MODEL_TYPES.EncoderOnly], + ['SnacEncoderModel', SnacEncoderModel, MODEL_TYPES.EncoderOnly], + ['SnacDecoderModel', SnacDecoderModel, MODEL_TYPES.EncoderOnly], + + ['Gemma3nForConditionalGeneration', Gemma3nForConditionalGeneration, MODEL_TYPES.ImageAudioTextToText], + ['SupertonicForConditionalGeneration', SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic], + ['ChatterboxModel', ChatterboxModel, MODEL_TYPES.Chatterbox], +]; +for (const [name, model, type] of CUSTOM_MAPPING) { + MODEL_TYPE_MAPPING.set(name, type); + MODEL_CLASS_TO_NAME_MAPPING.set(model, name); + MODEL_NAME_TO_CLASS_MAPPING.set(name, model); +} + +const CUSTOM_ARCHITECTURES = new Map([ + ['modnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], + ['birefnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], + ['isnet', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], + ['ben', MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES], +]); +for (const [name, mapping] of CUSTOM_ARCHITECTURES.entries()) { + mapping.set(name, ['PreTrainedModel', PreTrainedModel]); + MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly); + MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, name); + MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel); +} + +export { + CUSTOM_ARCHITECTURES, + MODEL_CLASS_TYPE_MAPPING, + MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, + MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, + MODEL_FOR_MASKED_LM_MAPPING_NAMES, + MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, + MODEL_FOR_CTC_MAPPING_NAMES, + MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, + MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, + MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES, + MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, + MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES, + MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, + MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES, +}; + +export * from './pre-trained-models/index.js'; diff --git a/src/models/session.js b/src/models/session.js new file mode 100644 index 000000000..ecf942946 --- /dev/null +++ b/src/models/session.js 
@@ -0,0 +1,244 @@ +import { + createInferenceSession, + deviceToExecutionProviders, + isONNXProxy, + runInferenceSession, +} from '../backends/onnx.js'; +import { getCacheShapes } from '../configs.js'; +import { + DATA_TYPES, + DEFAULT_DEVICE_DTYPE_MAPPING, + DEFAULT_DTYPE_SUFFIX_MAPPING, + isWebGpuFp16Supported, +} from '../utils/dtypes.js'; +import { apis } from '../env.js'; +import { replaceTensors } from '../utils/tensor.js'; +import { validateInputs } from './utils.js'; +import { getCoreModelFile, getModelDataFiles } from '../utils/model-loader.js'; + +/** + * Constructs an InferenceSession using a model file located at the specified path. + * @param {string} pretrained_model_name_or_path The path to the directory containing the model file. + * @param {string} fileName The name of the model file. + * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the model. + * @param {boolean} [is_decoder=false] Whether the model is a decoder model. + * @returns {Promise<{buffer_or_path: Uint8Array|string, session_options: Object, session_config: Object}>} A Promise that resolves to the data needed to create an InferenceSession object. + * @private + */ +async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) { + let custom_config = options.config?.['transformers.js_config'] ?? {}; + + let device = options.device ?? custom_config.device; + if (device && typeof device !== 'string') { + if (device.hasOwnProperty(fileName)) { + device = device[fileName]; + } else { + console.warn(`device not specified for "${fileName}". Using the default device.`); + device = null; + } + } + + // If the device is not specified, we use the default (supported) execution providers. + const selectedDevice = /** @type {import("../utils/devices.js").DeviceType} */ ( + device ?? (apis.IS_NODE_ENV ? 'cpu' : 'wasm') + ); + + const executionProviders = deviceToExecutionProviders(selectedDevice); + + // Update custom config with the selected device's config, if it exists + const device_config = custom_config.device_config ?? {}; + if (device_config.hasOwnProperty(selectedDevice)) { + custom_config = { + ...custom_config, + ...device_config[selectedDevice], + }; + } + + // If options.dtype is specified, we use it to choose the suffix for the model file. + // Otherwise, we use the default dtype for the device. + let dtype = options.dtype ?? custom_config.dtype; + if (typeof dtype !== 'string') { + if (dtype && dtype.hasOwnProperty(fileName)) { + dtype = dtype[fileName]; + } else { + dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? DATA_TYPES.fp32; + console.warn( + `dtype not specified for "${fileName}". Using the default dtype (${dtype}) for this device (${selectedDevice}).`, + ); + } + } + + if (dtype === DATA_TYPES.auto) { + // Try to choose the auto dtype based on the custom config + let config_dtype = custom_config.dtype; + if (typeof config_dtype !== 'string') { + config_dtype = config_dtype?.[fileName]; + } + + if (config_dtype && config_dtype !== DATA_TYPES.auto && DATA_TYPES.hasOwnProperty(config_dtype)) { + // Defined by the config, and is not "auto" + dtype = config_dtype; + } else { + // Choose default dtype based on device, falling back to fp32 + dtype = DEFAULT_DEVICE_DTYPE_MAPPING[selectedDevice] ?? 
DATA_TYPES.fp32; + } + } + + const selectedDtype = /** @type {import("../utils/dtypes.js").DataType} */ (dtype); + + if (!DEFAULT_DTYPE_SUFFIX_MAPPING.hasOwnProperty(selectedDtype)) { + throw new Error(`Invalid dtype: ${selectedDtype}. Should be one of: ${Object.keys(DATA_TYPES).join(', ')}`); + } else if ( + selectedDevice === 'webgpu' && + // NOTE: Currently, we assume that the Native WebGPU EP always supports fp16. In future, we will add a check for this. + !apis.IS_NODE_ENV && + selectedDtype === DATA_TYPES.fp16 && + !(await isWebGpuFp16Supported()) + ) { + throw new Error(`The device (${selectedDevice}) does not support fp16.`); + } + + // Only valid for models with a decoder + const kv_cache_dtype_config = custom_config.kv_cache_dtype; + const kv_cache_dtype = kv_cache_dtype_config + ? typeof kv_cache_dtype_config === 'string' + ? kv_cache_dtype_config + : (kv_cache_dtype_config[selectedDtype] ?? 'float32') + : undefined; + + if (kv_cache_dtype && !['float32', 'float16'].includes(kv_cache_dtype)) { + throw new Error(`Invalid kv_cache_dtype: ${kv_cache_dtype}. Should be one of: float32, float16`); + } + + const session_config = { + dtype: selectedDtype, + kv_cache_dtype, + device: selectedDevice, + }; + + // Construct the model file suffix + const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[selectedDtype]; + + const session_options = { ...options.session_options }; + + // Overwrite `executionProviders` if not specified + session_options.executionProviders ??= executionProviders; + + // Overwrite `freeDimensionOverrides` if specified in config and not set in session options + const free_dimension_overrides = custom_config.free_dimension_overrides; + if (free_dimension_overrides) { + session_options.freeDimensionOverrides ??= free_dimension_overrides; + } else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) { + console.warn( + `WebNN does not currently support dynamic shapes and requires 'free_dimension_overrides' to be set in config.json, preferably as a field within config["transformers.js_config"]["device_config"]["${selectedDevice}"]. ` + + `When 'free_dimension_overrides' is not set, you may experience significant performance degradation.`, + ); + } + + const bufferOrPathPromise = getCoreModelFile(pretrained_model_name_or_path, fileName, options, suffix); + + // Handle onnx external data files + const use_external_data_format = options.use_external_data_format ?? custom_config.use_external_data_format; + const externalData = await getModelDataFiles( + pretrained_model_name_or_path, + fileName, + suffix, + options, + use_external_data_format, + session_options, + ); + + if (externalData.length > 0 && !apis.IS_NODE_ENV) { + session_options.externalData = externalData; + } + + if (is_decoder && selectedDevice === 'webgpu' && kv_cache_dtype_config !== false) { + const shapes = getCacheShapes(options.config, { + prefix: 'present', + }); + if (Object.keys(shapes).length > 0 && !isONNXProxy()) { + // Only set preferredOutputLocation if shapes are present and we aren't proxying ONNX + /** @type {Record} */ + const preferredOutputLocation = {}; + for (const key in shapes) { + preferredOutputLocation[key] = 'gpu-buffer'; + } + session_options.preferredOutputLocation = preferredOutputLocation; + } + } + + const buffer_or_path = await bufferOrPathPromise; + + return { buffer_or_path, session_options, session_config }; +} + +/** + * Helper function to create multiple InferenceSession objects. 
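+ * Sessions for all listed model files are created concurrently; the entry whose name matches `decoder_name` is treated as the decoder and may receive additional KV-cache session options (e.g. GPU-resident outputs on WebGPU).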
+ *
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the model file.
+ * @param {Record<string, string>} names The names of the model files to load.
+ * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the model.
+ * @param {string} [decoder_name] The name of the decoder model, if any.
+ * @returns {Promise<Record<string, any>>} A Promise that resolves to a dictionary of InferenceSession objects.
+ * @private
+ */
+export async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = undefined) {
+    return Object.fromEntries(
+        await Promise.all(
+            Object.keys(names).map(async (name) => {
+                const { buffer_or_path, session_options, session_config } = await getSession(
+                    pretrained_model_name_or_path,
+                    names[name],
+                    options,
+                    name === decoder_name,
+                );
+                const session = await createInferenceSession(buffer_or_path, session_options, session_config);
+                return [name, session];
+            }),
+        ),
+    );
+}
+
+/**
+ * Executes an InferenceSession using the specified inputs.
+ * NOTE: `inputs` must contain at least the input names of the model.
+ * - If additional inputs are passed, they will be ignored.
+ * - If inputs are missing, an error will be thrown.
+ *
+ * @param {Object} session The InferenceSession object to run.
+ * @param {Object} inputs An object that maps input names to input tensors.
+ * @returns {Promise<Object>} A Promise that resolves to an object that maps output names to output tensors.
+ * @private
+ */
+export async function sessionRun(session, inputs) {
+    const checkedInputs = validateInputs(session, inputs);
+    try {
+        // pass the original ort tensor
+        const ortFeed = Object.fromEntries(Object.entries(checkedInputs).map(([k, v]) => [k, v.ort_tensor]));
+        const output = await runInferenceSession(session, ortFeed);
+        return replaceTensors(output);
+    } catch (e) {
+        // Error messages can be long (nested) and uninformative. For this reason,
+        // we apply minor formatting to show the most important information
+        const formatted = Object.fromEntries(
+            Object.entries(checkedInputs).map(([k, tensor]) => {
+                // Extract these properties from the underlying ORT tensor
+                const unpacked = {
+                    type: tensor.type,
+                    dims: tensor.dims,
+                    location: tensor.location,
+                };
+                if (unpacked.location !== 'gpu-buffer') {
+                    // Only return the data if it's not a GPU buffer
+                    unpacked.data = tensor.data;
+                }
+                return [k, unpacked];
+            }),
+        );
+
+        // This usually occurs when the inputs are of the wrong type.
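+        // (for example, a float32 tensor passed where the model expects int64 token ids).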
+ console.error(`An error occurred during model execution: "${e}".`); + console.error('Inputs given to model:', formatted); + throw e; + } +} diff --git a/src/models/siglip/image_processing_siglip.js b/src/models/siglip/image_processing_siglip.js deleted file mode 100644 index 4651b9c73..000000000 --- a/src/models/siglip/image_processing_siglip.js +++ /dev/null @@ -1,3 +0,0 @@ -import { ImageProcessor } from '../../base/image_processors_utils.js'; - -export class SiglipImageProcessor extends ImageProcessor {} diff --git a/src/models/speecht5/feature_extraction_speecht5.js b/src/models/speecht5/feature_extraction_speecht5.js deleted file mode 100644 index 881f9b9ad..000000000 --- a/src/models/speecht5/feature_extraction_speecht5.js +++ /dev/null @@ -1,3 +0,0 @@ -import { FeatureExtractor } from '../../base/feature_extraction_utils.js'; - -export class SpeechT5FeatureExtractor extends FeatureExtractor {} diff --git a/src/models/utils.js b/src/models/utils.js new file mode 100644 index 000000000..6912136d2 --- /dev/null +++ b/src/models/utils.js @@ -0,0 +1,573 @@ +// JS doesn't support mixins, so we define some reused functions here, and allow "this" to be passed in +import { pick } from '../utils/core.js'; +import { cat, full_like, ones, Tensor, toI64Tensor, zeros_like, boolTensor, full } from '../utils/tensor.js'; +import { max } from '../utils/maths.js'; +import { sessionRun } from './session.js'; +import { getModelJSON } from '../utils/hub.js'; +import { isONNXProxy } from '../backends/onnx.js'; +import { Seq2SeqLMOutput } from './output.js'; + +/** + * Perform forward pass on the seq2seq model (both encoder and decoder). + * @param {Object} self The seq2seq model object. + * @param {Object} model_inputs The input object for the model containing encoder and decoder inputs. + * @returns {Promise} Promise that resolves with the output of the seq2seq model. + * @private + */ +export async function seq2seqForward(self, model_inputs) { + let { encoder_outputs, input_ids, decoder_input_ids, ...other_decoder_inputs } = model_inputs; + // Encode if needed + if (!encoder_outputs) { + const encoder_inputs = pick(model_inputs, self.sessions['model'].inputNames); + // Encoder outputs are not given, so we must compute them. + encoder_outputs = (await encoderForward(self, encoder_inputs)).last_hidden_state; + } + + other_decoder_inputs.input_ids = decoder_input_ids; + other_decoder_inputs.encoder_hidden_states = encoder_outputs; + + if (self.sessions['decoder_model_merged'].inputNames.includes('encoder_attention_mask')) { + other_decoder_inputs.encoder_attention_mask = model_inputs.attention_mask; + } + + return await decoderForward(self, other_decoder_inputs, true); +} + +/** + * Forward pass of an encoder model. + * @param {Object} self The encoder model. + * @param {Object} model_inputs The input data to be used for the forward pass. + * @returns {Promise} The model's outputs. 
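+ * If the session also expects `inputs_embeds`, `token_type_ids`, or `pixel_mask` and they were not provided, defaults are derived from `input_ids`/`pixel_values` before the session is run.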
+ * @private + */ +export async function encoderForward(self, model_inputs) { + const session = self.sessions['model']; + const encoderFeeds = pick(model_inputs, session.inputNames); + + if (session.inputNames.includes('inputs_embeds') && !encoderFeeds.inputs_embeds) { + if (!model_inputs.input_ids) { + throw new Error('Both `input_ids` and `inputs_embeds` are missing in the model inputs.'); + } + encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids }); + } + if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) { + if (!encoderFeeds.input_ids) { + throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.'); + } + // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it, + // but they weren't created by the tokenizer. + encoderFeeds.token_type_ids = zeros_like(encoderFeeds.input_ids); + } + if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) { + if (!encoderFeeds.pixel_values) { + throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.'); + } + // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it, + // but they weren't created by the processor. + const dims = encoderFeeds.pixel_values.dims; + encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]); + } + + return await sessionRun(session, encoderFeeds); +} + +export async function autoEncoderForward(self, model_inputs) { + const encoded = await self.encode(model_inputs); + const decoded = await self.decode(encoded); + return decoded; +} + +/** + * Forward pass of a decoder model. + * @param {Object} self The decoder model. + * @param {Object} model_inputs The input data to be used for the forward pass. + * @returns {Promise} The logits and past key values. + * @private + */ +export async function decoderForward(self, model_inputs, is_encoder_decoder = false) { + const session = self.sessions[is_encoder_decoder ? 'decoder_model_merged' : 'model']; + + const { past_key_values, ...new_model_inputs } = model_inputs; + + if (session.inputNames.includes('use_cache_branch')) { + new_model_inputs.use_cache_branch = boolTensor(!!past_key_values); + } + if ( + session.inputNames.includes('position_ids') && + new_model_inputs.attention_mask && + !new_model_inputs.position_ids + ) { + // NOTE: Handle a special case for paligemma/gemma3 models, where positions are 1-indexed + const start_index = ['paligemma', 'gemma3_text', 'gemma3'].includes(self.config.model_type) ? 1 : 0; + new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index); + } + + // Unpack the `past_key_values` object into model inputs + self.addPastKeyValues(new_model_inputs, past_key_values); + + // Select only the inputs that are needed for the current session + const fixed = pick(new_model_inputs, session.inputNames); + return await sessionRun(session, fixed); +} + +/** + * Abstract forward pass function for image-text-to-text or audio-text-to-text models. + * @param {Object} self The model object. + * @param {Object} params Additional parameters. + * @param {Function} [params.encode_function] The function to encode the modality values. + * @param {Function} [params.merge_function] The function to merge the modality features with the input embeddings. + * @param {string} [params.modality_input_name] The modality input name. + * @param {string} [params.modality_output_name] The modality output name. 
+ * @param {Tensor} [params.input_ids=null] + * @param {Tensor} [params.attention_mask=null] + * @param {Tensor} [params.position_ids=null] + * @param {Tensor} [params.inputs_embeds=null] + * @param {Tensor} [params.past_key_values=null] + * @param {Object} [params.generation_config=null] + * @param {Object} [params.logits_processor=null] + * @returns {Promise} The model's output tensor + * @private + */ +export async function genericTextToTextForward( + self, + { + // Generic parameters: + encode_function, + merge_function, + modality_input_name, + modality_output_name, + + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // Additional parameters + ...kwargs + }, +) { + const modality_values = kwargs[modality_input_name]; + if (!inputs_embeds) { + // 1. Extract the text embeddings. + inputs_embeds = await self.encode_text({ input_ids, ...kwargs }); + + // 2. Possibly, merge text and modality values + if (modality_values && input_ids.dims[1] !== 1) { + const modality_features = await encode_function({ + // Pass the modality values under its expected key. + // The caller knows whether this is audio or image. + [modality_input_name]: modality_values, + ...kwargs, + }); + ({ inputs_embeds, attention_mask } = merge_function({ + [modality_output_name]: modality_features, + inputs_embeds, + input_ids, + attention_mask, + })); + } else if (past_key_values && modality_values && input_ids.dims[1] === 1) { + // This branch handles the cache case. + const target_length = input_ids.dims[1]; // always 1 + const past_length = Object.values(past_key_values)[0].dims.at(-2); + + attention_mask = cat( + [ + ones([input_ids.dims[0], past_length]), + attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]]), + ], + 1, + ); + } + } + + if (!position_ids) { + if (self.config.model_type === 'qwen2_vl') { + // Special case for qwen2_vl models + // @ts-ignore + const { image_grid_thw, video_grid_thw } = kwargs; + [position_ids] = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask); + } + } + + // 3. Call the decoder forward using the updated inputs. + const outputs = await decoderForward( + self, + { + inputs_embeds, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, + true, + ); + return outputs; +} + +/** + * Forward pass of an audio-text-to-text model. + * @param {Object} self The audio-text-to-text model. + * @param {Object} params The inputs for the audio-text-to-text forward pass. + * @returns {Promise} The model's output tensor. + * @private + */ +export async function audioTextToTextForward(self, params) { + return await genericTextToTextForward(self, { + ...params, + modality_input_name: 'audio_values', + modality_output_name: 'audio_features', + encode_function: self.encode_audio.bind(self), + merge_function: self._merge_input_ids_with_audio_features.bind(self), + }); +} + +/** + * Forward pass of an image-text-to-text model. + * @param {Object} self The image-text-to-text model. + * @param {Object} params The inputs for the image-text-to-text forward pass. + * @returns {Promise} The model's output tensor. 
+ * @private + */ +export async function imageTextToTextForward(self, params) { + return await genericTextToTextForward(self, { + ...params, + modality_input_name: 'pixel_values', + modality_output_name: 'image_features', + encode_function: self.encode_image.bind(self), + merge_function: self._merge_input_ids_with_image_features.bind(self), + }); +} + +/** + * Helper function to perform the following: + * ```python + * x = attention_mask.long().cumsum(-1) - 1 + * x.masked_fill_(attention_mask == 0, 1) + * ``` + * @param {Tensor} attention_mask + * @returns {{data: BigInt64Array, dims: number[]}} + */ +export function cumsum_masked_fill(attention_mask, start_index = 0) { + const [bz, seq_len] = attention_mask.dims; + const attn_mask_data = attention_mask.data; + + const data = new BigInt64Array(attn_mask_data.length); + for (let i = 0; i < bz; ++i) { + const start = i * seq_len; + let sum = BigInt(start_index); + for (let j = 0; j < seq_len; ++j) { + const index = start + j; + if (attn_mask_data[index] === 0n) { + data[index] = BigInt(1); + } else { + // === 1n + data[index] = sum; + sum += attn_mask_data[index]; + } + } + } + return { data, dims: attention_mask.dims }; +} + +/** + * If the model supports providing position_ids, we create position_ids on the fly for batch generation, + * by computing the cumulative sum of the attention mask along the sequence length dimension. + * + * Equivalent to: + * ```python + * position_ids = attention_mask.long().cumsum(-1) - 1 + * position_ids.masked_fill_(attention_mask == 0, 1) + * if past_key_values: + * position_ids = position_ids[:, -input_ids.shape[1] :] + * ``` + */ +export function createPositionIds(model_inputs, past_key_values = null, start_index = 0) { + const { input_ids, inputs_embeds, attention_mask } = model_inputs; + + const { data, dims } = cumsum_masked_fill(attention_mask, start_index); + let position_ids = new Tensor('int64', data, dims); + if (past_key_values) { + const offset = -(input_ids ?? inputs_embeds).dims.at(1); + position_ids = position_ids.slice(null, [offset, null]); + } + return position_ids; +} + +export function decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + const past_length = model_inputs.past_key_values ? Object.values(model_inputs.past_key_values)[0].dims.at(-2) : 0; + + if (!model_inputs.attention_mask) { + // If the attention mask is not provided, we attempt to infer based on provided inputs + let dims; + for (const key of ['input_ids', 'inputs_embeds', 'position_ids']) { + if (model_inputs[key]) { + dims = model_inputs[key].dims; + break; + } + } + if (!dims) { + throw new Error('attention_mask is not provided, and unable to infer its shape from model inputs.'); + } + model_inputs.attention_mask = ones([dims[0], past_length + dims[1]]); + } + + if (model_inputs.past_key_values) { + const { input_ids, attention_mask } = model_inputs; + + // Keep only the unprocessed tokens: + // 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + // some of the inputs are exclusively passed as part of the cache (e.g. 
when passing input_embeds as + // input) + if (attention_mask && attention_mask.dims[1] > input_ids.dims[1]) { + // NOTE: not needed since we only pass the generated tokens to the next forward pass + // const offset = -(attention_mask.dims[1] - past_length); + // model_inputs.input_ids = input_ids.slice(null, [offset, null]); + } + // 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. + // We can discard input_ids based on the past_length. + else if (past_length < input_ids.dims[1]) { + // NOTE: Required for phi models. + // See https://github.com/huggingface/transformers/issues/30809#issuecomment-2111918479 for more information. + model_inputs.input_ids = input_ids.slice(null, [past_length, null]); + } + // 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + else { + } + } + + return model_inputs; +} + +export function encoder_decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + if (model_inputs.past_key_values) { + input_ids = input_ids.map((x) => [x.at(-1)]); + } + + return { + ...model_inputs, + decoder_input_ids: toI64Tensor(input_ids), + }; +} + +export function multimodal_text_to_text_prepare_inputs_for_generation(self, ...args) { + if (self.config.is_encoder_decoder) { + return encoder_decoder_prepare_inputs_for_generation(self, ...args); + } else { + return decoder_prepare_inputs_for_generation(self, ...args); + } +} + +export function multimodality_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + const has_past_key_values = !!model_inputs.past_key_values; + + if (generation_config.guidance_scale !== null && generation_config.guidance_scale > 1) { + if (has_past_key_values) { + model_inputs.input_ids = cat([model_inputs.input_ids, model_inputs.input_ids], 0); + // NOTE: attention_mask handled in generation + } else { + model_inputs.input_ids = cat( + [model_inputs.input_ids, full_like(model_inputs.input_ids, BigInt(generation_config.pad_token_id))], + 0, + ); + model_inputs.attention_mask = cat( + [model_inputs.attention_mask, full_like(model_inputs.attention_mask, 0n)], + 0, + ); + } + } + + if (has_past_key_values || !model_inputs.pixel_values) { + model_inputs.pixel_values = full([0, 0, 3, 384, 384], 1.0); + } + + if (has_past_key_values) { + const num_img_tokens = 0; + const num_text_tokens = 1; + const has_image = num_img_tokens > 0 ? 
1 : 0; + + const batch_size = 1; + model_inputs.images_seq_mask = new Tensor( + 'bool', + new Array(num_img_tokens + num_text_tokens).fill(true).fill(false, 0, num_text_tokens), + [batch_size, num_img_tokens + num_text_tokens], + ); + model_inputs.images_emb_mask = new Tensor('bool', new Array(num_img_tokens).fill(!!has_image), [ + batch_size, + 1, + num_img_tokens, + ]); + } + return model_inputs; +} + +export function chatterbox_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config) { + if (!model_inputs.position_ids && self.sessions['embed_tokens'].inputNames.includes('position_ids')) { + // If position_ids are not provided, we create them on the fly using the position of the START_SPEECH_TOKEN + const START_SPEECH_TOKEN = 6561; + if (model_inputs.input_ids.dims[1] === 1) { + const position_ids = Array.from( + { + length: input_ids.length, + }, + (_, i) => input_ids[i].length - input_ids[i].findLastIndex((x) => x == START_SPEECH_TOKEN) - 1, + ); + model_inputs.position_ids = new Tensor('int64', position_ids, [input_ids.length, 1]); + } else { + const batched_input_ids = model_inputs.input_ids.tolist(); + const position_ids_list = batched_input_ids.map((ids) => { + let position = 0; + return ids.map((id) => (id >= START_SPEECH_TOKEN ? 0 : position++)); + }); + model_inputs.position_ids = new Tensor('int64', position_ids_list.flat(), model_inputs.input_ids.dims); + } + } + if (model_inputs.input_ids.dims[1] === 1) { + // We are in generation mode and no longer need the audio inputs + delete model_inputs.audio_values; + delete model_inputs.audio_features; + delete model_inputs.audio_tokens; + delete model_inputs.speaker_embeddings; + delete model_inputs.speaker_features; + } + return decoder_prepare_inputs_for_generation(self, input_ids, model_inputs, generation_config); +} + +/** + * Validate model inputs + * @param {Object} session The InferenceSession object that will be run. + * @param {Object} inputs The inputs to check. + * @returns {Record} The checked inputs. + * @throws {Error} If any inputs are missing. + * @private + */ +export function validateInputs(session, inputs) { + /** + * NOTE: Create either a shallow or deep copy based on `onnx.wasm.proxy` + * @type {Record} + */ + const checkedInputs = Object.create(null); + const missingInputs = []; + for (const inputName of session.inputNames) { + const tensor = inputs[inputName]; + // Rare case where one of the model's input names corresponds to a built-in + // object name (e.g., toString), which would cause a simple (!tensor) check to fail, + // because it's not undefined but a function. + if (!(tensor instanceof Tensor)) { + missingInputs.push(inputName); + continue; + } + // NOTE: When `env.wasm.proxy is true` the tensor is moved across the Worker + // boundary, transferring ownership to the worker and invalidating the tensor. + // So, in this case, we simply sacrifice a clone for it. + checkedInputs[inputName] = isONNXProxy() ? tensor.clone() : tensor; + } + if (missingInputs.length > 0) { + throw new Error( + `An error occurred during model execution: "Missing the following inputs: ${missingInputs.join(', ')}.`, + ); + } + + const numInputsProvided = Object.keys(inputs).length; + const numInputsNeeded = session.inputNames.length; + if (numInputsProvided > numInputsNeeded) { + // No missing inputs, but too many inputs were provided. + // Warn the user and ignore the extra inputs. 
+ let ignored = Object.keys(inputs).filter((inputName) => !session.inputNames.includes(inputName));
+ console.warn(
+ `WARNING: Too many inputs were provided (${numInputsProvided} > ${numInputsNeeded}). The following inputs will be ignored: "${ignored.join(', ')}".`,
+ );
+ }
+
+ return checkedInputs;
+}
+
+export function default_merge_input_ids_with_features({
+ modality_token_id,
+ inputs_embeds,
+ modality_features,
+ input_ids,
+ attention_mask,
+}) {
+ const token_positions = input_ids.tolist().map((ids) =>
+ ids.reduce((acc, x, idx) => {
+ if (x == modality_token_id) acc.push(idx);
+ return acc;
+ }, []),
+ );
+ const n_tokens = token_positions.reduce((acc, x) => acc + x.length, 0);
+ const n_features = modality_features.dims[0];
+ if (n_tokens !== n_features) {
+ throw new Error(`Number of tokens and features do not match: tokens: ${n_tokens}, features ${n_features}`);
+ }
+
+ // Equivalent to performing a masked_scatter
+ let img = 0;
+ for (let i = 0; i < token_positions.length; ++i) {
+ const tokens = token_positions[i];
+ const embeds = inputs_embeds[i];
+ for (let j = 0; j < tokens.length; ++j) {
+ embeds[tokens[j]].data.set(modality_features[img++].data);
+ }
+ }
+ return { inputs_embeds, attention_mask };
+}
+
+export function default_merge_input_ids_with_image_features({
+ image_token_id,
+ inputs_embeds,
+ image_features,
+ input_ids,
+ attention_mask,
+}) {
+ return default_merge_input_ids_with_features({
+ modality_token_id: image_token_id,
+ inputs_embeds,
+ modality_features: image_features,
+ input_ids,
+ attention_mask,
+ });
+}
+
+export function default_merge_input_ids_with_audio_features({
+ audio_token_id,
+ inputs_embeds,
+ audio_features,
+ input_ids,
+ attention_mask,
+}) {
+ return default_merge_input_ids_with_features({
+ modality_token_id: audio_token_id,
+ inputs_embeds,
+ modality_features: audio_features,
+ input_ids,
+ attention_mask,
+ });
+}
+
+/**
+ * Helper function to load multiple optional configuration files
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the config file.
+ * @param {Record<string, string>} names The names of the config files to load.
+ * @param {import('../utils/hub.js').PretrainedModelOptions} options Additional options for loading the configs.
+ * @returns {Promise<Record<string, any>>} A Promise that resolves to a dictionary of configuration objects.
+ * @private + */ +export async function getOptionalConfigs(pretrained_model_name_or_path, names, options) { + return Object.fromEntries( + await Promise.all( + Object.keys(names).map(async (name) => { + const config = await getModelJSON(pretrained_model_name_or_path, names[name], false, options); + return [name, config]; + }), + ), + ); +} diff --git a/src/pipelines.js b/src/pipelines.js index 02392cf9d..78adf6b76 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -14,7 +14,7 @@ */ import { AutoTokenizer } from './tokenizers.js'; -import { AutoProcessor } from './models/auto/processing_auto.js'; +import { AutoProcessor } from './models/model-processors/auto/processing_auto.js'; import { AutoModel, AutoModelForSequenceClassification, diff --git a/src/pipelines/_base.js b/src/pipelines/_base.js index 4011ba33c..a8aacc8ea 100644 --- a/src/pipelines/_base.js +++ b/src/pipelines/_base.js @@ -1,5 +1,5 @@ import { PreTrainedTokenizer } from '../tokenizers.js'; -import { PreTrainedModel } from '../models.js'; +import { PreTrainedModel } from '../models/pre-trained-model.js'; import { Processor } from '../base/processing_utils.js'; import { Callable } from '../utils/generic.js'; diff --git a/src/tokenizers.js b/src/tokenizers.js index 0e69372d5..daf596312 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -31,7 +31,7 @@ import { PriorityQueue, TokenLattice, CharTrie, DictionarySplitter, LRUCache } f import { Template } from '@huggingface/jinja'; -import { WHISPER_LANGUAGE_MAPPING } from './models/whisper/common_whisper.js'; +import { WHISPER_LANGUAGE_MAPPING } from './models/model-processors/whisper/common_whisper.js'; /** * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties. diff --git a/src/transformers.js b/src/transformers.js index 2212cb251..65bb98c39 100644 --- a/src/transformers.js +++ b/src/transformers.js @@ -25,16 +25,16 @@ export * from './utils/tensor.js'; export * from './utils/maths.js'; export { FeatureExtractor } from './base/feature_extraction_utils.js'; -export * from './models/feature_extractors.js'; -export * from './models/auto/feature_extraction_auto.js'; +export * from './models/feature-extractors.js'; +export * from './models/model-processors/auto/feature_extraction_auto.js'; export { ImageProcessor } from './base/image_processors_utils.js'; -export * from './models/image_processors.js'; -export * from './models/auto/image_processing_auto.js'; +export * from './models/image-processors.js'; +export * from './models/model-processors/auto/image_processing_auto.js'; export { Processor } from './base/processing_utils.js'; export * from './models/processors.js'; -export * from './models/auto/processing_auto.js'; +export * from './models/model-processors/auto/processing_auto.js'; export * from './generation/streamers.js'; export * from './generation/stopping_criteria.js'; diff --git a/src/utils/hub.js b/src/utils/hub.js index 07c4f492c..de87f9147 100755 --- a/src/utils/hub.js +++ b/src/utils/hub.js @@ -92,45 +92,19 @@ export async function getFile(urlOrPath) { } /** - * Retrieves a file from either a remote URL using the Fetch API or from the local file system using the FileSystem API. - * If the filesystem is available and `env.useCache = true`, the file will be downloaded and cached. + * Builds the resource paths and URLs for a model file. + * Can be used to get the resource URL or path without loading the file. * * @param {string} path_or_repo_id This can be either: * - a string, the *model id* of a model repo on huggingface.co. 
* - a path to a *directory* potentially containing the file. - * @param {string} filename The name of the file to locate in `path_or_repo`. - * @param {boolean} [fatal=true] Whether to throw an error if the file is not found. + * @param {string} filename The name of the file to locate. * @param {PretrainedOptions} [options] An object containing optional parameters. - * @param {boolean} [return_path=false] Whether to return the path of the file instead of the file content. - * - * @throws Will throw an error if the file is not found and `fatal` is true. - * @returns {Promise} A Promise that resolves with the file content as a Uint8Array if `return_path` is false, or the file path as a string if `return_path` is true. + * @param {import('./cache.js').CacheInterface | null} [cache] The cache instance to use for determining cache keys. + * @returns {{ requestURL: string, localPath: string, remoteURL: string, proposedCacheKey: string, validModelId: boolean }} + * An object containing all the paths and URLs for the resource. */ -export async function getModelFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false) { - if (!env.allowLocalModels) { - // User has disabled local models, so we just make sure other settings are correct. - - if (options.local_files_only) { - throw Error( - 'Invalid configuration detected: local models are disabled (`env.allowLocalModels=false`) but you have requested to only use local models (`local_files_only=true`).', - ); - } else if (!env.allowRemoteModels) { - throw Error( - 'Invalid configuration detected: both local and remote models are disabled. Fix by setting `env.allowLocalModels` or `env.allowRemoteModels` to `true`.', - ); - } - } - - // Initiate file retrieval - dispatchCallback(options.progress_callback, { - status: 'initiate', - name: path_or_repo_id, - file: filename, - }); - - /** @type {import('./cache.js').CacheInterface | null} */ - const cache = await getCache(options?.cache_dir); - +export function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) { const revision = options.revision ?? 'main'; const requestURL = pathJoin(path_or_repo_id, filename); @@ -144,8 +118,6 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti filename, ); - /** @type {string} */ - let cacheKey; const proposedCacheKey = cache instanceof FileCache ? // Choose cache key for filesystem cache @@ -156,19 +128,125 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti : pathJoin(path_or_repo_id, revision, filename) : remoteURL; + return { + requestURL, + localPath, + remoteURL, + proposedCacheKey, + validModelId, + }; +} + +/** + * Checks if a resource exists in cache. + * + * @param {import('./cache.js').CacheInterface | null} cache The cache instance to check. + * @param {string} localPath The local path to try first. + * @param {string} proposedCacheKey The proposed cache key to try second. + * @returns {Promise} + * The cached response if found, undefined otherwise. + */ +export async function checkCachedResource(cache, localPath, proposedCacheKey) { + if (!cache) { + return undefined; + } + + // A caching system is available, so we try to get the file from it. + // 1. We first try to get from cache using the local path. In some environments (like deno), + // non-URL cache keys are not allowed. In these cases, `response` will be undefined. + // 2. If no response is found, we try to get from cache using the remote URL or file system cache. 
+ return await tryCache(cache, localPath, proposedCacheKey); +} + +/** + * Stores a resource in the cache. + * + * @param {import('./cache.js').CacheInterface} cache The cache instance to store in. + * @param {string} cacheKey The cache key to use. + * @param {Response|import('./hub/FileResponse.js').default} response The response to cache. + * @param {Uint8Array} [result] The result buffer if already read. + * @param {PretrainedOptions & { _path_or_repo_id?: string, _filename?: string }} [options] Options containing progress callback and context for progress updates. + * @returns {Promise} + */ +export async function storeCachedResource(cache, cacheKey, response, result, options = {}) { + // Check again whether request is in cache. If not, we add the response to the cache + if ((await cache.match(cacheKey)) !== undefined) { + return; + } + + if (!result) { + // We haven't yet read the response body, so we need to do so now. + // Ensure progress updates include consistent metadata. + const wrapped_progress = options.progress_callback + ? (data) => + dispatchCallback(options.progress_callback, { + status: 'progress', + name: options._path_or_repo_id, + file: options._filename, + ...data, + }) + : undefined; + await cache.put(cacheKey, /** @type {Response} */ (response), wrapped_progress); + } else if (typeof response !== 'string') { + // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files + await cache + .put( + cacheKey, + new Response(/** @type {any} */ (result), { + headers: response.headers, + }), + ) + .catch((err) => { + // Do not crash if unable to add to cache (e.g., QuotaExceededError). + // Rather, log a warning and proceed with execution. + console.warn(`Unable to add response to browser cache: ${err}.`); + }); + } +} + +/** + * Loads a resource file from local or remote sources. + * + * @param {string} path_or_repo_id This can be either: + * - a string, the *model id* of a model repo on huggingface.co. + * - a path to a *directory* potentially containing the file. + * @param {string} filename The name of the file to locate. + * @param {boolean} [fatal=true] Whether to throw an error if the file is not found. + * @param {PretrainedOptions} [options] An object containing optional parameters. + * @param {boolean} [return_path=false] Whether to return the path of the file instead of the file content. + * @param {import('./cache.js').CacheInterface | null} [cache] The cache instance to use. + * @param {{ requestURL: string, localPath: string, remoteURL: string, proposedCacheKey: string, validModelId: boolean }} [paths] Pre-built paths object. + * + * @throws Will throw an error if the file is not found and `fatal` is true. + * @returns {Promise} A Promise that resolves with the file content as a Uint8Array if `return_path` is false, or the file path as a string if `return_path` is true. + */ +export async function loadResourceFile( + path_or_repo_id, + filename, + fatal = true, + options = {}, + return_path = false, + cache = null, + paths = null, +) { + // Use pre-built paths or build them + if (!paths) { + paths = buildResourcePaths(path_or_repo_id, filename, options, cache); + } + + const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = paths; + + /** @type {string} */ + let cacheKey; + // Whether to cache the final response in the end. 
let toCacheResponse = false; /** @type {Response|import('./hub/FileResponse.js').default|undefined|string} */ let response; - if (cache) { - // A caching system is available, so we try to get the file from it. - // 1. We first try to get from cache using the local path. In some environments (like deno), - // non-URL cache keys are not allowed. In these cases, `response` will be undefined. - // 2. If no response is found, we try to get from cache using the remote URL or file system cache. - response = await tryCache(cache, localPath, proposedCacheKey); - } + // Check cache + response = await checkCachedResource(cache, localPath, proposedCacheKey); const cacheHit = response !== undefined; if (!cacheHit) { @@ -295,38 +373,17 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti // i.e., do not cache FileResponses (prevents duplication) toCacheResponse && cacheKey && - // Check again whether request is in cache. If not, we add the response to the cache - (await cache.match(cacheKey)) === undefined + typeof response !== 'string' ) { - if (!result) { - // We haven't yet read the response body, so we need to do so now. - // Ensure progress updates include consistent metadata. - const wrapped_progress = options.progress_callback - ? (data) => - dispatchCallback(options.progress_callback, { - status: 'progress', - name: path_or_repo_id, - file: filename, - ...data, - }) - : undefined; - await cache.put(cacheKey, /** @type {Response} */ (response), wrapped_progress); - } else if (typeof response !== 'string') { - // NOTE: We use `new Response(buffer, ...)` instead of `response.clone()` to handle LFS files - await cache - .put( - cacheKey, - new Response(/** @type {any} */ (result), { - headers: response.headers, - }), - ) - .catch((err) => { - // Do not crash if unable to add to cache (e.g., QuotaExceededError). - // Rather, log a warning and proceed with execution. - console.warn(`Unable to add response to browser cache: ${err}.`); - }); - } + // Store temporary context for progress callbacks in cache storage + const extendedOptions = { + ...options, + _path_or_repo_id: path_or_repo_id, + _filename: filename, + }; + await storeCachedResource(cache, cacheKey, response, result, extendedOptions); } + dispatchCallback(options.progress_callback, { status: 'done', name: path_or_repo_id, @@ -357,6 +414,49 @@ export async function getModelFile(path_or_repo_id, filename, fatal = true, opti throw new Error('Unable to get model file path or buffer.'); } +/** + * Retrieves a file from either a remote URL using the Fetch API or from the local file system using the FileSystem API. + * If the filesystem is available and `env.useCache = true`, the file will be downloaded and cached. + * + * @param {string} path_or_repo_id This can be either: + * - a string, the *model id* of a model repo on huggingface.co. + * - a path to a *directory* potentially containing the file. + * @param {string} filename The name of the file to locate in `path_or_repo`. + * @param {boolean} [fatal=true] Whether to throw an error if the file is not found. + * @param {PretrainedOptions} [options] An object containing optional parameters. + * @param {boolean} [return_path=false] Whether to return the path of the file instead of the file content. + * + * @throws Will throw an error if the file is not found and `fatal` is true. + * @returns {Promise} A Promise that resolves with the file content as a Uint8Array if `return_path` is false, or the file path as a string if `return_path` is true. 
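+ *
+ * @example
+ * // A sketch: fetch a tokenizer file as raw bytes (any Hub repo id works the same way).
+ * const buffer = await getModelFile('Xenova/bert-base-uncased', 'tokenizer.json');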
+ */
+export async function getModelFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false) {
+ if (!env.allowLocalModels) {
+ // User has disabled local models, so we just make sure other settings are correct.
+
+ if (options.local_files_only) {
+ throw Error(
+ 'Invalid configuration detected: local models are disabled (`env.allowLocalModels=false`) but you have requested to only use local models (`local_files_only=true`).',
+ );
+ } else if (!env.allowRemoteModels) {
+ throw Error(
+ 'Invalid configuration detected: both local and remote models are disabled. Fix by setting `env.allowLocalModels` or `env.allowRemoteModels` to `true`.',
+ );
+ }
+ }
+
+ dispatchCallback(options.progress_callback, {
+ status: 'initiate',
+ name: path_or_repo_id,
+ file: filename,
+ });
+
+ /** @type {import('./cache.js').CacheInterface | null} */
+ const cache = await getCache(options?.cache_dir);
+ const paths = buildResourcePaths(path_or_repo_id, filename, options, cache);
+
+ return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache, paths);
+}
+
 /**
  * Fetches a text file from a given path and file name.
  *
diff --git a/src/utils/model-loader.js b/src/utils/model-loader.js
new file mode 100644
index 000000000..4bc33952d
--- /dev/null
+++ b/src/utils/model-loader.js
@@ -0,0 +1,99 @@
+import { getModelFile, MAX_EXTERNAL_DATA_CHUNKS } from './hub.js';
+import { apis } from '../env.js';
+
+/**
+ * Loads the core model file.
+ *
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the model file.
+ * @param {string} fileName The base name of the model file (without suffix or extension).
+ * @param {import('./hub.js').PretrainedModelOptions} options Additional options for loading the model.
+ * @param {string} suffix The suffix to append to the file name (e.g., '_q4', '_quantized').
+ * @returns {Promise<Uint8Array|string>} A Promise that resolves to the model file buffer or path.
+ */
+export async function getCoreModelFile(pretrained_model_name_or_path, fileName, options, suffix) {
+ const baseName = `${fileName}${suffix}.onnx`;
+ const fullPath = `${options.subfolder ?? ''}/${baseName}`;
+
+ return await getModelFile(
+ pretrained_model_name_or_path,
+ fullPath,
+ true,
+ options,
+ apis.IS_NODE_ENV,
+ );
+}
+
+/**
+ * Loads external data files for a model.
+ *
+ * @param {string} pretrained_model_name_or_path The path to the directory containing the model files.
+ * @param {string} fileName The base name of the model file (without suffix or extension).
+ * @param {string} suffix The suffix to append to the file name (e.g., '_q4').
+ * @param {import('./hub.js').PretrainedModelOptions} options Additional options for loading the model.
+ * @param {import('./hub.js').ExternalData|Record<string, import('./hub.js').ExternalData>|undefined} use_external_data_format External data format configuration.
+ * @param {any} [session_options] Optional session options that may contain externalData configuration.
+ * @returns {Promise<Array<{path: string, data: Uint8Array}|string>>} A Promise that resolves to an array of external data files.
+ */ +export async function getModelDataFiles( + pretrained_model_name_or_path, + fileName, + suffix, + options, + use_external_data_format, + session_options = {}, +) { + const baseName = `${fileName}${suffix}.onnx`; + const return_path = apis.IS_NODE_ENV; + + /** @type {Promise[]} */ + let externalDataPromises = []; + + if (use_external_data_format) { + let external_data_format; + if (typeof use_external_data_format === 'object') { + if (use_external_data_format.hasOwnProperty(baseName)) { + external_data_format = use_external_data_format[baseName]; + } else if (use_external_data_format.hasOwnProperty(fileName)) { + external_data_format = use_external_data_format[fileName]; + } else { + external_data_format = false; + } + } else { + external_data_format = use_external_data_format; + } + + const num_chunks = +external_data_format; // (false=0, true=1, number remains the same) + if (num_chunks > MAX_EXTERNAL_DATA_CHUNKS) { + throw new Error( + `The number of external data chunks (${num_chunks}) exceeds the maximum allowed value (${MAX_EXTERNAL_DATA_CHUNKS}).`, + ); + } + for (let i = 0; i < num_chunks; ++i) { + const path = `${baseName}_data${i === 0 ? '' : '_' + i}`; + const fullPath = `${options.subfolder ?? ''}/${path}`; + externalDataPromises.push( + new Promise(async (resolve, reject) => { + const data = await getModelFile( + pretrained_model_name_or_path, + fullPath, + true, + options, + return_path, + ); + resolve(data instanceof Uint8Array ? { path, data } : path); + }), + ); + } + } else if (session_options.externalData !== undefined) { + externalDataPromises = session_options.externalData.map(async (ext) => { + // if the external data is a string, fetch the file and replace the string with its content + if (typeof ext.data === 'string') { + const ext_buffer = await getModelFile(pretrained_model_name_or_path, ext.data, true, options); + return { ...ext, data: ext_buffer }; + } + return ext; + }); + } + + return Promise.all(externalDataPromises); +} \ No newline at end of file diff --git a/src/utils/tensor.js b/src/utils/tensor.js index e708c79b1..3fe0fac78 100644 --- a/src/utils/tensor.js +++ b/src/utils/tensor.js @@ -1583,3 +1583,64 @@ export function quantize_embeddings(tensor, precision) { return new Tensor(dtype, outputData, [tensor.dims[0], tensor.dims[1] / 8]); } + +/** + * Replaces ONNX Tensor objects with custom Tensor objects to support additional functions. + * @param {Object} obj The object to replace tensor objects in. + * @returns {Object} The object with tensor objects replaced by custom Tensor objects. + * @private + */ +export function replaceTensors(obj) { + for (let prop in obj) { + if (isONNXTensor(obj[prop])) { + obj[prop] = new Tensor(obj[prop]); + } else if (typeof obj[prop] === 'object') { + replaceTensors(obj[prop]); + } + } + return obj; +} + +/** + * Converts an array or Tensor of integers to an int64 Tensor. + * @param {any[]|Tensor} items The input integers to be converted. + * @returns {Tensor} The int64 Tensor with the converted values. + * @throws {Error} If the input array is empty or the input is a batched Tensor and not all sequences have the same length. 
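+ *
+ * @example
+ * // A sketch: a nested array becomes a batched int64 tensor with dims [2, 2];
+ * // a flat array such as [1, 2, 3] would instead produce dims [1, 3].
+ * const t = toI64Tensor([[1, 2], [3, 4]]);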
+ * @private + */ +export function toI64Tensor(items) { + if (items instanceof Tensor) { + return items; + } + // items is an array + if (items.length === 0) { + throw Error('items must be non-empty'); + } + + if (Array.isArray(items[0])) { + // batched + if (items.some((x) => x.length !== items[0].length)) { + throw Error( + "Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.", + ); + } + + return new Tensor('int64', BigInt64Array.from(items.flat().map((x) => BigInt(x))), [ + items.length, + items[0].length, + ]); + } else { + //flat + return new Tensor('int64', BigInt64Array.from(items.map((x) => BigInt(x))), [1, items.length]); + } +} + +/** + * Creates a boolean tensor with a single value. + * @param {boolean} value The value of the tensor. + * @returns {Tensor} The boolean tensor. + * @private + */ +export function boolTensor(value) { + return new Tensor('bool', [value], [1]); +} diff --git a/webpack.config.js b/webpack.config.js deleted file mode 100644 index d1e264ac2..000000000 --- a/webpack.config.js +++ /dev/null @@ -1,224 +0,0 @@ -import { fileURLToPath } from "node:url"; -import path from "node:path"; -import fs from "node:fs"; -import webpack from "webpack"; -import TerserPlugin from "terser-webpack-plugin"; - -const __dirname = path.dirname(fileURLToPath(import.meta.url)); - -/** - * Plugin to strip the "node:" prefix from module requests. - * - * This is necessary to ensure both web and node builds work correctly, - * otherwise we would get an error like: - * ``` - * Module build failed: UnhandledSchemeError: Reading from "node:path" is not handled by plugins (Unhandled scheme). - * Webpack supports "data:" and "file:" URIs by default. - * You may need an additional plugin to handle "node:" URIs. - * ``` - * - * NOTE: We then do not need to use the `node:` prefix in the resolve.alias configuration. - */ -class StripNodePrefixPlugin extends webpack.NormalModuleReplacementPlugin { - constructor() { - super(/^node:(.+)$/, (resource) => { - resource.request = resource.request.replace(/^node:/, ""); - }); - } -} - -/** - * Plugin to post-process build files. Required to solve certain issues with ESM module output. - * See https://github.com/webpack/webpack/issues/17121 for more information. - * - * @see https://webpack.js.org/contribute/writing-a-plugin/ - */ -class PostBuildPlugin { - static completed = false; - - apply(compiler) { - compiler.hooks.done.tap("PostBuildPlugin", () => { - if (!process.env.WEBPACK_SERVE && !PostBuildPlugin.completed) { - // Ensure we only run this once - PostBuildPlugin.completed = true; - return; - } - const dist = path.join(__dirname, "dist"); - const ORT_JSEP_FILE = "ort-wasm-simd-threaded.asyncify.mjs"; - const ORT_BUNDLE_FILE = "ort.webgpu.bundle.min.mjs"; - - // 1. Copy unbundled asyncify file - { - const src = path.join( - __dirname, - "node_modules/onnxruntime-web/dist", - ORT_JSEP_FILE, - ); - const dest = path.join(dist, ORT_JSEP_FILE); - fs.copyFileSync(src, dest); - } - - // 2. Remove unnecessary files - { - const file = path.join(dist, ORT_BUNDLE_FILE); - if (fs.existsSync(file)) fs.unlinkSync(file); - } - }); - } -} - -/** - * Helper function to create webpack configurations. - * @param {Object} options Options for creating a webpack target. - * @param {string} options.name Name of output file. - * @param {string} options.suffix Suffix of output file. - * @param {string} options.type Type of library. 
- * @param {string} options.ignoreModules The list of modules to ignore. - * @param {string} options.externalModules The list of modules to set as external. - * @param {Object[]} options.plugins List of plugins to use. - * @returns {import('webpack').Configuration} One webpack target. - */ -function buildConfig({ - name = "", - suffix = ".js", - type = "module", // 'module' | 'commonjs' - ignoreModules = [], - externalModules = [], - plugins = [], -} = {}) { - const outputModule = type === "module"; - const alias = Object.fromEntries( - ignoreModules.map((module) => [module, false]), - ); - - /** @type {import('webpack').Configuration} */ - const config = { - mode: "development", - devtool: "source-map", - entry: { - [`transformers${name}`]: "./src/transformers.js", - [`transformers${name}.min`]: "./src/transformers.js", - }, - output: { - filename: `[name]${suffix}`, - path: path.join(__dirname, "dist"), - library: { - type, - }, - assetModuleFilename: "[name][ext]", - chunkFormat: false, - }, - optimization: { - minimize: true, - minimizer: [ - new TerserPlugin({ - test: new RegExp(`\\.min\\${suffix}$`), - - // Do not bundle with comments. - // See https://webpack.js.org/plugins/terser-webpack-plugin/#remove-comments for more information. - terserOptions: { - output: { - comments: false, - }, - }, - extractComments: false, - }), - ], - }, - experiments: { - outputModule, - }, - resolve: { alias }, - - externals: externalModules, - - // Development server - devServer: { - static: { - directory: __dirname, - }, - port: 8080, - }, - plugins, - }; - - if (outputModule) { - config.module = { - parser: { - javascript: { - importMeta: false, - }, - }, - }; - } else { - config.externalsType = "commonjs"; - } - - return config; -} - -// Do not bundle onnxruntime-web when packaging for Node.js. -// Instead, we use the native library (onnxruntime-node). -const NODE_IGNORE_MODULES = ["onnxruntime-web"]; - -// Do not bundle the following modules with webpack (mark as external) -// NOTE: This is necessary for both type="module" and type="commonjs", -// and will be ignored when building for web (only used for node/deno) -const NODE_EXTERNAL_MODULES = [ - "onnxruntime-common", - "onnxruntime-node", - "sharp", - "node:fs", - "node:path", - "node:url", - "node:stream", - "node:stream/promises", -]; - -// Do not bundle node-only packages when bundling for the web. -// NOTE: We can exclude the "node:" prefix for built-in modules here, -// since we apply the `StripNodePrefixPlugin` to strip it. 
-const WEB_IGNORE_MODULES = ["onnxruntime-node", "sharp", "fs", "path", "url", "stream", "stream/promises"]; - -// Do not bundle the following modules with webpack (mark as external) -const WEB_EXTERNAL_MODULES = ["onnxruntime-common", "onnxruntime-web"]; - -// Web-only build -const WEB_BUILD = buildConfig({ - name: ".web", - type: "module", - ignoreModules: WEB_IGNORE_MODULES, - externalModules: WEB_EXTERNAL_MODULES, - plugins: [new StripNodePrefixPlugin(), new PostBuildPlugin()], -}); - -// Web-only build, bundled with onnxruntime-web -const BUNDLE_BUILD = buildConfig({ - type: "module", - ignoreModules: WEB_IGNORE_MODULES, - plugins: [new StripNodePrefixPlugin(), new PostBuildPlugin()], -}); - -// Node-compatible builds -const NODE_BUILDS = [ - buildConfig({ - name: ".node", - suffix: ".mjs", - type: "module", - ignoreModules: NODE_IGNORE_MODULES, - externalModules: NODE_EXTERNAL_MODULES, - }), - buildConfig({ - name: ".node", - suffix: ".cjs", - type: "commonjs", - ignoreModules: NODE_IGNORE_MODULES, - externalModules: NODE_EXTERNAL_MODULES, - }), -]; - -// When running with `webpack serve`, only build the web target. -const BUILDS = process.env.WEBPACK_SERVE - ? [BUNDLE_BUILD] - : [BUNDLE_BUILD, WEB_BUILD, ...NODE_BUILDS]; -export default BUILDS;
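For reference, a minimal sketch of how the new position-id helper behaves on a left-padded batch. It assumes the repo-internal module paths introduced in this diff (`src/models/utils.js`, `src/utils/tensor.js`), run from the repository root; these helpers are not part of the public package API.

```js
// Minimal sketch (assumed repo-internal imports; paths relative to the repository root).
import { Tensor } from './src/utils/tensor.js';
import { createPositionIds } from './src/models/utils.js';

// Two sequences of length 5; the first is left-padded with two padding tokens.
const attention_mask = new Tensor('int64', BigInt64Array.from([0n, 0n, 1n, 1n, 1n, 1n, 1n, 1n, 1n, 1n]), [2, 5]);

// Padded positions are filled with 1, real tokens receive their cumulative index:
//   row 0 -> [1, 1, 0, 1, 2]
//   row 1 -> [0, 1, 2, 3, 4]
const position_ids = createPositionIds({ attention_mask });
console.log(position_ids.tolist());
```

This mirrors the `cumsum` / `masked_fill_` recipe quoted in the JSDoc of `createPositionIds` above.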