Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added assets/PixPin_2024-01-25_08-45-54.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/client-icon.ico
Binary file not shown.
Binary file added assets/server-icon.ico
Binary file not shown.
Binary file added assets/start_server_and_client_in_tray.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/start_server_or_client_in_tray.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
93 changes: 93 additions & 0 deletions hint_while_recording.ahk
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#Requires AutoHotkey v2.0
CoordMode("ToolTip", "Screen")


; Hotkey: while CapsLock is held down, show a "voice input in progress" hint.
; The "~" prefix lets the key press pass through to other programs as well.
; The hint is placed (in priority order):
;   1. next to the text caret, when GetCaretPosEx can locate one;
;   2. inside the active window (25% from its left edge, 70% from its top);
;   3. at ToolTip's default position when there is no active window at all.
~CapsLock:: {
    hintText := "✦语音输入中‧‧‧" ; hint text shown to the user

    if GetCaretPosEx(&caretX, &caretY, &caretW, &caretH) {
        ; A text caret was found: show the hint at the caret,
        ; shifted 5 px to the right so it does not cover the caret itself.
        ToolTip(hintText, caretX + 5, caretY)
    } else {
        ; No text caret available: anchor the hint to the active window instead.
        try {
            WinGetPos(&winX, &winY, &winW, &winH, "A")
            ToolTip(hintText, winX + winW * 0.25, winY + winH * 0.7)
        } catch TargetError {
            ; WinGetPos throws when no window is active; omit the coordinates
            ; so ToolTip falls back to its default placement instead of
            ; raising an error dialog on every key press.
            ToolTip(hintText)
        }
    }

    KeyWait("CapsLock") ; block until the key is released
    ToolTip()           ; hide the hint
}


; Locate the text caret of the focused control.
;
; Tries several strategies in order and returns as soon as one succeeds:
;   1. IAccessible::accLocation on the caret object (OBJID_CARET) of the focused window
;   2. IUIAutomationTextPattern2::GetCaretRange
;   3. IUIAutomationTextPattern::GetSelection (first selection range)
;   4. GUITHREADINFO.rcCaret (classic Win32 caret)
;
; Parameters (all optional, by reference):
;   x, y - receive the caret's left/top position
;   w, h - receive the caret's width/height
; All four are reset to 0 on entry, so they are defined even when nothing is found.
;
; Returns the handle of the window that owns the caret, or 0 when no caret could be
; located or when the focused element exposes a read-only ValuePattern.
GetCaretPosEx(&x?, &y?, &w?, &h?) {
    x := y := w := h := 0
    static iUIAutomation := 0, hOleacc := 0, IID_IAccessible, guiThreadInfo, _ := init()

    ; GUITHREADINFO field offsets differ between 32-bit and 64-bit builds because
    ; the structure contains six pointer-sized window handles before rcCaret.
    offHwndFocus := A_PtrSize == 8 ? 16 : 12
    offHwndCaret := A_PtrSize == 8 ? 48 : 28
    offRcCaret := A_PtrSize == 8 ? 56 : 32

    ; IUIAutomation::GetFocusedElement (vtable index 8).
    if !iUIAutomation || ComCall(8, iUIAutomation, "ptr*", eleFocus := ComValue(13, 0), "int") || !eleFocus.Ptr
        goto useAccLocation
    ; If the element supports ValuePattern (10002) and reports itself read-only,
    ; there is no editable caret to report.
    if !ComCall(16, eleFocus, "int", 10002, "ptr*", valuePattern := ComValue(13, 0), "int") && valuePattern.Ptr
        if !ComCall(5, valuePattern, "int*", &isReadOnly := 0) && isReadOnly
            return 0

useAccLocation:
    ; use IAccessible::accLocation
    threadId := DllCall("GetWindowThreadProcessId", "ptr", WinExist("A"), "ptr", 0, "uint")
    ; Remember whether the call succeeded so stale data left in the static
    ; buffer by an earlier call is never mistaken for current information.
    gotThreadInfo := DllCall("GetGUIThreadInfo", "uint", threadId, "ptr", guiThreadInfo)
    hwndFocus := gotThreadInfo && NumGet(guiThreadInfo, offHwndFocus, "ptr") || WinExist()
    ; 0xFFFFFFF8 is OBJID_CARET (-8).
    if hOleacc && !DllCall("Oleacc\AccessibleObjectFromWindow", "ptr", hwndFocus, "uint", 0xFFFFFFF8, "ptr", IID_IAccessible, "ptr*", accCaret := ComValue(13, 0), "int") && accCaret.Ptr {
        ; VARIANT { vt = VT_I4 (3), lVal = 0 (CHILDID_SELF) }
        ; NOTE(review): the VARIANT is passed as a pointer, which matches the x64
        ; calling convention; confirm behaviour on 32-bit builds.
        NumPut("ushort", 3, varChild := Buffer(24, 0))
        ; IAccessible::accLocation (vtable index 22).
        if !ComCall(22, accCaret, "int*", &x := 0, "int*", &y := 0, "int*", &w := 0, "int*", &h := 0, "ptr", varChild, "int")
            return hwndFocus
    }

    ; Only walk the UI Automation paths when a focused element was really obtained;
    ; eleFocus is a wrapper object, so its pointer must be tested, not the object.
    if iUIAutomation && eleFocus.Ptr {
        ; use IUIAutomationTextPattern2::GetCaretRange
        if ComCall(16, eleFocus, "int", 10024, "ptr*", textPattern2 := ComValue(13, 0), "int") || !textPattern2.Ptr
            goto useGetSelection
        if ComCall(10, textPattern2, "int*", &isActive := 0, "ptr*", caretTextRange := ComValue(13, 0), "int") || !caretTextRange.Ptr || !isActive
            goto useGetSelection
        ; GetBoundingRectangles returns a SAFEARRAY of doubles (0x2005 = VT_ARRAY|VT_R8);
        ; the first four entries are left, top, width, height.
        if !ComCall(10, caretTextRange, "ptr*", &rects := 0, "int") && rects && (rects := ComValue(0x2005, rects, 1)).MaxIndex() >= 3 {
            x := rects[0], y := rects[1], w := rects[2], h := rects[3]
            return hwndFocus
        }

useGetSelection:
        ; use IUIAutomationTextPattern::GetSelection
        ; TextPattern2 derives from TextPattern, so reuse it when available.
        if textPattern2.Ptr
            textPattern := textPattern2
        else if ComCall(16, eleFocus, "int", 10014, "ptr*", textPattern := ComValue(13, 0), "int") || !textPattern.Ptr
            goto useGUITHREADINFO
        if ComCall(5, textPattern, "ptr*", selectionRangeArray := ComValue(13, 0), "int") || !selectionRangeArray.Ptr
            goto useGUITHREADINFO
        if ComCall(3, selectionRangeArray, "int*", &length := 0, "int") || length <= 0
            goto useGUITHREADINFO
        if ComCall(4, selectionRangeArray, "int", 0, "ptr*", selectionRange := ComValue(13, 0), "int") || !selectionRange.Ptr
            goto useGUITHREADINFO
        if ComCall(10, selectionRange, "ptr*", &rects := 0, "int") || !rects
            goto useGUITHREADINFO
        rects := ComValue(0x2005, rects, 1)
        if rects.MaxIndex() < 3 {
            ; A degenerate (empty) range yields no rectangle: expand it to the
            ; enclosing character (TextUnit 0) and ask again.
            if ComCall(6, selectionRange, "int", 0, "int") || ComCall(10, selectionRange, "ptr*", &rects := 0, "int") || !rects
                goto useGUITHREADINFO
            rects := ComValue(0x2005, rects, 1)
            if rects.MaxIndex() < 3
                goto useGUITHREADINFO
        }
        x := rects[0], y := rects[1], w := rects[2], h := rects[3]
        return hwndFocus
    }

useGUITHREADINFO:
    ; Last resort: the classic Win32 caret described by GUITHREADINFO.
    if gotThreadInfo && (hwndCaret := NumGet(guiThreadInfo, offHwndCaret, "ptr")) {
        ; GetWindowRect is used only to verify that the caret window is still valid.
        if DllCall("GetWindowRect", "ptr", hwndCaret, "ptr", clientRect := Buffer(16)) {
            ; rcCaret is a RECT { left, top, right, bottom } in client coordinates.
            w := NumGet(guiThreadInfo, offRcCaret + 8, "int") - NumGet(guiThreadInfo, offRcCaret, "int")
            h := NumGet(guiThreadInfo, offRcCaret + 12, "int") - NumGet(guiThreadInfo, offRcCaret + 4, "int")
            ; Convert the rectangle's top-left corner to screen coordinates in place.
            DllCall("ClientToScreen", "ptr", hwndCaret, "ptr", guiThreadInfo.Ptr + offRcCaret)
            x := NumGet(guiThreadInfo, offRcCaret, "int")
            y := NumGet(guiThreadInfo, offRcCaret + 4, "int")
            return hwndCaret
        }
    }
    return 0

    ; One-time initialisation of the static resources used above.
    static init() {
        ; Creating the CUIAutomation object may fail; in that case
        ; iUIAutomation stays 0 and the UI Automation paths are skipped.
        try
            iUIAutomation := ComObject("{E22AD333-B25F-460C-83D0-0581107395C9}", "{30CBE57D-D9D0-452A-AB13-7AC5AC4825EE}")
        hOleacc := DllCall("LoadLibraryW", "str", "Oleacc.dll", "ptr")
        ; IID_IAccessible = {618736E0-3C3D-11CF-810C-00AA00389B71}
        NumPut("int64", 0x11CF3C3D618736E0, "int64", 0x719B3800AA000C81, IID_IAccessible := Buffer(16))
        ; GUITHREADINFO buffer with its cbSize field pre-filled.
        guiThreadInfo := Buffer(A_PtrSize == 8 ? 72 : 48), NumPut("uint", guiThreadInfo.Size, guiThreadInfo)
    }
}
Binary file added hint_while_recording.exe
Binary file not shown.
33 changes: 33 additions & 0 deletions python_modules.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
archspec @ file:///croot/archspec_1697725767277/work
boltons @ file:///C:/ci_311/boltons_1677729932371/work
Brotli @ file:///C:/ci_311/brotli-split_1676435766766/work
certifi @ file:///C:/b/abs_91u83siphd/croot/certifi_1700501720658/work/certifi
cffi @ file:///C:/b/abs_924gv1kxzj/croot/cffi_1700254355075/work
charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
colorama @ file:///C:/ci_311/colorama_1676422310965/work
conda @ file:///C:/b/abs_88drkd35d4/croot/conda_1701719561256/work
conda-content-trust @ file:///C:/b/abs_e3bcpyv7sw/croot/conda-content-trust_1693490654398/work
conda-libmamba-solver @ file:///croot/conda-libmamba-solver_1702997573971/work/src
conda-package-handling @ file:///C:/b/abs_b9wp3lr1gn/croot/conda-package-handling_1691008700066/work
conda_package_streaming @ file:///C:/b/abs_6c28n38aaj/croot/conda-package-streaming_1690988019210/work
cryptography @ file:///C:/b/abs_e8cnom_zw_/croot/cryptography_1702071486468/work
distro @ file:///C:/b/abs_a3uni_yez3/croot/distro_1701455052240/work
idna @ file:///C:/ci_311/idna_1676424932545/work
jsonpatch @ file:///tmp/build/80754af9/jsonpatch_1615747632069/work
jsonpointer==2.1
libmambapy @ file:///C:/b/abs_efpsdwt4ya/croot/mamba-split_1698782663578/work/libmambapy
menuinst @ file:///C:/b/abs_e8p75b4m9q/croot/menuinst_1702390332729/work
packaging @ file:///C:/b/abs_28t5mcoltc/croot/packaging_1693575224052/work
platformdirs @ file:///C:/b/abs_b6z_yqw_ii/croot/platformdirs_1692205479426/work
pluggy @ file:///C:/ci_311/pluggy_1676422178143/work
pycosat @ file:///C:/b/abs_31zywn1be3/croot/pycosat_1696537126223/work
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
pyOpenSSL @ file:///C:/b/abs_08f38zyck4/croot/pyopenssl_1690225407403/work
PySocks @ file:///C:/ci_311/pysocks_1676425991111/work
requests @ file:///C:/b/abs_316c2inijk/croot/requests_1690400295842/work
ruamel.yaml @ file:///C:/ci_311/ruamel.yaml_1676439214109/work
tqdm @ file:///C:/b/abs_f76j9hg7pv/croot/tqdm_1679561871187/work
truststore @ file:///C:/b/abs_55z7b3r045/croot/truststore_1695245455435/work
urllib3 @ file:///C:/b/abs_9cmlsrm3ys/croot/urllib3_1698257595508/work
win-inet-pton @ file:///C:/ci_311/win_inet_pton_1676425458225/work
zstandard==0.19.0
88 changes: 52 additions & 36 deletions readme.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## CapsWriter-Offline

![image-20240108115946521](assets/image-20240108115946521.png)
![image-20240108115946521](assets/image-20240108115946521.png)

这是 `CapsWriter-Offline` ,一个 PC 端的语音输入、字幕转录工具。

Expand All @@ -9,13 +9,13 @@
1. 按下键盘上的 `大写锁定键`,录音开始,当松开 `大写锁定键` 时,就会识别你的录音,并将识别结果立刻输入
2. 将音视频文件拖动到客户端打开,即可转录生成 srt 字幕

视频教程:[CapsWriter-Offline 电脑端离线语音输入工具](https://www.bilibili.com/video/BV1tt4y1d75s/)
视频教程:[CapsWriter-Offline 电脑端离线语音输入工具](https://www.bilibili.com/video/BV1tt4y1d75s/)

## 特性

1. 完全离线、无限时长、低延迟、高准确率、中英混输、自动阿拉伯数字、自动调整中英间隔
2. 热词功能:可以在 `hot-en.txt hot-zh.txt hot-rule.txt` 中添加三种热词,客户端动态载入
3. 日记功能:默认每次录音识别后,识别结果记录在 `年份/月份/日期.md` ,录音文件保存在 `年份/月份/assets`
3. 日记功能:默认每次录音识别后,识别结果记录在 `年份/月份/日期.md` ,录音文件保存在 `年份/月份/assets`
4. 关键词日记:识别结果若以关键词开头,会被记录在 `年份/月份/关键词-日期.md`,关键词在 `keywords.txt` 中定义
5. 转录功能:将音视频文件拖动到客户端打开,即可转录生成 srt 字幕
6. 服务端、客户端分离,可以服务多台客户端
Expand Down Expand Up @@ -43,14 +43,33 @@

下载地址:

- 百度盘: https://pan.baidu.com/s/1zNHstoWZDJVynCBz2yS9vg 提取码: eu4c
- GitHub Release: [Releases · HaujetZhao/CapsWriter-Offline](https://github.com/HaujetZhao/CapsWriter-Offline/releases)
- 百度盘: https://pan.baidu.com/s/1zNHstoWZDJVynCBz2yS9vg 提取码: eu4c
- GitHub Release: [Releases · HaujetZhao/CapsWriter-Offline](https://github.com/HaujetZhao/CapsWriter-Offline/releases)

(百度网盘容易掉链接,补链接太麻烦了,我不一定会补链接。GitHub Releases 界面下载是最可靠的。)

![image-20240108114351535](assets/image-20240108114351535.png)
![image-20240108114351535](assets/image-20240108114351535.png)

## 图形界面包 ( 仅 Windows 端)

1. 基于 [PySide6](https://pypi.org/project/PySide6/) 的 GUI,默认使用 [Qt-Material](https://github.com/UN-GCPDS/qt-material) dark_yellow 主题;基于 [PyStand](https://github.com/skywind3000/PyStand) 绿化便携 `start.exe`。
2. 支持最小化到系统托盘。
3. ~~Server 和 Client 以 tab 集成在一个 gui,无任务栏占用。~~
4. ~~双击 start.exe 运行,可自行设置开机自启动。~~ 不建议使用,参考[这里](https://github.com/HaujetZhao/CapsWriter-Offline/pull/53#issuecomment-1903681063)。
5. 已包含所有 Python 环境和 models 模型,解压即用。
6. 支持转录功能,将文件拖动到 `start_client_gui.exe`
7. `hint_while_recording.exe`跟随`start_client_gui.exe`启停,实现按下 Capslock 键会在光标处提示 [✦ 语音输入中‧‧‧](https://github.com/HaujetZhao/CapsWriter-Offline/issues/52#issuecomment-1905758203)

下载地址:

- 123 盘:https://www.123pan.com/s/qBxUVv-H4Zq3.html 提取码:h8vb
- GitHub Release: [Releases · H1DDENADM1N/CapsWriter-Offline](https://github.com/H1DDENADM1N/CapsWriter-Offline/releases)

![start_server_and_client_in_tray](assets/start_server_and_client_in_tray.gif)

![start_server_or_client_in_tray](assets/start_server_or_client_in_tray.gif)

![PixPin_2024-01-25_08-45-54](assets/PixPin_2024-01-25_08-45-54.png)

## 功能:热词

Expand All @@ -60,7 +79,7 @@

- 英文热词请写到 `hot-en.txt` 文件,每行一个,替换依据为字母拼写

- 自定义规则热词请写到 `hot-rule.txt` 文件,每行一个,将搜索和替换词以等号隔开,如 `毫安时 = mAh`
- 自定义规则热词请写到 `hot-rule.txt` 文件,每行一个,将搜索和替换词以等号隔开,如 `毫安时 = mAh`

你可以在 `core_client.py` 文件中配置是否匹配中文多音字,是否严格匹配拼音声调。

Expand All @@ -72,22 +91,20 @@

![image-20230531221314983](assets/image-20230531221314983.png)



## 功能:日记、关键词

默认每次语音识别结束后,会以年、月为分类,保存录音文件和识别结果:

- 录音文件存放在「年/月/assets」文件夹下
- 识别结果存放在「年/月/日.md」Markdown 文件中

例如今天是2023年6月5号,示例:
例如今天是 2023 年 6 月 5 号,示例:

1. 语音输入任一句话后,录音就会被保存到 `2023/06/assets` 路径下,以时间和识别结果命名,并将识别结果保存到 `2023/06/05.md` 文件中,方便我日后查阅
2. 例如我在 `keywords.txt` 中定义了关键词「健康」,用于随时记录自己的身体状况,吃完饭后我可以按住 `CapsLock` 说「健康今天中午吃了大米炒饭」,由于识别结果以「健康」关键词开头,这条识别记录就会被保存到 `2023/06/05-健康.md` 中
3. 例如我在 `keywords.txt` 中定义了关键词「重要」,用于随时记录突然的灵感,有想法时我就可以按住 `CapsLock` 说「重要,xx问题可以用xxxx方法解决」,由于识别结果以「重要」关键词开头,这条识别记录就会被保存到 `2023/06/05-重要.md` 中
3. 例如我在 `keywords.txt` 中定义了关键词「重要」,用于随时记录突然的灵感,有想法时我就可以按住 `CapsLock` 说「重要,xx 问题可以用 xxxx 方法解决」,由于识别结果以「重要」关键词开头,这条识别记录就会被保存到 `2023/06/05-重要.md` 中

![image-20230604144824341](assets/image-20230604144824341.png)
![image-20230604144824341](assets/image-20230604144824341.png)

## 功能:转录文件

Expand All @@ -98,47 +115,42 @@
- `merge.txt` 文件,包含了带标点的整段结果
- `srt` 文件,字幕文件

如果生成的字幕有微小错误,可以在分行的 `txt` 文件中修改,然后将 `txt` 文件拖动到客户端打开,客户端检测到输入的是 `txt` 文件,就会查到同名的 `json` 文件,结合 `json` 文件中的字级时间戳和 `txt` 文件中修正结果,更新 `srt` 字幕文件。
如果生成的字幕有微小错误,可以在分行的 `txt` 文件中修改,然后将 `txt` 文件拖动到客户端打开,客户端检测到输入的是 `txt` 文件,就会查到同名的 `json` 文件,结合 `json` 文件中的字级时间戳和 `txt` 文件中修正结果,更新 `srt` 字幕文件。

## 注意事项

1. 当用户安装了 `FFmpeg` 时,会以 `mp3` 格式保存录音;当用户没有装 `FFmpeg` 时,会以 `wav` 格式保存录音
2. 音视频文件转录功能依赖于 `FFmpeg`,打包版本已内置 `FFmpeg`
2. 音视频文件转录功能依赖于 `FFmpeg`,打包版本已内置 `FFmpeg`
3. 默认的快捷键是 `caps lock`,你可以打开 `core_client.py` 进行修改
4. MacOS 无法监测到 `caps lock` 按键,可改为 `right shift` 按键

## 修改配置

你可以编辑 `config.py` ,在开头部分有注释,指导你修改服务端、客户端的:

- 连接的地址和端口,默认是 `127.0.0.1` 和 `6006`
- 连接的地址和端口,默认是 `127.0.0.1` 和 `6006`
- 键盘快捷键
- 是否要保存录音文件
- 要移除识别结果末尾的哪些标点,(如果你想把句尾的问号也删除掉,可以在这边加上)

![image-20240108114558762](assets/image-20240108114558762.png)



![image-20240108114558762](assets/image-20240108114558762.png)

## 下载模型

服务端使用了 [sherpa-onnx](https://k2-fsa.github.io/sherpa/onnx/index.html) ,载入阿里巴巴开源的 [Paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) 模型([转为量化的onnx格式](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html)),来作语音识别,整个模型约 230MB 大小。下载有已转换好的模型文件:
服务端使用了 [sherpa-onnx](https://k2-fsa.github.io/sherpa/onnx/index.html) ,载入阿里巴巴开源的 [Paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch) 模型([转为量化的 onnx 格式](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html)),来作语音识别,整个模型约 230MB 大小。下载有已转换好的模型文件:

- [csukuangfj/sherpa-onnx-paraformer-zh-2023-09-14](https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-09-14)
- [csukuangfj/sherpa-onnx-paraformer-zh-2023-09-14](https://huggingface.co/csukuangfj/sherpa-onnx-paraformer-zh-2023-09-14)

另外,还使用了阿里巴巴的标点符号模型,大小约 1GB:

- [CT-Transformer标点-中英文-通用-large-onnx](https://www.modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx/summary)

**模型文件太大,并没有包含在 GitHub 库里面,你可以从百度网盘或者 GitHub Releases 界面下载已经转换好的模型文件,解压后,将 `models` 文件夹放到软件根目录**


- [CT-Transformer 标点-中英文-通用-large-onnx](https://www.modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx/summary)

**模型文件太大,并没有包含在 GitHub 库里面,你可以从百度网盘或者 GitHub Releases 界面下载已经转换好的模型文件,解压后,将 `models` 文件夹放到软件根目录**

## 源码安装依赖

### \[New\] Linux 端

```bash
# for core_server.py
pip install -r requirements-server.txt -i https://mirror.sjtu.edu.cn/pypi/web/simple
Expand All @@ -148,9 +160,10 @@ pip install -r requirements-server.txt -i https://mirror.sjtu.edu.cn/pypi/web/s
pip install -r requirements-client.txt -i https://mirror.sjtu.edu.cn/pypi/web/simple
sudo apt-get install xclip # 让core_client.py正常运行
```

**运行方式**
`core_server.py` # 无需以 root 权限运行
`core_client.py` # 注意: 必须以 root 权限运行!!
`core_server.py` # 无需以 root 权限运行
`core_client.py` # 注意: 必须以 root 权限运行!!

### Windows 端

Expand All @@ -159,7 +172,7 @@ pip install -r requirements-server.txt
pip install -r requirements-client.txt
```

有些依赖在 `Python 3.11` 还暂时不无法安装,建议使用 `Python 3.8 - Python3.10`
有些依赖在 `Python 3.11` 上暂时还无法安装,建议使用 `Python 3.8 - Python 3.10`

### Mac 端

Expand All @@ -175,25 +188,28 @@ python3 setup.py install

## 源码运行

1. 运行 `core_server.py` 脚本,会载入 Paraformer 模型识别模型和标点模型(这会占用2GB的内存,载入时长约 50 秒)
1. 运行 `core_server.py` 脚本,会载入 Paraformer 语音识别模型和标点模型(这会占用 2GB 的内存,载入时长约 50 秒)
2. 运行 `core_client.py` 脚本,它会打开系统默认麦克风,开始监听按键(`MacOS` 端需要 `sudo`)
3. 按住 `CapsLock` 键,录音开始,松开 `CapsLock` 键,录音结束,识别结果立马被输入(录音时长短于0.3秒不算
3. 按住 `CapsLock` 键,录音开始,松开 `CapsLock` 键,录音结束,识别结果立马被输入(录音时长短于 0.3 秒不算)

MacOS 端注意事项:

- MacOS 上监听 `CapsLock` 键可能会出错,需要快捷键修改为其他按键,如 `right shift`
- MacOS 上监听 `CapsLock` 键可能会出错,需要快捷键修改为其他按键,如 `right shift`

## 打包方法
Windows/MacOS/Linux均使用如下命令完成打包:

Windows/MacOS/Linux 均使用如下命令完成打包:
`pyinstaller build.spec`

## 运行方式
### Linux
双击 `run.sh` 自动输入sudo密码且实现左右分屏展示

### Linux

双击 `run.sh` 自动输入 sudo 密码且实现左右分屏展示
![](./assets/run-sh.png)

## 打赏

如果你愿意,可以以打赏的方式支持我一下:

![sponsor](assets/sponsor.jpg)
![sponsor](assets/sponsor.jpg)
2 changes: 1 addition & 1 deletion requirements-client.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
rich
keyboard
pyclip
pyperclip
numpy
sounddevice
websockets
Expand Down
Loading