When gravitational waves reach Earth, they usually give a strain of $\delta L \over L$$=10^{-21}$.
+When gravitational waves reach Earth, they usually give a strain of $\delta L \over L = 10^{-21}$.
If we assume that they scale with the distance the same way electromagnetic waves do, thus following the inverse square law, we can get an estimate of the distance needed.
LIGO detected the first merger of black holes at 1.3 billion light years away.
If we would get to 1 light year away from the merger, under the above hypothesis we would get a strain of $10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}$. This means that on 1 meter length we would notice a 1 mm oscillation, which is something we are able to sense.
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html index 771c4c49..b8fc706a 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html @@ -7,8 +7,7 @@ M_{\odot} M_{\odot} M_{\odot} -\delta L \over L -=10^{-21} +\delta L \over L = 10^{-21} 10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3} 1/r 10^{-21} \times (1.3 \cdot 10^9)=10^{-9} diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 79d010aa..679690a0 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -511,6 +511,7 @@ def test_math_recognizer_html(self): # print('answers::::::::', answers) # self.write_to_html(answers, test_case['input'][0]) # 检查行内公式抽取正确性 + # 检查行内公式抽取正确性 if test_case.get('expected_inline', None): # 从所有parts中提取所有行内公式 all_inline_formulas = [] @@ -521,16 +522,40 @@ def test_math_recognizer_html(self): for inline_elem in inline_elements: formula = inline_elem.text.replace('\n', '').strip() all_inline_formulas.append(formula) - # print(f"Found {len(all_inline_formulas)} total inline formulas") - # print(f"Total new_parts: {len(new_parts)}") + expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip() expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula] - # print(f"Expected {len(expect_inline_formulas)} inline formulas") + + # 如果数量不匹配,输出详细信息 + if len(all_inline_formulas) != len(expect_inline_formulas): + print("\n" + "=" * 80) + print("行内公式抽取出错!") + print("=" * 80) + print(f"出错样例: {test_case['input']}") + print(f"预期公式数: {len(expect_inline_formulas)}") + print(f"实际公式数: {len(all_inline_formulas)}") + print("\n预期公式列表:") + for i, formula in enumerate(expect_inline_formulas, 1): + print(f" {i}. {formula}") + print("\n实际公式列表:") + for i, formula in enumerate(all_inline_formulas, 1): + print(f" {i}. {formula}") + + # 找出差异 + print("\n差异分析:") + if len(all_inline_formulas) > len(expect_inline_formulas): + print(f"多提取了 {len(all_inline_formulas) - len(expect_inline_formulas)} 个公式:") + extra_formulas = all_inline_formulas[len(expect_inline_formulas):] + for i, formula in enumerate(extra_formulas, 1): + print(f" {i}. {formula}") + else: + print(f"少提取了 {len(expect_inline_formulas) - len(all_inline_formulas)} 个公式:") + missing_formulas = expect_inline_formulas[len(all_inline_formulas):] + for i, formula in enumerate(missing_formulas, 1): + print(f" {i}. {formula}") + print("=" * 80 + "\n") + self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas)) - for expect, formula in zip(expect_inline_formulas, all_inline_formulas): - # print('inline expect::::::::', expect) - # print('inline answer::::::::', formula) - self.assertEqual(expect, formula) def write_to_html(self, answers, file_name): file_name = file_name.split('.')[0] diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json index d8f038a2..34236da4 100644 --- a/tests/llm_web_kit/input/assets/content_json.json +++ b/tests/llm_web_kit/input/assets/content_json.json @@ -168,7 +168,7 @@ "content": { "math_content": "a^2 + b^2 = c^2", "math_type": "latex", - "by": "None" + "by": "mathjax_mock" } }, { From 08daa672a20b4cb8033798fef92998aeb797c69d Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Wed, 22 Oct 2025 16:17:11 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E7=AE=80=E5=8C=96math=E6=8A=BD=E5=8F=96?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/common.py | 18 +++++++++++------- .../recognizer/cc_math/tag_common_modify.py | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index 9dea5eba..c1ce7377 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -537,7 +537,7 @@ def fix_mathml_superscript(self, mathml_str): parent.remove(msup) return etree.tostring(root, encoding='unicode', pretty_print=True) - def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func) -> HtmlElement: + def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement) -> HtmlElement: """替换数学公式节点. Args: @@ -593,12 +593,16 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str: print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$')) print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)')) print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)')) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'`x=(-b +- sqrt(b^2 - 4ac))/(2a)`
'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'like this: \`E=mc^2\`
'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.
'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'`(x+1)/x^2``1/3245`
'),None,True)) - print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end
'),None,False)) - print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'\( \newcommand{\norm}[1]{\| #1 \|}\)
'),None,False)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'`x=(-b +- sqrt(b^2 - 4ac))/(2a)`
'), + True)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'like this: \`E=mc^2\`
'), True)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.
'), + True)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'`(x+1)/x^2``1/3245`
'), True)) + print(cm.replace_math('ccmath-interline','latex','', html_to_element(r'start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end
'), + False)) + print(cm.replace_math('ccmath-inline','latex','', html_to_element(r'\( \newcommand{\norm}[1]{\| #1 \|}\)
'), + False)) # cm.url = 'mathhelpforum.com' # print(cm.wrap_math_md_custom(r'\begin{align} a^2+b=c\end{align}\
')) # print(cm.wrap_math_md_custom(r'
dz=\frac{1}{2}\frac{dx}{\cos ^2 x}
')) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py index 6801cf43..aaa4a9de 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py @@ -17,7 +17,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement): tail = node.tail new_span.tail = None for new_tag, math_type in tag_math_type_list: - new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None) + new_span = cm.replace_math(new_tag, math_type, math_render, new_span) new_span.tail = tail replace_element(node,new_span) From 1ee8049828f0b162929d123636437b0cf24d1825 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Wed, 22 Oct 2025 16:27:34 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E7=AE=80=E5=8C=96math=E6=8A=BD=E5=8F=96?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/common.py | 10 ----- .../extractor/html/recognizer/test_math.py | 39 ++++--------------- 2 files changed, 7 insertions(+), 42 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index c1ce7377..1d5ef3a2 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -593,16 +593,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str: print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$')) print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)')) print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)')) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'
`x=(-b +- sqrt(b^2 - 4ac))/(2a)`
'), - True)) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'like this: \`E=mc^2\`
'), True)) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.
'), - True)) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'`(x+1)/x^2``1/3245`
'), True)) - print(cm.replace_math('ccmath-interline','latex','', html_to_element(r'start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end
'), - False)) - print(cm.replace_math('ccmath-inline','latex','', html_to_element(r'\( \newcommand{\norm}[1]{\| #1 \|}\)
'), - False)) # cm.url = 'mathhelpforum.com' # print(cm.wrap_math_md_custom(r'\begin{align} a^2+b=c\end{align}\
')) # print(cm.wrap_math_md_custom(r'
dz=\frac{1}{2}\frac{dx}{\cos ^2 x}
')) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 679690a0..79d010aa 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -511,7 +511,6 @@ def test_math_recognizer_html(self): # print('answers::::::::', answers) # self.write_to_html(answers, test_case['input'][0]) # 检查行内公式抽取正确性 - # 检查行内公式抽取正确性 if test_case.get('expected_inline', None): # 从所有parts中提取所有行内公式 all_inline_formulas = [] @@ -522,40 +521,16 @@ def test_math_recognizer_html(self): for inline_elem in inline_elements: formula = inline_elem.text.replace('\n', '').strip() all_inline_formulas.append(formula) - + # print(f"Found {len(all_inline_formulas)} total inline formulas") + # print(f"Total new_parts: {len(new_parts)}") expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip() expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula] - - # 如果数量不匹配,输出详细信息 - if len(all_inline_formulas) != len(expect_inline_formulas): - print("\n" + "=" * 80) - print("行内公式抽取出错!") - print("=" * 80) - print(f"出错样例: {test_case['input']}") - print(f"预期公式数: {len(expect_inline_formulas)}") - print(f"实际公式数: {len(all_inline_formulas)}") - print("\n预期公式列表:") - for i, formula in enumerate(expect_inline_formulas, 1): - print(f" {i}. {formula}") - print("\n实际公式列表:") - for i, formula in enumerate(all_inline_formulas, 1): - print(f" {i}. {formula}") - - # 找出差异 - print("\n差异分析:") - if len(all_inline_formulas) > len(expect_inline_formulas): - print(f"多提取了 {len(all_inline_formulas) - len(expect_inline_formulas)} 个公式:") - extra_formulas = all_inline_formulas[len(expect_inline_formulas):] - for i, formula in enumerate(extra_formulas, 1): - print(f" {i}. {formula}") - else: - print(f"少提取了 {len(expect_inline_formulas) - len(all_inline_formulas)} 个公式:") - missing_formulas = expect_inline_formulas[len(all_inline_formulas):] - for i, formula in enumerate(missing_formulas, 1): - print(f" {i}. {formula}") - print("=" * 80 + "\n") - + # print(f"Expected {len(expect_inline_formulas)} inline formulas") self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas)) + for expect, formula in zip(expect_inline_formulas, all_inline_formulas): + # print('inline expect::::::::', expect) + # print('inline answer::::::::', formula) + self.assertEqual(expect, formula) def write_to_html(self, answers, file_name): file_name = file_name.split('.')[0]