# Source: src/pyodide/internal/workers-api/src/workers/_workers.py
# HTMLRewriter is re-exported from workers/__init__.py and listed in its
# __all__.

# Names of the callbacks looked up on a handler object registered with
# HTMLRewriter.on() (element handlers) and HTMLRewriter.onDocument()
# (document handlers) respectively.
_ELEMENT_HANDLER_METHODS = ("element", "comments", "text")
_DOCUMENT_HANDLER_METHODS = ("doctype", "comments", "text", "end")


def _make_js_handler(handler, method_names, proxies):
    """Build a JS object exposing the callbacks that *handler* implements.

    For every name in *method_names* that resolves to a non-``None``
    attribute on *handler*, the attribute is wrapped with ``create_proxy``
    and then with the entrypoint helper's ``createHandlerGuard`` before being
    set on the JS object under the same name.

    Args:
        handler: Python object whose attributes are looked up by name.
        method_names: Iterable of callback names to look up.
        proxies: List that every created guard proxy is appended to, so the
            caller can destroy them later.

    Returns:
        The JS object holding the guarded callbacks.
    """
    js_obj = Object.new()
    for name in method_names:
        method = getattr(handler, name, None)
        if method is not None:
            proxy = _pyodide_entrypoint_helper.createHandlerGuard(create_proxy(method))
            proxies.append(proxy)
            setattr(js_obj, name, proxy)
    return js_obj


class HTMLRewriter:
    """Python front end for the JavaScript ``HTMLRewriter``.

    Handlers are recorded by :meth:`on` / :meth:`onDocument` and only turned
    into JS proxies when :meth:`transform` is called, so one instance can be
    reused for several responses.
    """

    def __init__(self):
        # Registration order is preserved; entries are either
        # ("element", selector, handler) or ("document", handler).
        self._handlers: list[tuple] = []
        # Testing only, stores the proxies created for handlers
        self._last_handler_proxies: list[JsProxy] | None = None

    def on(self, selector: str, handlers: object) -> "HTMLRewriter":
        """Register *handlers* for elements matching *selector*; returns self."""
        self._handlers.append(("element", selector, handlers))
        return self

    def onDocument(self, handlers: object) -> "HTMLRewriter":
        """Register document-level *handlers*; returns self."""
        self._handlers.append(("document", handlers))
        return self

    @staticmethod
    def _make_cleanup(proxies):
        """Return a callable that destroys every proxy in *proxies* once.

        The list is read when the callable first runs, so proxies appended
        after this call are destroyed too. Later invocations are no-ops:
        cleanup can be reached from more than one place (stream completion
        and stream cancellation) and must not call ``destroy()`` twice on the
        same proxy.
        """
        done = False

        def cleanup():
            nonlocal done
            if done:
                return
            done = True
            for proxy in proxies:
                proxy.destroy()

        return cleanup

    def transform(self, response: "Response") -> "Response":
        """Apply the registered handlers to *response*.

        Creates a fresh JS ``HTMLRewriter``, registers a guarded proxy object
        for each recorded handler, transforms ``response.js_object`` and
        returns a new ``Response`` whose body re-streams the transformed
        body. All proxies created here are destroyed when that stream
        finishes, errors or is cancelled, when the transformed response has
        no body, or when setting up the transform raises.

        Args:
            response: The response to rewrite.

        Returns:
            A ``Response`` wrapping the transformed result.
        """
        js_rewriter = js.HTMLRewriter.new()
        handler_proxies: list[JsProxy] = []
        cleanup = self._make_cleanup(handler_proxies)

        try:
            for kind, *details in self._handlers:
                if kind == "element":
                    selector, handler = details
                    js_rewriter.on(
                        selector,
                        _make_js_handler(
                            handler, _ELEMENT_HANDLER_METHODS, handler_proxies
                        ),
                    )
                else:
                    (handler,) = details
                    js_rewriter.onDocument(
                        _make_js_handler(
                            handler, _DOCUMENT_HANDLER_METHODS, handler_proxies
                        )
                    )

            self._last_handler_proxies = handler_proxies
            transformed = js_rewriter.transform(response.js_object)

            if transformed.body is None:
                # No stream is created below, so neither start() nor cancel()
                # would ever release the proxies; release them now.
                cleanup()
                return Response(transformed)

            reader = transformed.body.getReader()

            async def start(controller):
                # Pump the transformed body into the wrapping stream.
                try:
                    while True:
                        result = await reader.read()
                        if result.done:
                            controller.close()
                            break
                        controller.enqueue(result.value)
                except Exception as e:
                    controller.error(e)
                finally:
                    cleanup()

            async def cancel(reason):
                try:
                    await reader.cancel(reason)
                finally:
                    cleanup()

            start_proxy = create_proxy(start)
            cancel_proxy = create_proxy(cancel)
            # Tracked alongside the handler proxies so cleanup() frees them too.
            handler_proxies.append(start_proxy)
            handler_proxies.append(cancel_proxy)

            wrapped_body = js.ReadableStream.new(
                start=start_proxy,
                cancel=cancel_proxy,
            )

            return Response(
                js.Response.new(
                    wrapped_body,
                    status=transformed.status,
                    statusText=transformed.statusText,
                    headers=transformed.headers,
                )
            )
        except BaseException:
            # Setup failed part-way: do not leak the proxies created so far.
            cleanup()
            raise
// Source: src/pyodide/python-entrypoint-helper.ts
// createHandlerGuard is also listed on the PyodideEntrypointHelper type and
// on the helper object assembled in setDoAnImport.

/**
 * A callable Python proxy that exposes destroy() and arbitrary other
 * properties.
 */
interface PythonHandlerProxy {
  (...args: unknown[]): unknown;
  destroy(): void;
  [key: string | symbol]: unknown;
}

/**
 * Wraps a Python handler proxy in a guard.
 *
 * While the guard is active, calls and property reads are forwarded to the
 * wrapped proxy. After destroy() has been called on the guard it is inert:
 * calling it returns undefined and every property other than `destroy` and
 * `_active` reads as a function returning undefined, instead of surfacing
 * "Object has already been destroyed" errors from the Python proxy.
 *
 * This exists so that handlers invoked after Python destroyed the proxy do
 * not throw.
 * TODO(later): Ideally, we should control the lifetime of the proxy and
 * destroy it when we are certain that it is no longer needed.
 */
export function createHandlerGuard(pythonProxy: PythonHandlerProxy): any {
  let active = true;

  // Destroys the wrapped proxy at most once; failures are deliberately
  // ignored.
  const destroyOnce = (): void => {
    if (!active) {
      return;
    }
    active = false;
    try {
      pythonProxy.destroy();
    } catch (_e) {
      // Ignore errors during destroy
    }
  };

  // Keys answered by the guard itself, regardless of the active state.
  const isGuardKey = (prop: string | symbol): boolean =>
    prop === 'destroy' || prop === '_active';

  const traps: ProxyHandler<() => void> = {
    get: (_target, prop): unknown => {
      if (prop === '_active') {
        return active;
      }
      if (prop === 'destroy') {
        return (): void => destroyOnce();
      }
      if (!active) {
        // After destruction every other property reads as a no-op function.
        return (): undefined => undefined;
      }

      const value = pythonProxy[prop];
      if (typeof value !== 'function') {
        return value;
      }
      // Keep `this` bound to the Python proxy when the member is invoked.
      const method = value as (...args: unknown[]) => unknown;
      return (...args: unknown[]): unknown => method.apply(pythonProxy, args);
    },

    apply: (_target, _thisArg, args): unknown =>
      active ? pythonProxy(...args) : undefined,

    has: (_target, prop): boolean =>
      isGuardKey(prop) || (active && prop in pythonProxy),
  };

  return new Proxy(function () {}, traps);
}
# Source: src/workerd/server/tests/python/pytest/tests/test_htmlrewriter.py
# The module is registered as "tests/test_htmlrewriter.py" in pytest.wd-test,
# which also adds the "streams_enable_constructors" flag for these tests.
#
# NOTE(review): the HTML fixtures below were reconstructed; the markup inside
# the original string literals was lost. Confirm against the upstream file.
import asyncio
import contextlib

import pytest
from workers import HTMLRewriter, Response

from pyodide.ffi import JsException


def _html_response(html: str) -> Response:
    """Return a text/html ``Response`` with *html* as its body."""
    return Response(html, headers={"content-type": "text/html"})


def is_proxy_destroyed(proxy) -> bool:
    """Report whether *proxy* has been destroyed.

    Handler guards expose ``_active``; for other proxies the attribute lookup
    either falls back to the default (treated as alive) or raises a
    ``JsException`` whose message says the object was destroyed.
    """
    try:
        return not getattr(proxy, "_active", True)
    except JsException as e:
        return "Object has already been destroyed" in str(e)


@pytest.mark.asyncio
async def test_sync_element_handler():
    """A synchronous ``element`` callback is invoked and can edit the element."""

    class Handler:
        def __init__(self):
            self.called = False

        def element(self, el):
            self.called = True
            el.setAttribute("data-modified", "true")

    handler = Handler()
    rewriter = HTMLRewriter()
    rewriter.on("div", handler)
    result = rewriter.transform(_html_response("<div>Hello</div>"))

    text = await result.text()
    assert handler.called, "Handler should have been called"
    assert 'data-modified="true"' in text, f"Expected modified attribute in: {text}"


@pytest.mark.asyncio
async def test_async_element_handler():
    """An ``async def`` element callback is awaited before output is produced."""

    class AsyncHandler:
        def __init__(self):
            self.called = False

        async def element(self, el):
            await asyncio.sleep(0.01)
            self.called = True
            el.setAttribute("data-async", "true")

    handler = AsyncHandler()
    rewriter = HTMLRewriter()
    rewriter.on("div", handler)
    result = rewriter.transform(_html_response("<div>Hello</div>"))

    text = await result.text()
    assert handler.called, "Async handler should have been called"
    assert 'data-async="true"' in text, f"Expected async attribute in: {text}"


@pytest.mark.asyncio
async def test_document_handler():
    """Document handlers see the doctype and the end of the document."""
    appended = "<!-- appended -->"

    class DocHandler:
        def __init__(self):
            self.saw_doctype = False
            self.saw_end = False

        def doctype(self, doctype):
            self.saw_doctype = True

        def end(self, end):
            self.saw_end = True
            end.append(appended, html=True)

    handler = DocHandler()
    rewriter = HTMLRewriter()
    rewriter.onDocument(handler)
    result = rewriter.transform(
        _html_response("<!DOCTYPE html><html><body>Test</body></html>")
    )

    text = await result.text()
    assert handler.saw_doctype, "Should have seen doctype"
    assert handler.saw_end, "Should have seen end"
    assert appended in text, f"Expected appended content in: {text}"


@pytest.mark.asyncio
async def test_multiple_handlers():
    """Handlers registered for different selectors are each applied."""

    class Handler1:
        def element(self, el):
            el.setAttribute("data-h1", "true")

    class Handler2:
        def element(self, el):
            el.setAttribute("data-h2", "true")

    rewriter = HTMLRewriter()
    rewriter.on("div", Handler1())
    rewriter.on("span", Handler2())

    result = rewriter.transform(_html_response("<div>D</div><span>S</span>"))
    text = await result.text()

    assert 'data-h1="true"' in text, f"Handler1 didn't work: {text}"
    assert 'data-h2="true"' in text, f"Handler2 didn't work: {text}"


@pytest.mark.asyncio
async def test_rewriter_reuse():
    """One rewriter instance can transform several responses in turn."""
    html = "<div>Test</div>"

    class Counter:
        def __init__(self):
            self.count = 0

        def element(self, el):
            self.count += 1
            el.setAttribute("data-count", str(self.count))

    counter = Counter()
    rewriter = HTMLRewriter()
    rewriter.on("div", counter)

    text1 = await rewriter.transform(_html_response(html)).text()
    assert 'data-count="1"' in text1, f"First transform failed: {text1}"
    assert counter.count == 1

    text2 = await rewriter.transform(_html_response(html)).text()
    assert 'data-count="2"' in text2, f"Second transform failed: {text2}"
    assert counter.count == 2


@pytest.mark.asyncio
async def test_stream_cancellation():
    """Cancelling the transformed body must not raise out of the test."""

    class Handler:
        def __init__(self):
            self.called = False

        def element(self, el):
            self.called = True

    rewriter = HTMLRewriter()
    rewriter.on("div", Handler())

    result = rewriter.transform(_html_response("<div>Test</div>"))

    reader = result.js_object.body.getReader()
    await reader.cancel()

    # Best effort: a read after cancel may fail; either outcome is accepted.
    with contextlib.suppress(Exception):
        await reader.read()


@pytest.mark.asyncio
async def test_proxy_cleanup_on_completion():
    """All proxies are destroyed once the body has been fully consumed."""

    class Handler:
        def element(self, el):
            el.setAttribute("data-test", "true")

    rewriter = HTMLRewriter()
    rewriter.on("div", Handler())
    result = rewriter.transform(_html_response("<div>Test</div>"))

    proxies = rewriter._last_handler_proxies
    assert proxies is not None
    assert len(proxies) == 3  # 1 handler + 2 internal proxies

    for proxy in proxies:
        assert not is_proxy_destroyed(proxy), "Proxy should be alive before consumption"

    text = await result.text()
    assert 'data-test="true"' in text

    for proxy in proxies:
        assert is_proxy_destroyed(proxy), "Proxy should be destroyed after consumption"


@pytest.mark.asyncio
async def test_proxy_cleanup_on_cancel():
    """All proxies are destroyed when the consumer cancels the body."""

    class Handler:
        def element(self, el):
            el.setAttribute("data-test", "true")

    rewriter = HTMLRewriter()
    rewriter.on("div", Handler())
    result = rewriter.transform(_html_response("<div>Test</div>"))

    proxies = rewriter._last_handler_proxies
    assert proxies is not None
    assert len(proxies) == 3  # 1 handler + 2 internal proxies

    for proxy in proxies:
        assert not is_proxy_destroyed(proxy), "Proxy should be alive before cancel"

    reader = result.js_object.body.getReader()
    await reader.cancel()

    # Best effort: a read after cancel may fail; either outcome is accepted.
    with contextlib.suppress(Exception):
        await reader.read()

    for proxy in proxies:
        assert is_proxy_destroyed(proxy), "Proxy should be destroyed after cancel"


@pytest.mark.asyncio
async def test_exception_handling():
    """An exception raised by a handler surfaces when the body is read."""

    class Handler:
        def element(self, el):
            el.setAttribute("data-test", "true")
            raise RuntimeError("Test exception")

    rewriter = HTMLRewriter()
    rewriter.on("div", Handler())
    result = rewriter.transform(_html_response("<div>Test</div>"))

    # pytest.raises fails the test when nothing is raised, unlike a bare
    # try/except whose assertion only runs on the error path.
    with pytest.raises(Exception, match="PythonError"):
        await result.text()