diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..29dda20c
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,18 @@
+{
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Run pwb.py with test.py",
+ "type": "python",
+ "request": "launch",
+ "program": "/home/lokas/PycharmProjects/pythonProject3/core_stable/pwb.py",
+ "args": [
+ "-dir:/home/lokas/PycharmProjects/pythonProject3/core_stable",
+ "/home/lokas/PycharmProjects/pythonProject3/code/tasks/InfoboxSync/test.py"
+ ],
+ "console": "integratedTerminal",
+ "justMyCode": false,
+ "python": "/usr/bin/python3.9"
+ }
+ ]
+}
diff --git a/output/paul_abasolo.json b/output/paul_abasolo.json
new file mode 100644
index 00000000..66ef445b
--- /dev/null
+++ b/output/paul_abasolo.json
@@ -0,0 +1,800 @@
+{
+ "page_title": "Paul Abasolo",
+ "template_type": "football_biography",
+ "arabic_fields": {
+ "أندية_الشباب": {
+ "value": [
+ "Lauaxeta Ikastola",
+ "[[Athletic Bilbao]]"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 2,
+ "original_keys": [
+ "youthclubs1",
+ "youthclubs2"
+ ]
+ },
+ "سنوات_الشباب": {
+ "value": [
+ "1995–1996",
+ "1996–2002"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 2,
+ "original_keys": [
+ "youthyears1",
+ "youthyears2"
+ ]
+ },
+ "أندية": {
+ "value": [
+ "[[CD Basconia|Basconia]]",
+ "[[Barakaldo CF|Barakaldo]]",
+ "[[SD Eibar|Eibar]]",
+ "→ [[SD Lemona|Lemona]] (loan)",
+ "→ [[Logroñés CF|Logroñés]] (loan)",
+ "[[Logroñés CF|Logroñés]]",
+ "[[Real Unión]]",
+ "Iurretako",
+ "[[SD Lemona|Lemona]]",
+ "[[Real Oviedo|Oviedo]]",
+ "[[Sestao River Club|Sestao]]",
+ "[[Amurrio Club|Amurrio]]",
+ "[[Zamudio SD|Zamudio]]",
+ "[[Club Portugalete|Portugalete]]",
+ "Batea"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 15,
+ "original_keys": [
+ "clubs1",
+ "clubs2",
+ "clubs3",
+ "clubs4",
+ "clubs5",
+ "clubs6",
+ "clubs7",
+ "clubs8",
+ "clubs9",
+ "clubs10",
+ "clubs11",
+ "clubs12",
+ "clubs13",
+ "clubs14",
+ "clubs15"
+ ]
+ },
+ "سنوات": {
+ "value": [
+ "2002–2003",
+ "2003–2004",
+ "2004–2006",
+ "2005",
+ "2005–2006",
+ "2006–2007",
+ "2007–2010",
+ "2010",
+ "2011",
+ "2011–2012",
+ "2012–2013",
+ "2014",
+ "2015–2016",
+ "2016–2017",
+ "2018–2021"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 15,
+ "original_keys": [
+ "years1",
+ "years2",
+ "years3",
+ "years4",
+ "years5",
+ "years6",
+ "years7",
+ "years8",
+ "years9",
+ "years10",
+ "years11",
+ "years12",
+ "years13",
+ "years14",
+ "years15"
+ ]
+ },
+ "مباريات": {
+ "value": [
+ "35",
+ "24",
+ "2",
+ "16",
+ "24",
+ "29",
+ "82",
+ "11",
+ "21",
+ "26",
+ "13",
+ "45",
+ "12",
+ "41"
+ ],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 14,
+ "original_keys": [
+ "caps1",
+ "caps2",
+ "caps3",
+ "caps4",
+ "caps5",
+ "caps6",
+ "caps7",
+ "caps9",
+ "caps10",
+ "caps11",
+ "caps12",
+ "caps13",
+ "caps14",
+ "caps15"
+ ]
+ },
+ "أهداف": {
+ "value": [
+ "5",
+ "1",
+ "0",
+ "4",
+ "2",
+ "8",
+ "12",
+ "1",
+ "2",
+ "0",
+ "5",
+ "17",
+ "8",
+ "10"
+ ],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 14,
+ "original_keys": [
+ "goals1",
+ "goals2",
+ "goals3",
+ "goals4",
+ "goals5",
+ "goals6",
+ "goals7",
+ "goals9",
+ "goals10",
+ "goals11",
+ "goals12",
+ "goals13",
+ "goals14",
+ "goals15"
+ ]
+ },
+ "منتخب_وطني": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "سنوات_وطنية": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "مباريات_وطنية": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 0,
+ "original_keys": []
+ },
+ "أهداف_وطنية": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 0,
+ "original_keys": []
+ },
+ "أندية_مدرب": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "سنوات_مدرب": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "اسم": {
+ "value": "Paul Abasolo",
+ "type": "text",
+ "original_key": "name",
+ "validation": {
+ "is_valid": true,
+ "length": 12,
+ "has_special_chars": false
+ }
+ },
+ "الاسم الكامل": {
+ "value": "Paul Abasolo Amantegi",
+ "type": "text",
+ "original_key": "fullname",
+ "validation": {
+ "is_valid": true,
+ "length": 21,
+ "has_special_chars": false
+ }
+ },
+ "تاريخ الولادة": {
+ "value": "{{birth date and age|1984|6|29|df=yes}}",
+ "type": "raw",
+ "original_key": "birth_date",
+ "validation": {
+ "is_valid": true
+ }
+ },
+ "مكان الولادة": {
+ "value": "[[Durango, Spain]]",
+ "type": "raw",
+ "original_key": "birth_place",
+ "validation": {
+ "is_valid": true
+ }
+ },
+ "الطول": {
+ "value": 1.84,
+ "type": "number",
+ "original_key": "height",
+ "validation": {
+ "is_valid": true,
+ "numeric_value": 1.84,
+ "has_units": true
+ },
+ "numeric_value": 1.84
+ },
+ "المركز": {
+ "value": "[[Forward (association football)|Forward]]",
+ "type": "raw",
+ "original_key": "position",
+ "validation": {
+ "is_valid": true
+ }
+ },
+ "مجموع_مباريات": {
+ "value": 381.0,
+ "type": "number",
+ "original_key": "totalcaps",
+ "validation": {
+ "is_valid": true,
+ "numeric_value": 381.0,
+ "has_units": true
+ },
+ "numeric_value": 381.0
+ },
+ "إجمالي الأهداف": {
+ "value": 75.0,
+ "type": "number",
+ "original_key": "totalgoals",
+ "validation": {
+ "is_valid": true,
+ "numeric_value": 75.0,
+ "has_units": true
+ },
+ "numeric_value": 75.0
+ }
+ },
+ "metadata": {
+ "categories": [
+ "1984 births",
+ "Living people",
+ "Footballers from Durango, Biscay",
+ "Spanish men's footballers",
+ "Men's association football forwards",
+ "Segunda División players",
+ "Segunda División B players",
+ "Tercera División players",
+ "Divisiones Regionales de Fútbol players",
+ "CD Basconia footballers",
+ "Athletic Bilbao footballers",
+ "Barakaldo CF footballers",
+ "SD Eibar footballers",
+ "SD Lemona footballers",
+ "Logroñés CF footballers",
+ "Real Unión footballers",
+ "Real Oviedo players",
+ "Sestao River Club footballers",
+ "Amurrio Club footballers",
+ "Zamudio SD players",
+ "Club Portugalete players",
+ "People convicted of sexual assault",
+ "21st-century Spanish sportsmen"
+ ],
+ "links": [
+ "Durango, Spain",
+ "Forward (association football)",
+ "Athletic Bilbao",
+ "CD Basconia",
+ "Barakaldo CF",
+ "SD Eibar",
+ "SD Lemona",
+ "Logroñés CF",
+ "Logroñés CF",
+ "Real Unión",
+ "SD Lemona",
+ "Real Oviedo",
+ "Sestao River Club",
+ "Amurrio Club",
+ "Zamudio SD",
+ "Club Portugalete",
+ "Association football",
+ "Forward (association football)",
+ "Durango, Spain",
+ "Biscay",
+ "Athletic Bilbao",
+ "farm team",
+ "CD Basconia",
+ "Mundo Deportivo",
+ "Segunda División B",
+ "Basque Country (autonomous community)",
+ "SD Eibar",
+ "2004–05 Segunda División",
+ "Segunda División",
+ "SD Lemona",
+ "Logroñés CF",
+ "El Correo",
+ "2009–10 Segunda División",
+ "Football in Spain",
+ "Marca (newspaper)",
+ "El Mundo (Spain)",
+ "Real Unión",
+ "2008–09 Segunda División B",
+ "ABC (newspaper)",
+ "Real Oviedo",
+ "Sestao River Club",
+ "sexual assault",
+ "Government of Spain",
+ "La Nueva España",
+ "Argia (magazine)"
+ ],
+ "template_name": "football_biography",
+ "total_mapped_fields": 20,
+ "original_field_count": 70
+ },
+ "raw_content": "{{short description|Spanish footballer}}\n{{family name hatnote|Abasolo|Amantegi|lang=Spanish}}\n{{Use dmy dates|date=January 2024}}\n{{Infobox football biography\n| name = Paul Abasolo\n| image = \n| fullname = Paul Abasolo Amantegi\n| birth_date = {{birth date and age|1984|6|29|df=yes}} \n| birth_place = [[Durango, Spain]]\n| height = {{height|m=1.84}}\n| position = [[Forward (association football)|Forward]]\n| currentclub = \n| clubnumber = \n| youthyears1 = 1995–1996 | youthclubs1 = Lauaxeta Ikastola\n| youthyears2 = 1996–2002 | youthclubs2 = [[Athletic Bilbao]]\n| years1 = 2002–2003 | clubs1 = [[CD Basconia|Basconia]] | caps1 = 35 | goals1 = 5\n| years2 = 2003–2004 | clubs2 = [[Barakaldo CF|Barakaldo]] | caps2 = 24 | goals2 = 1\n| years3 = 2004–2006 | clubs3 = [[SD Eibar|Eibar]] | caps3 = 2 | goals3 = 0\n| years4 = 2005 | clubs4 = → [[SD Lemona|Lemona]] (loan) | caps4 = 16 | goals4 = 4\n| years5 = 2005–2006 | clubs5 = → [[Logroñés CF|Logroñés]] (loan) | caps5 = 24 | goals5 = 2\n| years6 = 2006–2007 | clubs6 = [[Logroñés CF|Logroñés]] | caps6 = 29 | goals6 = 8\n| years7 = 2007–2010 | clubs7 = [[Real Unión]] | caps7 = 82 | goals7 = 12\n| years8 = 2010 | clubs8 = Iurretako | caps8 = | goals8 =\n| years9 = 2011 | clubs9 = [[SD Lemona|Lemona]] | caps9 = 11 | goals9 = 1\n| years10 = 2011–2012 | clubs10 = [[Real Oviedo|Oviedo]] | caps10 = 21 | goals10 = 2\n| years11 = 2012–2013 | clubs11 = [[Sestao River Club|Sestao]] | caps11 = 26 | goals11 = 0\n| years12 = 2014 | clubs12 = [[Amurrio Club|Amurrio]] | caps12 = 13 | goals12 = 5 \n| years13 = 2015–2016 | clubs13 = [[Zamudio SD|Zamudio]] | caps13 = 45 | goals13 = 17\n| years14 = 2016–2017 | clubs14 = [[Club Portugalete|Portugalete]] | caps14 = 12 | goals14 = 8\n| years15 = 2018–2021 | clubs15 = Batea | caps15 = 41 | goals15 = 10\n| totalcaps = 381 | totalgoals = 75\n| club-update =\n| nationalteam-update =\n}}\n'''Paul Abasolo Amantegi''' ({{IPA|es|pawl aβaˈsolo amanˈtexi}}; born 29 June 1984) is a Spanish former [[Association football|footballer]] who played as a [[Forward (association football)|forward]].\n\n==Club career==\nBorn in [[Durango, Spain|Durango]], [[Biscay]], Abasolo spent seven years connected with [[Athletic Bilbao]], six in the youth system and one with the [[farm team]], [[CD Basconia]].[[https://www.mundodeportivo.com/20111207/athletic-bilbao/entrevista-abasolo-gozada-jugar-athletic_54239906805.html Abasolo: \"Para mí es una gozada ver jugar a este Athletic\" (Abasolo: \"I'm having a blast watching this Athletic play\")]; [[Mundo Deportivo]], 7 December 2011 (in Spanish)] Released in 2003, he played the better part of the following six years in the [[Segunda División B]] and in his native [[Basque Country (autonomous community)|Basque Country]], the sole exception being [[SD Eibar]] in the first part of the [[2004–05 Segunda División|2004–05 season]] in the [[Segunda División]], with that club loaning him consecutively to two other teams in division three, [[SD Lemona]][[http://hemeroteca.mundodeportivo.com/preview/2005/02/01/pagina-28/1348145/pdf.html#&mode=fullScreen Cuatro fichajes sobre la bocina (Four signings at the buzzer)]; Mundo Deportivo, 1 February 2005 (in Spanish)] and [[Logroñés CF]].[[https://www.elcorreo.com/vizcaya/20070716/deportes/la-rioja/logrones-inicia-trabajo-jugadores-20070716.html El Logroñés CF inicia hoy el trabajo con 20 jugadores (Logroñés CF start working today with 20 players)]; [[El Correo]], 23 July 2007 (in Spanish)]\n\nIn the [[2009–10 Segunda División|2009–10 campaign]], 
Abasolo competed for the second time in the second tier of [[Football in Spain|Spanish football]], scoring four goals[[http://www.marca.com/2009/10/03/futbol/2adivision/1254596582.html El Real Unión se aprovecha de un Castellón que no levanta cabeza (Real Unión take advantage of sunken Castellón)] {{Webarchive|url=https://web.archive.org/web/20140821065415/http://www.marca.com/2009/10/03/futbol/2adivision/1254596582.html|date=21 August 2014}}; [[Marca (newspaper)|Marca]], 3 October 2009 (in Spanish)][[http://www.marca.com/2009/10/11/futbol/2adivision/1255290535.html Un gran Real Unión dejó sin dos puntos al Betis (Great Real Unión rob Betis of two points)] {{Webarchive|url=https://web.archive.org/web/20140821065450/http://www.marca.com/2009/10/11/futbol/2adivision/1255290535.html|date=21 August 2014}}; Marca, 11 October 2009 (in Spanish)][[http://www.elmundo.es/elmundo/2009/10/25/paisvasco/1256496810.html El Real Unión cae 2–1 ante el Cádiz en el Carranza con un gol de Ogbeche (Real Unión fall 2–1 against Cádiz at the Carranza with Ogbeche goal)] {{Webarchive|url=https://web.archive.org/web/20160609194820/http://www.elmundo.es/elmundo/2009/10/25/paisvasco/1256496810.html|date=9 June 2016}}; [[El Mundo (Spain)|El Mundo]], 25 October 2009 (in Spanish)][[http://www.marca.com/2010/05/22/futbol/2adivision/1274555627.html El Real Unión cree en la salvación ante un 'novato' Numancia (Real Unión believe in survival against 'rookie' Numancia)] {{Webarchive|url=https://web.archive.org/web/20171118222012/http://www.marca.com/2010/05/22/futbol/2adivision/1274555627.html|date=18 November 2017}}; Marca, 22 May 2010 (in Spanish)] in 34 games for [[Real Unión]][[http://www.marca.com/2009/07/15/futbol/equipos/real_union/1247674204.html Paul Abasolo no jugará con el Athletic (Paul Abasolo will not play for Athletic)] {{Webarchive|url=https://web.archive.org/web/20120110114532/http://www.marca.com/2009/07/15/futbol/equipos/real_union/1247674204.html|date=10 January 2012}}; Marca, 15 July 2009 (in Spanish)] as they suffered relegation one year after [[2008–09 Segunda División B|being promoted]].[[https://www.abc.es/deportes/futbol/hercules-primera-201006190000_noticia.html El Hércules vuelve a Primera catorce años después (Hércules return to ''Primera'' fourteen years later)]; [[ABC (newspaper)|ABC]], 19 June 2010 (in Spanish)] After a few months playing with a regional league side, he resumed his career in the third division with Lemona, [[Real Oviedo]] and [[Sestao River Club]].[[https://www.eldesmarque.com/noticias/pais-vasco/20160602/neira-abasolo-y-zarrabeitia-mas-calidad-para-el-portugalete_60053537.html Neira, Abasolo y Zarrabeitia, más calidad para el Portugalete (Neira, Abasolo and Zarrabeitia, more skill for Portugalete)]; El Desmarque, 2 June 2016 (in Spanish)]\n\n==Conviction==\nConvicted of [[sexual assault]] in July 2010 for having attacked three young women, Abasolo was acquitted on a fourth charge due to doubts of the alleged victim.[[http://www.elmundo.es/elmundo/2010/07/06/paisvasco/1278430901.html Condenan a un ex futbolista del Real Unión a 3 años de cárcel por abusos sexuales (Real Unión footballer sentenced to 3 years in jail for sexual assault)] {{Webarchive|url=https://web.archive.org/web/20180620153200/http://www.elmundo.es/elmundo/2010/07/06/paisvasco/1278430901.html|date=20 June 2018}}; El Mundo, 6 July 2010 (in Spanish)] He was eventually pardoned by the [[Government of Spain]], but this fact prevented him from being hired by his former club Athletic 
Bilbao.[[http://www.lne.es/deportes/2012/01/21/abasolo-indultado-tres-delitos-agresion-sexual/1187384.html Abasolo, indultado de tres delitos de agresión sexual (Abasolo, pardoned on three sexual assault charges)] {{Webarchive|url=https://web.archive.org/web/20160602031215/http://www.lne.es/deportes/2012/01/21/abasolo-indultado-tres-delitos-agresion-sexual/1187384.html|date=2 June 2016}}; [[La Nueva España]], 21 January 2012 (in Spanish)][[http://www.argia.com/argia-astekaria/2317/abasolo-auzia Abasolo auzia: Indultuak zabaldutako zauriak (The Abasolo case: the wounds opened by the pardon)] {{Webarchive|url=https://web.archive.org/web/20130514161758/http://www.argia.com/argia-astekaria/2317/abasolo-auzia|date=14 May 2013}}; [[Argia (magazine)|Argia]], 1 April 2012 (in Basque)]\n\n==References==\n{{Reflist}}\n\n==External links==\n*{{BDFutbol|5033}}\n*{{Futbolme|37}}\n*{{Athletic Bilbao profile|id=461/abasolo}}\n*{{LaPreferente|37445}}\n*{{Soccerway|paul-abasolo-amantegi/61737}}\n\n{{DEFAULTSORT:Abasolo, Paul}}\n[[Category:1984 births]]\n[[Category:Living people]]\n[[Category:Footballers from Durango, Biscay]]\n[[Category:Spanish men's footballers]]\n[[Category:Men's association football forwards]]\n[[Category:Segunda División players]]\n[[Category:Segunda División B players]]\n[[Category:Tercera División players]]\n[[Category:Divisiones Regionales de Fútbol players]]\n[[Category:CD Basconia footballers]]\n[[Category:Athletic Bilbao footballers]]\n[[Category:Barakaldo CF footballers]]\n[[Category:SD Eibar footballers]]\n[[Category:SD Lemona footballers]]\n[[Category:Logroñés CF footballers]]\n[[Category:Real Unión footballers]]\n[[Category:Real Oviedo players]]\n[[Category:Sestao River Club footballers]]\n[[Category:Amurrio Club footballers]]\n[[Category:Zamudio SD players]]\n[[Category:Club Portugalete players]]\n[[Category:People convicted of sexual assault]]\n[[Category:21st-century Spanish sportsmen]]",
+ "arabic_title": "بول أباسولو",
+ "translated_fields": {
+ "أندية_الشباب": {
+ "value": [
+ "Lauaxeta Ikastola",
+ "[[Athletic Bilbao]]"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 2,
+ "original_keys": [
+ "youthclubs1",
+ "youthclubs2"
+ ],
+ "translated_value": [
+ "لاوكسيتا إيكاستولا",
+ "[[Athletic Bilbao|أتلتيك بلباو]]"
+ ],
+ "translation_confidence": 0.9
+ },
+ "سنوات_الشباب": {
+ "value": [
+ "1995–1996",
+ "1996–2002"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 2,
+ "original_keys": [
+ "youthyears1",
+ "youthyears2"
+ ],
+ "translated_value": [
+ "1995–1996",
+ "1996–2002"
+ ],
+ "translation_confidence": 0.9
+ },
+ "أندية": {
+ "value": [
+ "[[CD Basconia|Basconia]]",
+ "[[Barakaldo CF|Barakaldo]]",
+ "[[SD Eibar|Eibar]]",
+ "→ [[SD Lemona|Lemona]] (loan)",
+ "→ [[Logroñés CF|Logroñés]] (loan)",
+ "[[Logroñés CF|Logroñés]]",
+ "[[Real Unión]]",
+ "Iurretako",
+ "[[SD Lemona|Lemona]]",
+ "[[Real Oviedo|Oviedo]]",
+ "[[Sestao River Club|Sestao]]",
+ "[[Amurrio Club|Amurrio]]",
+ "[[Zamudio SD|Zamudio]]",
+ "[[Club Portugalete|Portugalete]]",
+ "Batea"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 15,
+ "original_keys": [
+ "clubs1",
+ "clubs2",
+ "clubs3",
+ "clubs4",
+ "clubs5",
+ "clubs6",
+ "clubs7",
+ "clubs8",
+ "clubs9",
+ "clubs10",
+ "clubs11",
+ "clubs12",
+ "clubs13",
+ "clubs14",
+ "clubs15"
+ ],
+ "translated_value": [
+ "[[CD Basconia|باسكونيا]]",
+ "[[Barakaldo CF|باراكالدو]]",
+ "[[SD Eibar|إيبار]]",
+ "→ [[SD Lemona|ليمونا]] (إعارة)",
+ "→ [[Logroñés CF|لوغروينيس]] (إعارة)",
+ "[[Logroñés CF|لوغروينيس]]",
+ "[[Real Unión|ريال يونيون]]",
+ "إيوريتاكو",
+ "[[SD Lemona|ليمونا]]",
+ "[[Real Oviedo|أوفييدو]]",
+ "[[Sestao River Club|سستاو]]",
+ "[[Amurrio Club|أموريو]]",
+ "[[Zamudio SD|زاموديو]]",
+ "[[Club Portugalete|بورتوغاليتي]]",
+ "باتيا"
+ ],
+ "translation_confidence": 0.9
+ },
+ "سنوات": {
+ "value": [
+ "2002–2003",
+ "2003–2004",
+ "2004–2006",
+ "2005",
+ "2005–2006",
+ "2006–2007",
+ "2007–2010",
+ "2010",
+ "2011",
+ "2011–2012",
+ "2012–2013",
+ "2014",
+ "2015–2016",
+ "2016–2017",
+ "2018–2021"
+ ],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 15,
+ "original_keys": [
+ "years1",
+ "years2",
+ "years3",
+ "years4",
+ "years5",
+ "years6",
+ "years7",
+ "years8",
+ "years9",
+ "years10",
+ "years11",
+ "years12",
+ "years13",
+ "years14",
+ "years15"
+ ],
+ "translated_value": [
+ "2002–2003",
+ "2003–2004",
+ "2004–2006",
+ "2005",
+ "2005–2006",
+ "2006–2007",
+ "2007–2010",
+ "2010",
+ "2011",
+ "2011–2012",
+ "2012–2013",
+ "2014",
+ "2015–2016",
+ "2016–2017",
+ "2018–2021"
+ ],
+ "translation_confidence": 0.9
+ },
+ "مباريات": {
+ "value": [
+ "35",
+ "24",
+ "2",
+ "16",
+ "24",
+ "29",
+ "82",
+ "11",
+ "21",
+ "26",
+ "13",
+ "45",
+ "12",
+ "41"
+ ],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 14,
+ "original_keys": [
+ "caps1",
+ "caps2",
+ "caps3",
+ "caps4",
+ "caps5",
+ "caps6",
+ "caps7",
+ "caps9",
+ "caps10",
+ "caps11",
+ "caps12",
+ "caps13",
+ "caps14",
+ "caps15"
+ ],
+ "translated_value": [
+ "35",
+ "24",
+ "2",
+ "16",
+ "24",
+ "29",
+ "82",
+ "11",
+ "21",
+ "26",
+ "13",
+ "45",
+ "12",
+ "41"
+ ],
+ "translation_confidence": 0.9
+ },
+ "أهداف": {
+ "value": [
+ "5",
+ "1",
+ "0",
+ "4",
+ "2",
+ "8",
+ "12",
+ "1",
+ "2",
+ "0",
+ "5",
+ "17",
+ "8",
+ "10"
+ ],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 14,
+ "original_keys": [
+ "goals1",
+ "goals2",
+ "goals3",
+ "goals4",
+ "goals5",
+ "goals6",
+ "goals7",
+ "goals9",
+ "goals10",
+ "goals11",
+ "goals12",
+ "goals13",
+ "goals14",
+ "goals15"
+ ],
+ "translated_value": [
+ "5",
+ "1",
+ "0",
+ "4",
+ "2",
+ "8",
+ "12",
+ "1",
+ "2",
+ "0",
+ "5",
+ "17",
+ "8",
+ "10"
+ ],
+ "translation_confidence": 0.9
+ },
+ "منتخب_وطني": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "سنوات_وطنية": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "مباريات_وطنية": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 0,
+ "original_keys": []
+ },
+ "أهداف_وطنية": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "number",
+ "count": 0,
+ "original_keys": []
+ },
+ "أندية_مدرب": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "سنوات_مدرب": {
+ "value": [],
+ "type": "numbered",
+ "item_type": "raw",
+ "count": 0,
+ "original_keys": []
+ },
+ "اسم": {
+ "value": "Paul Abasolo",
+ "type": "text",
+ "original_key": "name",
+ "validation": {
+ "is_valid": true,
+ "length": 12,
+ "has_special_chars": false
+ },
+ "translated_value": "بول أباسولو",
+ "translation_confidence": 0.9
+ },
+ "الاسم الكامل": {
+ "value": "Paul Abasolo Amantegi",
+ "type": "text",
+ "original_key": "fullname",
+ "validation": {
+ "is_valid": true,
+ "length": 21,
+ "has_special_chars": false
+ },
+ "translated_value": "بول أباسولو أمانتيغي",
+ "translation_confidence": 0.9
+ },
+ "تاريخ الولادة": {
+ "value": "{{birth date and age|1984|6|29|df=yes}}",
+ "type": "raw",
+ "original_key": "birth_date",
+ "validation": {
+ "is_valid": true
+ },
+ "translated_value": "{{birth date and age|1984|6|29|df=yes}}",
+ "translation_confidence": 0.9
+ },
+ "مكان الولادة": {
+ "value": "[[Durango, Spain]]",
+ "type": "raw",
+ "original_key": "birth_place",
+ "validation": {
+ "is_valid": true
+ },
+ "translated_value": "[[Durango, Spain|دورانجو، إسبانيا]]",
+ "translation_confidence": 0.9
+ },
+ "الطول": {
+ "value": 1.84,
+ "type": "number",
+ "original_key": "height",
+ "validation": {
+ "is_valid": true,
+ "numeric_value": 1.84,
+ "has_units": true
+ },
+ "numeric_value": 1.84
+ },
+ "المركز": {
+ "value": "[[Forward (association football)|Forward]]",
+ "type": "raw",
+ "original_key": "position",
+ "validation": {
+ "is_valid": true
+ },
+ "translated_value": "[[Forward (association football)|مهاجم]]",
+ "translation_confidence": 0.9
+ },
+ "مجموع_مباريات": {
+ "value": 381.0,
+ "type": "number",
+ "original_key": "totalcaps",
+ "validation": {
+ "is_valid": true,
+ "numeric_value": 381.0,
+ "has_units": true
+ },
+ "numeric_value": 381.0
+ },
+ "إجمالي الأهداف": {
+ "value": 75.0,
+ "type": "number",
+ "original_key": "totalgoals",
+ "validation": {
+ "is_valid": true,
+ "numeric_value": 75.0,
+ "has_units": true
+ },
+ "numeric_value": 75.0
+ }
+ },
+ "translation_metadata": {
+ "service": "Google Gemini AI",
+ "target_language": "ar",
+ "translation_method": "single_request",
+ "total_fields": 20,
+ "translated_fields": 11,
+ "success": true
+ },
+ "translated_title": "بول أباسولو",
+ "arabic_template": "{{واو|صندوق معلومات سيرة كرة قدم\n|\n| أندية_الشباب1 = لاوكسيتا إيكاستولا\n| أندية_الشباب2 = [[Athletic Bilbao|أتلتيك بلباو]]\n| سنوات_الشباب1 = 1995–1996\n| سنوات_الشباب2 = 1996–2002\n| أندية1 = [[نادي باسكونيا|باسكونيا]]\n| أندية2 = [[نادي باراكالدو|باراكالدو]]\n| أندية3 = [[SD Eibar|إيبار]]\n| أندية4 = → [[SD Lemona|ليمونا]] (إعارة)\n| أندية5 = → [[Logroñés CF|لوغروينيس]] (إعارة)\n| أندية6 = [[Logroñés CF|لوغروينيس]]\n| أندية7 = [[Real Unión|ريال يونيون]]\n| أندية8 = إيوريتاكو\n| أندية9 = [[SD Lemona|ليمونا]]\n| أندية10 = [[Real Oviedo|أوفييدو]]\n| أندية11 = [[نادي سيستاو ريفر|سستاو]]\n| أندية12 = [[Amurrio Club|أموريو]]\n| أندية13 = [[Zamudio SD|زاموديو]]\n| أندية14 = [[Club Portugalete|بورتوغاليتي]]\n| أندية15 = باتيا\n| سنوات1 = 2002–2003\n| سنوات2 = 2003–2004\n| سنوات3 = 2004–2006\n| سنوات4 = 2005\n| سنوات5 = 2005–2006\n| سنوات6 = 2006–2007\n| سنوات7 = 2007–2010\n| سنوات8 = 2010\n| سنوات9 = 2011\n| سنوات10 = 2011–2012\n| سنوات11 = 2012–2013\n| سنوات12 = 2014\n| سنوات13 = 2015–2016\n| سنوات14 = 2016–2017\n| سنوات15 = 2018–2021\n| مباريات1 = 35\n| مباريات2 = 24\n| مباريات3 = 2\n| مباريات4 = 16\n| مباريات5 = 24\n| مباريات6 = 29\n| مباريات7 = 82\n| مباريات8 = 11\n| مباريات9 = 21\n| مباريات10 = 26\n| مباريات11 = 13\n| مباريات12 = 45\n| مباريات13 = 12\n| مباريات14 = 41\n| أهداف1 = 5\n| أهداف2 = 1\n| أهداف3 = 0\n| أهداف4 = 4\n| أهداف5 = 2\n| أهداف6 = 8\n| أهداف7 = 12\n| أهداف8 = 1\n| أهداف9 = 2\n| أهداف10 = 0\n| أهداف11 = 5\n| أهداف12 = 17\n| أهداف13 = 8\n| أهداف14 = 10\n| اسم = بول أباسولو\n| الاسم الكامل = بول أباسولو أمانتيغي\n| تاريخ الولادة = {{واو|birth date and age|1984|6|29|df=yes}}\n| مكان الولادة = [[دورانغو (بسكاي)|دورانجو، إسبانيا]]\n| الطول = 1.84\n| المركز = [[Forward (association football)|مهاجم]]\n| مجموع_مباريات = 381.0\n| إجمالي الأهداف = 75.0\n}}",
+ "construct_metadata": {
+ "template_type": "football_biography",
+ "field_count": 14,
+ "builder_name": "Arabic Football_Biography Builder",
+ "template_name": "صندوق معلومات سيرة كرة قدم"
+ },
+ "localization_metadata": {
+ "links_replaced": 4,
+ "templates_localized": 2,
+ "waou_templates_inserted": 2,
+ "localization_errors": []
+ },
+ "publish_metadata": {
+ "page_title": "بول أباسولو",
+ "edit_summary": "تحديث قالب السيرة الذاتية باستخدام InfoboxSync - football_biography",
+ "revision_id": 71876884,
+ "publish_success": true,
+ "published_at": "2025-08-28T10:34:26Z"
+ }
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 7348bc2b..dcf7c8df 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ SQLAlchemy == 2.0.43
typing-extensions == 4.14.1
waybackpy~=3.0.6
wikitextparser~=0.56.4
+litellm~=1.40.0
diff --git a/tasks/InfoboxSync/README.md b/tasks/InfoboxSync/README.md
new file mode 100644
index 00000000..30bc04a7
--- /dev/null
+++ b/tasks/InfoboxSync/README.md
@@ -0,0 +1,547 @@
+# InfoboxSync Pipeline
+
+A Wikipedia infobox synchronization pipeline built on established design patterns and pywikibot integration.
+
+## Overview
+
+This pipeline fetches Arabic Wikipedia pages, finds their corresponding English pages, extracts infobox data, and processes it through multiple stages for synchronization purposes.
+
+## Architecture & Design Patterns
+
+### 1. **Template Method Pattern**
+- Used in `WikipediaFetcher` abstract base class
+- Defines the skeleton of the page fetching algorithm
+- Allows subclasses to customize specific steps
+
+### 2. **Observer Pattern**
+- `FetchObserver` interface for monitoring fetch operations
+- `LoggingFetchObserver` implementation for logging
+- Allows multiple observers to monitor the same fetch operation
+
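+A minimal sketch of an additional observer, assuming `FetchObserver` lives in `fetch/fetch.py` alongside the fetchers (the registration hook in the last line is an assumed name, not a confirmed API):
+
+```python
+from tasks.InfoboxSync.fetch.fetch import FetchObserver
+
+class CountingFetchObserver(FetchObserver):
+    """Counts completed page checks (illustrative)."""
+
+    def __init__(self):
+        self.checked = 0
+
+    def on_page_check_complete(self, page_info):
+        self.checked += 1
+        print(f"#{self.checked}: {page_info.title}")
+
+# fetcher.add_observer(CountingFetchObserver())  # hook name is an assumption
+```
+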
+### 3. **Strategy Pattern**
+- `WikipediaSyncFetcher` uses different fetch strategies
+- Separate fetchers for Arabic and English Wikipedia
+- Easy to extend with new language-specific strategies
+
+### 4. **Factory Pattern**
+- Creation of appropriate fetchers based on site/language
+- Centralized fetcher creation in `WikipediaSyncFetcher`
+
+### 5. **Data Class Pattern**
+- `PageInfo` dataclass for structured page information
+- Clean data transfer between pipeline stages
+
+### 6. **Strategy Pattern (Parse Stage)**
+- `InfoboxParser` abstract base class for different template parsers
+- `FootballBiographyParser` for football biography templates
+- `GenericInfoboxParser` for other template types
+- `InfoboxParserFactory` creates appropriate parsers based on template type
+- Allows pipeline to specify which template parser to use
+
+### 7. **Strategy Pattern (Map Stage)**
+- `FieldMapper` abstract base class for different field type mappers
+- `TextFieldMapper`, `NumberFieldMapper`, `ImageFieldMapper`, `LinkFieldMapper`, `MixedFieldMapper` implementations
+- `NumberedFieldMapper` for handling numbered sequences (years1, clubs1, caps1, etc.)
+- `TemplateMapper` abstract base class for template-specific field mapping
+- `FootballBiographyMapper` with English to Arabic field mappings
+- `TemplateMapperFactory` and `FieldMapperFactory` for creating appropriate mappers
+- Supports field type validation and numbered field grouping
+
+### 8. **Strategy Pattern (Translate Stage)**
+- `TranslationService` abstract base class for different translation services
+- `GeminiTranslator` implementation using Google Gemini AI via LiteLLM
+- `TranslationServiceFactory` for creating appropriate translation services
+- Template-based prompt system with external prompt files
+- Single-request translation to minimize API calls and cost
+- Supports field-by-field and template-level translation strategies
+
+### 9. **Strategy Pattern (Construct Stage)**
+- `TemplateBuilder` abstract base class for different template builders
+- `ArabicTemplateBuilder` implementation for Arabic Wikipedia templates
+- `TemplateBuilderFactory` for creating appropriate builders
+- Smart field formatting for different data types
+- Template validation and quality estimation
+- Support for multiple Arabic Wikipedia template types
+
+## Pipeline Stages
+
+1. **Fetch**: Uses pywikibot to check Arabic page existence and find English equivalent
+2. **Parse**: Extracts infobox data from Wikipedia wikitext using wikitextparser and Strategy Pattern
+3. **Map**: Maps English field names to Arabic equivalents using Strategy Pattern with field type validation
+4. **Translate**: Translates English infobox data to Arabic using Google Gemini AI with single-request optimization
+5. **Construct**: Constructs Arabic Wikipedia templates from translated data using Strategy Pattern
+6. **Publish**: Publishes the Arabic template directly to Arabic Wikipedia using pywikibot
+7. **Save**: Saves processed data as JSON files
+
+## Usage
+
+### Basic Usage
+
+```python
+from tasks.InfoboxSync.test import run_wikipedia_pipeline
+
+# Sync an Arabic Wikipedia page
+result_path = run_wikipedia_pipeline("مصر") # Egypt in Arabic
+print(f"Data saved to: {result_path}")
+```
+
+### Advanced Usage
+
+```python
+from tasks.InfoboxSync.fetch.fetch import fetch_wikipedia_data
+
+# Direct access to fetch stage
+wiki_data = fetch_wikipedia_data("محمد بن سلمان")  # Mohammed bin Salman in Arabic
+if wiki_data['sync_possible']:
+ arabic_page = wiki_data['arabic']
+ english_page = wiki_data['english']
+ print(f"Arabic title: {arabic_page.title}")
+ print(f"English title: {english_page.title}")
+```
+
+### Using Different Template Types
+
+```python
+from tasks.InfoboxSync.parse.parse import parse_data
+
+# Parse football biography infobox
+data = {'title': 'Player Name', 'content': wikitext_content}
+football_data = parse_data(data, 'football_biography')
+
+# Parse person infobox
+person_data = parse_data(data, 'person')
+
+# Parse custom template
+custom_data = parse_data(data, 'infobox custom_template')
+```
+
+### Field Mapping with Different Types
+
+```python
+from tasks.InfoboxSync.map.field_mappers import TextFieldMapper, NumberFieldMapper
+
+# Text field mapping
+text_mapper = TextFieldMapper("name", "الاسم")
+mapped = text_mapper.map_field("Lionel Messi")
+
+# Number field mapping
+number_mapper = NumberFieldMapper("height", "الطول")
+mapped = number_mapper.map_field("1.70 m")
+
+# Numbered field mapping (groups years1, years2, etc.)
+from tasks.InfoboxSync.map.template_mapper import FootballBiographyMapper
+mapper = FootballBiographyMapper()
+mapped_data = mapper.map_infobox(infobox_data)
+```
+
+### Translation with AI
+
+```python
+from tasks.InfoboxSync.translate.translate import translate_data
+
+# Translate mapped data to Arabic using Gemini AI
+result = translate_data(mapped_data, target_lang='ar')
+
+if result['translation_metadata']['success']:
+ translated_fields = result['translated_fields']
+ print(f"Translated {result['translation_metadata']['translated_fields']} fields")
+else:
+ print(f"Translation failed: {result['translation_metadata']['error']}")
+```
+
+### Template Building
+
+```python
+from tasks.InfoboxSync.construct.build import construct_arabic_template
+
+# Construct Arabic Wikipedia template from translated data
+result = construct_arabic_template(translated_data, template_type='football_biography')
+
+if result.success:
+    arabic_template = result.template_text
+    print(f"Constructed template with {result.field_count} fields")
+    print(arabic_template)
+else:
+    print(f"Construction failed: {result.errors}")
+```
+
+## Dependencies
+
+- `pywikibot`: For Wikipedia API interactions
+- `wikitextparser`: For advanced wikitext parsing
+- `litellm`: For Google Gemini AI integration
+- Install with: `pip install pywikibot wikitextparser litellm`
+
+## Configuration
+
+Before using, configure pywikibot:
+```bash
+pwb generate_user_files
+```
+
+Or set up your user configuration file as needed for your Wikipedia bot account.
+
+For translation, set your Google AI API key:
+```bash
+export GEMINI_API_KEY="your-google-ai-api-key"
+```
+
+## Error Handling
+
+The pipeline includes comprehensive error handling for:
+- Missing Arabic pages
+- Missing corresponding English pages
+- Network/API errors
+- Parsing errors
+- Field mapping errors
+- Translation service errors
+- Template construction errors
+- File I/O errors
+
+## Data Flow
+
+1. **Input**: Arabic page title
+2. **Arabic Check**: Verify page exists on ar.wikipedia.org
+3. **English Lookup**: Find corresponding English page via langlinks
+4. **Content Fetch**: Retrieve English page content
+5. **Parse**: Extract infobox data using wikitextparser and Strategy Pattern
+6. **Map**: Map English fields to Arabic using Strategy Pattern with field type validation
+7. **Translate**: Translate English infobox data to Arabic using Google Gemini AI with single-request optimization
+8. **Construct**: Construct Arabic Wikipedia template from translated data
+9. **Publish**: Publish the Arabic template directly to Arabic Wikipedia using pywikibot
+10. **Save**: Store results as JSON
+
+## Extension Points
+
+### Adding New Languages
+```python
+class GermanFetcher(WikipediaFetcher):
+ def get_site_name(self) -> str:
+ return 'de'
+```
+
+### Custom Observers
+```python
+class MetricsObserver(FetchObserver):
+ def on_page_check_complete(self, page_info: PageInfo):
+ # Send metrics to monitoring system
+ pass
+```
+
+### Adding New Template Parsers
+```python
+from tasks.InfoboxSync.parse.base_parser import InfoboxParser
+
+class CustomTemplateParser(InfoboxParser):
+ def __init__(self):
+ super().__init__("infobox custom")
+
+ def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+ # Custom parsing logic here
+ pass
+```
+
+### Adding New Field Mappers
+```python
+from tasks.InfoboxSync.map.field_mappers import FieldMapper
+
+class CustomFieldMapper(FieldMapper):
+ def __init__(self, english_key: str, arabic_key: str):
+ super().__init__(english_key, arabic_key, "custom")
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ # Custom field mapping logic
+ return {
+ self.arabic_key: {
+ "value": value,
+ "type": "custom",
+ "validation": {"is_valid": True}
+ }
+ }
+```
+
+### Adding New Translation Services
+```python
+from tasks.InfoboxSync.translate.base_translator import TranslationService
+
+class CustomTranslator(TranslationService):
+ def __init__(self, api_key: str):
+ super().__init__('en', 'ar')
+ self.api_key = api_key
+
+ def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+ # Custom translation logic
+ pass
+
+ def translate_text(self, text: str, **kwargs) -> TranslationResult:
+ # Custom text translation
+ pass
+
+ def translate_field(self, field_name: str, field_value: Any, **kwargs) -> TranslationResult:
+ # Custom field translation
+ pass
+
+ def is_available(self) -> bool:
+ # Check service availability
+ pass
+
+ def get_service_name(self) -> str:
+ return "Custom Translator"
+
+# Register the service
+from tasks.InfoboxSync.translate.base_translator import TranslationServiceFactory
+TranslationServiceFactory.register_service("custom", CustomTranslator)
+```
+
+### Adding New Template Builders
+```python
+from tasks.InfoboxSync.construct.base_builder import TemplateBuilder
+
+class CustomTemplateBuilder(TemplateBuilder):
+ def __init__(self, template_type: str = 'custom'):
+ super().__init__(template_type)
+
+ def build_template(self, translated_data: Dict[str, Any], **kwargs) -> BuildResult:
+ # Custom template building logic
+ pass
+
+ def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ # Custom field formatting
+ pass
+
+ def get_template_name(self) -> str:
+ return 'صندوق مخصص'
+
+ def is_available(self) -> bool:
+ return True
+
+ def get_builder_name(self) -> str:
+ return "Custom Template Builder"
+
+# Register the builder
+from tasks.InfoboxSync.construct.base_builder import TemplateBuilderFactory
+TemplateBuilderFactory.register_builder("custom_builder", CustomTemplateBuilder)
+```
+
+### Enhanced Parsing
+The parse stage uses `wikitextparser` for more accurate infobox extraction compared to regex-based approaches.
+
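+For illustration, a standalone sketch of what `wikitextparser`-based extraction looks like (this is not the pipeline's parser, just the underlying library):
+
+```python
+import wikitextparser as wtp
+
+wikitext = "{{Infobox football biography\n| name = Paul Abasolo\n| height = {{height|m=1.84}}\n}}"
+parsed = wtp.parse(wikitext)
+
+# Pick out the infobox template and read its parameters
+infobox = next(t for t in parsed.templates
+               if t.name.strip().lower().startswith('infobox'))
+fields = {arg.name.strip(): arg.value.strip() for arg in infobox.arguments}
+print(fields['name'])  # Paul Abasolo
+```
+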
+## File Structure
+
+```
+tasks/InfoboxSync/
+├── README.md # This documentation
+├── test.py # Main pipeline orchestrator
+├── demo_real_wikipedia.py # Demo with real Wikipedia data
+├── fetch/
+│ ├── __init__.py
+│ └── fetch.py # Fetch stage with design patterns
+├── parse/
+│ ├── __init__.py
+│ ├── base_parser.py # Abstract parser base class
+│ ├── football_parser.py # Football biography parser
+│ ├── parser_factory.py # Factory for creating parsers
+│ └── parse.py # Parse stage using Strategy Pattern
+├── map/
+│ ├── __init__.py
+│ ├── field_mappers.py # Field type strategy mappers
+│ ├── template_mapper.py # Template field mapping coordinators
+│ └── map.py # Map stage using Strategy Pattern
+├── translate/
+│ ├── __init__.py
+│ ├── base_translator.py # Abstract translation service base class
+│ ├── gemini_translator.py # Google Gemini AI implementation
+│ ├── config.py # Translation configuration management
+│ ├── prompt_template.txt # External prompt template file
+│ ├── translate.py # Main translation interface
+│ └── README.md # Translation stage documentation
+├── construct/
+│ ├── __init__.py
+│ ├── base_builder.py # Abstract template builder base class
+│ ├── arabic_builder.py # Arabic Wikipedia template builder
+│ ├── build.py # Main construct stage interface
+│ └── README.md # Construct stage documentation
+├── publish/
+│ ├── __init__.py
+│ └── publish.py # Publish stage for Wikipedia publishing
+└── save/
+ ├── __init__.py
+ └── save.py # Save stage for data persistence
+```
+
+## Logging
+
+The pipeline uses Python's logging module with configurable levels. All stages include detailed logging for debugging and monitoring.
+
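+A minimal configuration, assuming the stages log through standard module-level loggers:
+
+```python
+import logging
+
+# INFO shows pipeline progress; switch to DEBUG for per-field detail
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
+)
+```
+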
+## Future Enhancements
+
+- Support for additional translation services (OpenAI, DeepL, Microsoft Translator)
+- Support for additional Wikipedia languages
+- Database storage instead of JSON files
+- Web interface for pipeline management
+- Batch processing capabilities
+- Additional template parser implementations
+- Enhanced field type detection and validation
+- Translation quality scoring and confidence metrics
+- Additional Arabic Wikipedia template builders
+- Template validation against Arabic Wikipedia standards
+- Integration with Arabic Wikipedia bot frameworks
+
+## Translation Features
+
+### Single-Request Optimization
+- Translates ALL fields in ONE API call instead of multiple requests
+- Significant cost savings and performance improvement
+- Maintains field relationships and context
+
+### Template-Based Prompts
+- Prompt text stored in external `prompt_template.txt` file
+- Easy customization without touching Python code
+- Placeholder replacement system (`{{FIELDS_TEXT}}`, `{{START_INDEX}}`)
+
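+The replacement step itself can be as small as the sketch below; the file name and placeholders are the ones documented above, while the surrounding function is illustrative:
+
+```python
+from pathlib import Path
+
+def render_prompt(fields_text: str, start_index: int) -> str:
+    """Fill the documented placeholders in the external prompt template."""
+    template = Path('tasks/InfoboxSync/translate/prompt_template.txt').read_text(encoding='utf-8')
+    return (template
+            .replace('{{FIELDS_TEXT}}', fields_text)
+            .replace('{{START_INDEX}}', str(start_index)))
+```
+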
+### Smart Field Handling
+- **Text Fields**: Naturally translated (names, descriptions)
+- **Number Fields**: Preserved as-is (heights, statistics)
+- **Link Fields**: Maintained with proper formatting
+- **Numbered Fields**: Translated individually while maintaining sequence
+
+### AI Integration
+- Google Gemini AI via LiteLLM for high-quality translations
+- Configurable models and parameters
+- Environment variable configuration for API keys
+
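+Roughly what a Gemini call through LiteLLM looks like (a sketch; the pipeline's actual model name and prompt live in `config.py` and `prompt_template.txt`):
+
+```python
+import os
+from litellm import completion
+
+os.environ.setdefault('GEMINI_API_KEY', 'your-google-ai-api-key')
+
+response = completion(
+    model='gemini/gemini-1.5-flash',  # any Gemini model LiteLLM supports
+    messages=[{'role': 'user', 'content': 'Translate to Arabic: Forward'}],
+)
+print(response.choices[0].message.content)
+```
+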
+## Construct Stage Features
+
+### Arabic Template Construction
+- Builds properly formatted Arabic Wikipedia templates
+- Handles different field types (text, numbers, links, images, numbered arrays)
+- Supports multiple template types with proper Arabic names
+- Proper Arabic Wikipedia syntax and formatting
+
+### Smart Field Formatting
+- **Text Fields**: Properly escaped for wiki syntax
+- **Number Fields**: Preserved with units and formatting
+- **Link Fields**: Correct wiki link syntax
+- **Image Fields**: Proper Arabic image syntax
+- **Numbered Fields**: Expanded into sequential fields (الأندية1, الأندية2, etc.)
+
+### Template Types Supported
+- `football_biography` → `صندوق معلومات سيرة كرة قدم`
+- `person` → `صندوق شخص`
+- `biography` → `سيرة شخصية`
+- `football_club` → `صندوق نادي كرة قدم`
+- `country` → `صندوق دولة`
+- And many more...
+
+## Publish Stage Features
+
+### Direct Wikipedia Publishing
+- Publishes Arabic templates directly to Arabic Wikipedia using pywikibot
+- Automated edit summaries in Arabic for transparency
+- Revision tracking and metadata collection
+- Comprehensive error handling for publish failures
+
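+At its core the stage reduces to a handful of pywikibot calls. The sketch below naively replaces the whole page text, whereas the real stage validates first and edits only the infobox:
+
+```python
+import pywikibot
+
+def publish_template(title: str, template_text: str, summary: str) -> int:
+    """Save new text to an Arabic Wikipedia page and return the revision id."""
+    site = pywikibot.Site('ar', 'wikipedia')
+    page = pywikibot.Page(site, title)
+    page.text = template_text  # naive: overwrites the entire page
+    page.save(summary=summary)
+    return page.latest_revision_id
+```
+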
+### Template Validation
+- Validates template content before publishing
+- Checks for required fields and proper formatting
+- Ensures compatibility with Arabic Wikipedia standards
+- Prevents publishing of malformed templates
+
+### Publishing Results
+After publishing, detailed results are provided:
+```json
+{
+ "success": true,
+ "page_title": "بول أباسولو",
+ "edit_summary": "تحديث قالب السيرة الذاتية باستخدام InfoboxSync - football_biography",
+ "revision_id": 12345678,
+ "metadata": {
+ "template_length": 450,
+ "site": "ar.wikipedia.org",
+ "published_at": "2024-01-15T10:30:00Z"
+ },
+ "errors": []
+}
+```
+
+### Safety Features
+- Verifies page existence before publishing
+- Requires proper pywikibot configuration
+- Includes edit summaries for accountability
+- Supports dry-run mode for testing (future enhancement)
+
+## Field Mapping Examples
+
+### Numbered Fields (Most Common in Football)
+Wikipedia often uses numbered fields for career history:
+```
+years1 = 2002–2003 | clubs1 = Basconia | caps1 = 35 | goals1 = 5
+years2 = 2003–2004 | clubs2 = Barakaldo | caps2 = 24 | goals2 = 1
+```
+
+Mapped to Arabic arrays:
+```json
+{
+ "سنوات_اللعب": {
+ "value": ["2002–2003", "2003–2004", ...],
+ "type": "numbered",
+ "count": 15
+ },
+ "الأندية": {
+ "value": ["Basconia", "Barakaldo", ...],
+ "type": "numbered",
+ "count": 15
+ }
+}
+```
+
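+The grouping idea behind `NumberedFieldMapper` can be sketched in a few lines, assuming keys follow the `<base><index>` convention shown above (this is an illustration, not the pipeline's implementation):
+
+```python
+import re
+from collections import defaultdict
+
+def group_numbered_fields(infobox: dict) -> dict:
+    """Collect years1/years2/... style keys into ordered lists per base name."""
+    groups = defaultdict(list)
+    for key, value in infobox.items():
+        match = re.fullmatch(r'([a-z_]+?)(\d+)', key)
+        if match:
+            groups[match.group(1)].append((int(match.group(2)), value))
+    # Sort by the numeric suffix and drop the indices
+    return {base: [v for _, v in sorted(items)] for base, items in groups.items()}
+
+print(group_numbered_fields({'years2': '2003–2004', 'years1': '2002–2003'}))
+# {'years': ['2002–2003', '2003–2004']}
+```
+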
+### Field Type Validation
+Each field type includes validation:
+```json
+{
+ "الطول": {
+ "value": 1.70,
+ "type": "number",
+ "validation": {
+ "is_valid": true,
+ "numeric_value": 1.7,
+ "has_units": true
+ }
+ }
+}
+```
+
+### Translation Results
+After translation, fields include translated values:
+```json
+{
+ "الاسم": {
+ "value": "Paul Abasolo",
+ "translated_value": "بول أباسولو",
+ "translation_confidence": 0.9,
+ "type": "text"
+ },
+ "الأندية": {
+ "value": ["Club A", "Club B"],
+ "translated_value": ["النادي أ", "النادي ب"],
+ "translation_confidence": 0.9,
+ "type": "numbered"
+ }
+}
+```
+
+### Construction Results
+After construction, the template is ready for Arabic Wikipedia:
+```json
+{
+ "template_text": "{{صندوق سيرة لاعب كرة قدم\n| الاسم = بول أباسولو\n| الطول = 1.84 م\n...}}",
+ "template_type": "football_biography",
+ "field_count": 8,
+ "success": true,
+ "metadata": {
+ "template_name": "سيرة لاعب كرة قدم",
+ "builder_name": "Arabic Football Biography Builder",
+ "total_input_fields": 10
+ },
+ "errors": []
+}
+```
\ No newline at end of file
diff --git a/tasks/InfoboxSync/construct/README.md b/tasks/InfoboxSync/construct/README.md
new file mode 100644
index 00000000..c1841b9f
--- /dev/null
+++ b/tasks/InfoboxSync/construct/README.md
@@ -0,0 +1,285 @@
+# Construct Stage - Arabic Wikipedia Template Construction
+
+This directory contains the construct stage implementation for constructing Arabic Wikipedia templates from translated infobox data.
+
+## Overview
+
+The construct stage takes translated data from the translate stage and constructs properly formatted Arabic Wikipedia templates. It follows the Strategy Pattern to support different template types and formats.
+
+## Architecture
+
+### Core Components
+
+1. **`base_builder.py`** - Abstract base classes and factory pattern
+2. **`arabic_builder.py`** - Arabic Wikipedia template builder implementation
+3. **`build.py`** - Main construct stage interface and utilities
+
+### Design Patterns Used
+
+- **Strategy Pattern**: Different template builders for various Wikipedia template types
+- **Factory Pattern**: Creation of appropriate builders via `TemplateBuilderFactory`
+- **Template Method**: Consistent template construction workflow
+
+## Features
+
+### Template Construction
+- Constructs properly formatted Arabic Wikipedia templates
+- Handles different field types (text, numbers, links, images, numbered arrays)
+- Supports multiple template types (football biography, person, country, etc.)
+- Proper Arabic Wikipedia syntax and formatting
+
+### Smart Field Formatting
+- **Text Fields**: Properly escaped for wiki syntax
+- **Number Fields**: Preserved with units and formatting
+- **Link Fields**: Correct wiki link syntax
+- **Image Fields**: Proper Arabic image syntax
+- **Numbered Fields**: Expanded into sequential fields (الأندية1, الأندية2, etc.)
+
+### Template Types Supported
+- `football_biography` → `صندوق معلومات سيرة كرة قدم`
+- `person` → `صندوق شخص`
+- `biography` → `سيرة شخصية`
+- `football_club` → `صندوق نادي كرة قدم`
+- `country` → `صندوق دولة`
+- And many more...
+
+## Usage
+
+### Basic Usage
+
+```python
+from tasks.InfoboxSync.construct.build import construct_arabic_template
+
+# Your translated data from translate stage
+translated_data = {
+ 'translated_fields': {
+ 'الاسم': {'value': 'Paul Abasolo', 'translated_value': 'بول أباسولو', 'type': 'text'},
+ 'الطول': {'value': '1.84 m', 'translated_value': '1.84 م', 'type': 'number'},
+ # ... more translated fields
+ }
+}
+
+# Construct Arabic template
+result = construct_arabic_template(translated_data, template_type='football_biography')
+
+if result.success:
+ arabic_template = result.template_text
+ print(f"Constructed template with {result.field_count} fields")
+ print(arabic_template)
+else:
+ print(f"Construction failed: {result.errors}")
+```
+
+### Advanced Usage
+
+```python
+from tasks.InfoboxSync.construct.build import construct_template, TemplateBuilderFactory
+
+# Create specific builder
+builder = TemplateBuilderFactory.create_builder('arabic', template_type='person')
+
+# Construct template
+result = builder.construct_template(translated_data)
+
+# Access build metadata
+print(f"Template type: {result.template_type}")
+print(f"Fields included: {result.field_count}")
+print(f"Builder used: {result.metadata['builder_name']}")
+```
+
+### Template Validation
+
+```python
+from tasks.InfoboxSync.construct.build import validate_arabic_template, estimate_template_quality
+
+# Validate template
+validation = validate_arabic_template(template_text)
+print(f"Valid: {validation['valid']}")
+print(f"Errors: {validation['errors']}")
+
+# Estimate quality
+quality = estimate_template_quality(template_text)
+print(f"Quality score: {quality['quality_score']}/100")
+```
+
+## Data Flow
+
+### Input Data Structure
+```python
+{
+ 'translated_fields': {
+ 'arabic_field_name': {
+ 'value': 'original_value',
+ 'translated_value': 'arabic_translation',
+ 'type': 'text|number|link|image|numbered',
+ 'translation_confidence': 0.9
+ }
+ },
+ 'translation_metadata': {...},
+ 'page_title': 'English Title',
+ 'arabic_title': 'Arabic Title'
+}
+```
+
+### Output Data Structure
+```python
+{
+ 'template_text': '{{صندوق معلومات سيرة كرة قدم\n| الاسم = بول أباسولو\n| الطول = 1.84 م\n...}}',
+ 'template_type': 'football_biography',
+ 'field_count': 8,
+ 'success': True,
+ 'metadata': {
+ 'template_name': 'صندوق معلومات سيرة كرة قدم',
+ 'builder_name': 'Arabic Football Biography Builder',
+ 'total_input_fields': 10
+ },
+ 'errors': []
+}
+```
+
+## Template Construction Process
+
+1. **Extract Translated Fields** - Get translated data from translate stage
+2. **Select Template Type** - Choose appropriate Arabic template name
+3. **Format Each Field** - Apply proper Arabic Wikipedia syntax
+4. **Handle Field Types** - Special formatting for numbers, links, arrays
+5. **Construct Template** - Assemble the complete template with all fields
+6. **Validate Output** - Check for syntax errors and formatting issues
+
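+In condensed form (the full implementation is in `arabic_builder.py`), steps 2-5 look roughly like:
+
+```python
+def build(template_name: str, fields: dict) -> str:
+    """Open the template, emit one formatted line per field, close it."""
+    lines = [f"{{{{{template_name}"]
+    for arabic_key, field_data in fields.items():
+        value = field_data.get('translated_value') or field_data.get('value', '')
+        if value:
+            lines.append(f"| {arabic_key} = {value}")
+    lines.append("}}")
+    return "\n".join(lines)
+```
+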
+## Field Type Handling
+
+### Text Fields
+```
+Input: {'value': 'Paul Abasolo', 'translated_value': 'بول أباسولو'}
+Output: | الاسم = بول أباسولو
+```
+
+### Number Fields
+```
+Input: {'value': '1.84 m', 'translated_value': '1.84 م'}
+Output: | الطول = 1.84 م
+```
+
+### Numbered Fields (Arrays)
+```
+Input: {'value': ['Club A', 'Club B'], 'translated_value': ['النادي أ', 'النادي ب']}
+Output:
+| الأندية1 = النادي أ
+| الأندية2 = النادي ب
+```
+
+### Link Fields
+```
+Input: {'value': 'Argentina', 'translated_value': 'الأرجنتين'}
+Output: | الجنسية = [[الأرجنتين]]
+```
+
+## Extending the Construct Stage
+
+### Adding New Template Types
+
+```python
+from tasks.InfoboxSync.construct.arabic_builder import ArabicTemplateBuilder
+
+class CustomArabicBuilder(ArabicTemplateBuilder):
+ def __init__(self):
+ super().__init__('custom_type')
+
+ def get_template_name(self) -> str:
+ return 'صندوق مخصص'
+
+ def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ # Custom formatting logic
+ return f"| {arabic_key} = {field_data['translated_value']}"
+
+# Register the builder
+from tasks.InfoboxSync.construct.base_builder import TemplateBuilderFactory
+TemplateBuilderFactory.register_builder("custom_arabic", CustomArabicBuilder)
+```
+
+### Custom Field Formatters
+
+```python
+class AdvancedArabicBuilder(ArabicTemplateBuilder):
+ def _format_custom_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ # Advanced custom formatting
+ value = field_data.get('translated_value', '')
+ return f"| {arabic_key} = '''{value}'''"
+
+ def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ field_type = field_data.get('type', 'text')
+
+ if field_type == 'custom':
+ return self._format_custom_field(arabic_key, field_data)
+ else:
+ return super().format_field(arabic_key, field_data)
+```
+
+## Integration with Pipeline
+
+The construct stage seamlessly integrates with the InfoboxSync pipeline:
+
+1. **Receives** translated data from translate stage
+2. **Constructs** Arabic Wikipedia template
+3. **Passes** template text to save stage
+4. **Provides** metadata for logging and debugging
+
+## Quality Assurance
+
+### Template Validation
+- Syntax checking for Arabic Wikipedia format
+- Field count verification
+- Error and warning reporting
+
+### Quality Estimation
+- Quality scoring algorithm (0-100)
+- Issue detection (escaped characters, formatting problems)
+- Template complexity analysis
+
+## Performance Considerations
+
+- **Efficient Processing**: Single-pass field formatting
+- **Memory Optimized**: Streaming template construction
+- **Error Resilient**: Continues processing despite individual field errors
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Empty Template Output**
+ - Check if translated_fields contains valid data
+ - Verify field types are supported
+ - Check for translation stage errors
+
+2. **Malformed Template Syntax**
+ - Ensure proper Arabic Wikipedia template names
+ - Check for special character escaping
+ - Validate field formatting
+
+3. **Unsupported Template Type**
+ - Add new template type mapping in `get_template_name()`
+ - Extend field formatters if needed
+ - Register new builder class
+
+### Debug Information
+
+Enable detailed logging:
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+```
+
+The construct stage provides comprehensive logging for:
+- Template construction process
+- Field formatting details
+- Error conditions and recovery
+- Performance metrics
+
+## Future Enhancements
+
+- Support for additional Arabic template types
+- Advanced template customization options
+- Integration with Arabic Wikipedia bot frameworks
+- Template quality improvement suggestions
+- Multi-language template support
+- Template validation against Arabic Wikipedia standards
\ No newline at end of file
diff --git a/tasks/InfoboxSync/construct/__init__.py b/tasks/InfoboxSync/construct/__init__.py
new file mode 100644
index 00000000..3f7e6b41
--- /dev/null
+++ b/tasks/InfoboxSync/construct/__init__.py
@@ -0,0 +1,19 @@
+# Construct stage package
+
+# Import base classes
+from .base_builder import TemplateBuilder, TemplateBuilderFactory, BuildResult
+
+# Import concrete builders
+from . import arabic_builder
+
+# Import main construct function
+from .build import construct_template, get_available_builders, test_builder
+
+__all__ = [
+ 'TemplateBuilder',
+ 'TemplateBuilderFactory',
+ 'BuildResult',
+ 'construct_template',
+ 'get_available_builders',
+ 'test_builder'
+]
\ No newline at end of file
diff --git a/tasks/InfoboxSync/construct/arabic_builder.py b/tasks/InfoboxSync/construct/arabic_builder.py
new file mode 100644
index 00000000..bc596abf
--- /dev/null
+++ b/tasks/InfoboxSync/construct/arabic_builder.py
@@ -0,0 +1,258 @@
+"""
+Arabic Wikipedia template builder implementation.
+"""
+
+import logging
+from typing import Dict, Any, List, Optional, Union
+from .base_builder import TemplateBuilder, TemplateBuilderFactory, BuildResult
+
+logger = logging.getLogger(__name__)
+
+
+class ArabicTemplateBuilder(TemplateBuilder):
+ """Builder for Arabic Wikipedia templates using translated data."""
+
+ def __init__(self, template_type: str = 'football_biography'):
+ """
+ Initialize Arabic template builder.
+
+ Args:
+ template_type (str): Type of template to build
+ """
+ super().__init__(template_type)
+ self.field_formatters = {
+ 'text': self._format_text_field,
+ 'number': self._format_number_field,
+ 'link': self._format_link_field,
+ 'image': self._format_image_field,
+ 'numbered': self._format_numbered_field,
+ 'mixed': self._format_mixed_field
+ }
+
+ def construct_template(self, translated_data: Dict[str, Any], **kwargs) -> BuildResult:
+ """
+ Build an Arabic Wikipedia template from translated data.
+
+ Args:
+ translated_data (Dict[str, Any]): Data from translate stage with translated_fields
+ **kwargs: Additional parameters
+
+ Returns:
+ BuildResult: Template building result
+ """
+ try:
+ logger.info(f"Building Arabic template for type: {self.template_type}")
+
+ # Extract translated fields
+ translated_fields = translated_data.get('translated_fields', {})
+ if not translated_fields:
+ return BuildResult(
+ template_text="",
+ template_type=self.template_type,
+ field_count=0,
+ success=False,
+ metadata={},
+ errors=["No translated fields found"]
+ )
+
+ # Build template structure
+ template_lines = []
+ template_lines.append(f"{{{{{self.get_template_name()}")
+ template_lines.append("|") # First pipe after template name
+
+ # Process each translated field
+ field_count = 0
+ errors = []
+
+ for arabic_key, field_data in translated_fields.items():
+ try:
+ # Get the translated value
+ if 'translated_value' in field_data:
+ value = field_data['translated_value']
+ else:
+ value = field_data.get('value', '')
+
+ # Format the field
+ formatted_field = self.format_field(arabic_key, {
+ 'value': value,
+ 'type': field_data.get('type', 'text'),
+ 'original_type': field_data.get('type', 'text')
+ })
+
+ if formatted_field:
+ # Handle different field types
+ if field_data.get('type') == 'numbered' and isinstance(formatted_field, list):
+ # Numbered fields return a list of lines
+ template_lines.extend(formatted_field)
+ field_count += 1
+ elif isinstance(formatted_field, str) and formatted_field.strip():
+ template_lines.append(formatted_field)
+ field_count += 1
+
+ except Exception as e:
+ error_msg = f"Failed to format field {arabic_key}: {e}"
+ logger.warning(error_msg)
+ errors.append(error_msg)
+ continue
+
+ # Close template
+ template_lines.append("}}")
+
+ # Join all lines with actual newlines - creates proper line breaks
+ template_text = "\n".join(template_lines)
+
+ logger.info(f"Successfully built Arabic template with {field_count} fields")
+
+ return BuildResult(
+ template_text=template_text,
+ template_type=self.template_type,
+ field_count=field_count,
+ success=True,
+ metadata={
+ 'template_name': self.get_template_name(),
+ 'builder_name': self.get_builder_name(),
+ 'total_input_fields': len(translated_fields)
+ },
+ errors=errors
+ )
+
+ except Exception as e:
+ logger.error(f"Template building failed: {e}")
+ return BuildResult(
+ template_text="",
+ template_type=self.template_type,
+ field_count=0,
+ success=False,
+ metadata={},
+ errors=[str(e)]
+ )
+
+ def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> Any:
+ """
+ Format a single field for the Arabic template.
+
+ Args:
+ arabic_key (str): Arabic field name
+ field_data (Dict[str, Any]): Field data with value and type
+
+ Returns:
+ str | List[str]: Formatted field string, or a list of lines for numbered fields
+ """
+ field_type = field_data.get('type', 'text')
+
+ # Get the appropriate formatter
+ formatter = self.field_formatters.get(field_type, self._format_text_field)
+
+ try:
+ return formatter(arabic_key, field_data)
+ except Exception as e:
+ logger.warning(f"Failed to format field {arabic_key} of type {field_type}: {e}")
+ # Fallback to text formatting
+ return self._format_text_field(arabic_key, field_data)
+
+ def _format_text_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ """Format a text field."""
+ value = field_data.get('value', '')
+ if not value:
+ return ""
+
+ # Escaping of pipes/equals is currently disabled; values pass through as-is
+ # escaped_value = str(value).replace('|', '{{!}}').replace('=', '{{=}}')
+ escaped_value = str(value)
+
+ return f"| {arabic_key} = {escaped_value}"
+
+ def _format_number_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ """Format a number field."""
+ value = field_data.get('value', '')
+ if not value:
+ return ""
+
+ # Keep numbers as-is, just ensure proper formatting
+ return f"| {arabic_key} = {value}"
+
+ def _format_link_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ """Format a link field."""
+ value = field_data.get('value', '')
+ if not value:
+ return ""
+
+ # Ensure proper wiki link format
+ if '|' in str(value):
+ # Already has link text
+ return f"| {arabic_key} = {value}"
+ else:
+ # Simple link
+ return f"| {arabic_key} = [[{value}]]"
+
+ def _format_image_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ """Format an image field."""
+ value = field_data.get('value', '')
+ if not value:
+ return ""
+
+ # Ensure proper image format
+ if value.startswith('[[File:') or value.startswith('[[ملف:'):
+ return f"| {arabic_key} = {value}"
+ else:
+ return f"| {arabic_key} = [[ملف:{value}]]"
+
+ def _format_numbered_field(self, arabic_key: str, field_data: Dict[str, Any]) -> List[str]:
+ """Format a numbered field (array of values)."""
+ value = field_data.get('value', [])
+ if not value or not isinstance(value, list):
+ return []
+
+ # Return a list of formatted lines for each numbered field
+ formatted_lines = []
+
+ for i, item_value in enumerate(value, 1):
+ if item_value: # Only include non-empty values
+ field_name = f"{arabic_key}{i}"
+ # escaped_value = str(item_value).replace('|', '{{!}}').replace('=', '{{=}}')
+ escaped_value = str(item_value)
+ formatted_lines.append(f"| {field_name} = {escaped_value}")
+
+ return formatted_lines
+
+ def _format_mixed_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str:
+ """Format a mixed field (contains both text and links)."""
+ value = field_data.get('value', '')
+ if not value:
+ return ""
+
+ # Mixed fields usually contain wiki markup, keep as-is
+ return f"| {arabic_key} = {value}"
+
+ def get_template_name(self) -> str:
+ """Get the Arabic Wikipedia template name."""
+ template_names = {
+ 'football_biography': 'صندوق معلومات سيرة كرة قدم',
+ 'person': 'صندوق شخص',
+ 'biography': 'سيرة شخصية',
+ 'football_club': 'صندوق نادي كرة قدم',
+ 'country': 'صندوق دولة',
+ 'city': 'صندوق مدينة',
+ 'university': 'صندوق جامعة',
+ 'company': 'صندوق شركة',
+ 'film': 'صندوق فيلم',
+ 'book': 'صندوق كتاب',
+ 'album': 'صندوق ألبوم',
+ 'tv_series': 'صندوق مسلسل تلفزيوني'
+ }
+
+ return template_names.get(self.template_type, 'صندوق عام')
+
+ def is_available(self) -> bool:
+ """Check if Arabic template builder is available."""
+ # Always available since it doesn't require external services
+ return True
+
+ def get_builder_name(self) -> str:
+ """Get the name of this builder."""
+ return f"Arabic {self.template_type.title()} Builder"
+
+
+# Register the Arabic builder
+TemplateBuilderFactory.register_builder("arabic", ArabicTemplateBuilder)
+TemplateBuilderFactory.register_builder("arabic_football", ArabicTemplateBuilder)
\ No newline at end of file
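
A minimal sketch of driving the builder directly, with illustrative values; the import path assumes the tasks/ tree in this diff is importable as a package:

    from tasks.InfoboxSync.construct.arabic_builder import ArabicTemplateBuilder

    builder = ArabicTemplateBuilder(template_type='football_biography')
    result = builder.construct_template({
        'translated_fields': {
            'اسم': {'translated_value': 'بول أباسولو', 'type': 'text'},
            'أندية': {
                'translated_value': ['[[CD Basconia|Basconia]]',
                                     '[[Barakaldo CF|Barakaldo]]'],
                'type': 'numbered',
            },
        }
    })
    if result.success:
        print(result.template_text)
        # {{صندوق معلومات سيرة كرة قدم
        # |
        # | اسم = بول أباسولو
        # | أندية1 = [[CD Basconia|Basconia]]
        # | أندية2 = [[Barakaldo CF|Barakaldo]]
        # }}

Note the lone "|" line in the output, which the builder emits immediately after the template name.
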
diff --git a/tasks/InfoboxSync/construct/base_builder.py b/tasks/InfoboxSync/construct/base_builder.py
new file mode 100644
index 00000000..6e4f244b
--- /dev/null
+++ b/tasks/InfoboxSync/construct/base_builder.py
@@ -0,0 +1,135 @@
+"""
+Base template builder classes following Strategy Pattern.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class BuildResult:
+ """Result of a template building operation."""
+ template_text: str
+ template_type: str
+ field_count: int
+ success: bool
+ metadata: Dict[str, Any]
+ errors: Optional[List[str]] = None
+
+ def __post_init__(self):
+ if self.errors is None:
+ self.errors = []
+
+
+class TemplateBuilder(ABC):
+ """Abstract base class for template builders."""
+
+ def __init__(self, template_type: str = 'generic'):
+ self.template_type = template_type
+
+ @abstractmethod
+ def construct_template(self, translated_data: Dict[str, Any], **kwargs) -> BuildResult:
+ """
+ Build a Wikipedia template from translated data.
+
+ Args:
+ translated_data (Dict[str, Any]): Translated data with Arabic field names
+ **kwargs: Additional parameters for building
+
+ Returns:
+ BuildResult: Template building result
+ """
+ pass
+
+ @abstractmethod
+ def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> Any:
+ """
+ Format a single field for the template.
+
+ Args:
+ arabic_key (str): Arabic field name
+ field_data (Dict[str, Any]): Field data with value and type
+
+ Returns:
+ str | List[str]: Formatted field string, or a list of lines for numbered fields
+ """
+ pass
+
+ @abstractmethod
+ def get_template_name(self) -> str:
+ """
+ Get the Wikipedia template name for this builder.
+
+ Returns:
+ str: Template name (e.g., 'infobox football biography')
+ """
+ pass
+
+ @abstractmethod
+ def is_available(self) -> bool:
+ """Check if this builder is available and properly configured."""
+ pass
+
+ @abstractmethod
+ def get_builder_name(self) -> str:
+ """Get the name of this builder."""
+ pass
+
+
+class TemplateBuilderFactory:
+ """Factory for creating template builders."""
+
+ _builders = {}
+
+ @classmethod
+ def register_builder(cls, builder_name: str, builder_class):
+ """Register a new template builder."""
+ cls._builders[builder_name] = builder_class
+
+ @classmethod
+ def create_builder(cls, builder_name: str, **kwargs) -> TemplateBuilder:
+ """
+ Create a template builder instance.
+
+ Args:
+ builder_name (str): Name of the builder to create
+ **kwargs: Parameters for builder initialization
+
+ Returns:
+ TemplateBuilder: Builder instance
+
+ Raises:
+ ValueError: If builder is not registered or creation fails
+ """
+ if builder_name not in cls._builders:
+ available_builders = list(cls._builders.keys())
+ raise ValueError(f"Unknown template builder: {builder_name}. "
+ f"Available builders: {available_builders}")
+
+ builder_class = cls._builders[builder_name]
+ try:
+ return builder_class(**kwargs)
+ except Exception as e:
+ raise ValueError(f"Failed to create {builder_name} builder: {e}")
+
+ @classmethod
+ def get_available_builders(cls) -> List[str]:
+ """Get list of available template builders."""
+ return list(cls._builders.keys())
+
+ @classmethod
+ def get_supported_template_types(cls) -> List[str]:
+ """Get list of supported template types across all builders."""
+ template_types = []
+ for builder_class in cls._builders.values():
+ try:
+ # Create a temporary instance to get template name
+ temp_builder = builder_class()
+ template_types.append(temp_builder.get_template_name())
+ except Exception:
+ continue
+ return template_types
\ No newline at end of file
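
Registration happens as a side effect of importing a builder module (see the register_builder calls at the end of arabic_builder.py), so callers must import that module before asking the factory for its builders. A short sketch of the flow, under the same package-layout assumption as above:

    from tasks.InfoboxSync.construct.base_builder import TemplateBuilderFactory
    # Importing the module runs its register_builder() calls.
    import tasks.InfoboxSync.construct.arabic_builder  # noqa: F401

    print(TemplateBuilderFactory.get_available_builders())
    # e.g. ['arabic', 'arabic_football']

    builder = TemplateBuilderFactory.create_builder('arabic', template_type='person')
    print(builder.get_template_name())  # صندوق شخص

    try:
        TemplateBuilderFactory.create_builder('missing')
    except ValueError as exc:
        print(exc)  # Unknown template builder: missing. Available builders: [...]
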
diff --git a/tasks/InfoboxSync/construct/build.py b/tasks/InfoboxSync/construct/build.py
new file mode 100644
index 00000000..232e46e1
--- /dev/null
+++ b/tasks/InfoboxSync/construct/build.py
@@ -0,0 +1,251 @@
+"""
+Build stage for Arabic Wikipedia template construction.
+"""
+
+import logging
+from typing import Dict, Any, Optional
+from .base_builder import TemplateBuilderFactory, BuildResult
+
+logger = logging.getLogger(__name__)
+
+
+def construct_template(translated_data: dict, builder_name: str = 'arabic',
+ template_type: str = 'football_biography') -> BuildResult:
+ """
+ Build an Arabic Wikipedia template from translated data.
+
+ Args:
+ translated_data (dict): Data from translate stage with translated_fields
+ builder_name (str): Name of the builder to use ('arabic', 'arabic_football', etc.)
+ template_type (str): Type of template to build
+
+ Returns:
+ BuildResult: Template building result with Arabic template text
+ """
+ logger.info(f"Starting template build with builder: {builder_name}")
+
+ try:
+ # Create the appropriate builder
+ builder = TemplateBuilderFactory.create_builder(
+ builder_name,
+ template_type=template_type
+ )
+
+ # Check if builder is available
+ if not builder.is_available():
+ error_msg = f"Template builder {builder_name} is not available"
+ logger.error(error_msg)
+ return BuildResult(
+ template_text="",
+ template_type=template_type,
+ field_count=0,
+ success=False,
+ metadata={},
+ errors=[error_msg]
+ )
+
+ # Build the template
+ result = builder.construct_template(translated_data)
+
+ if result.success:
+ logger.info(f"Template build completed successfully: {result.field_count} fields")
+ else:
+ logger.error(f"Template build failed: {result.errors}")
+
+ return result
+
+ except Exception as e:
+ logger.error(f"Template building failed: {e}")
+ return BuildResult(
+ template_text="",
+ template_type=template_type,
+ field_count=0,
+ success=False,
+ metadata={},
+ errors=[str(e)]
+ )
+
+
+def construct_arabic_template(translated_data: dict, template_type: str = 'football_biography') -> BuildResult:
+ """
+ Convenience function to build Arabic templates.
+
+ Args:
+ translated_data (dict): Translated data from translate stage
+ template_type (str): Template type to build
+
+ Returns:
+ BuildResult: Arabic template building result
+ """
+ return construct_template(translated_data, 'arabic', template_type)
+
+
+def get_available_builders() -> list:
+ """
+ Get list of available template builders.
+
+ Returns:
+ list: List of available builder names
+ """
+ try:
+ return TemplateBuilderFactory.get_available_builders()
+ except Exception as e:
+ logger.error(f"Error getting available builders: {e}")
+ return []
+
+
+def get_supported_template_types() -> list:
+ """
+ Get list of supported template types.
+
+ Returns:
+ list: List of supported template type names
+ """
+ try:
+ return TemplateBuilderFactory.get_supported_template_types()
+ except Exception as e:
+ logger.error(f"Error getting supported template types: {e}")
+ return []
+
+
+def test_builder(builder_name: str = 'arabic') -> bool:
+ """
+ Test if a template builder is available and working.
+
+ Args:
+ builder_name (str): Name of the builder to test
+
+ Returns:
+ bool: True if builder is available and working
+ """
+ try:
+ builder = TemplateBuilderFactory.create_builder(builder_name)
+ return builder.is_available()
+ except Exception as e:
+ logger.error(f"Error testing builder {builder_name}: {e}")
+ return False
+
+
+def create_sample_arabic_template() -> str:
+ """
+ Create a sample Arabic Wikipedia template for testing.
+
+ Returns:
+ str: Sample Arabic template
+ """
+ return """{{صندوق سيرة لاعب كرة قدم
+| الاسم = بول أباسولو
+| الاسم الكامل = بول أباسولو أمانتيغي
+| تاريخ الميلاد = 29 يونيو 1984
+| مكان الميلاد = دورانغو، إسبانيا
+| الطول = 1.84 م
+| المركز = مهاجم
+| الأندية1 = نادي باسكونيا
+| سنوات اللاعب1 = 2002–2003
+| المباريات1 = 35
+| الأهداف1 = 5
+| الأندية2 = براكالدو
+| سنوات اللاعب2 = 2003–2004
+| المباريات2 = 24
+| الأهداف2 = 1
+}}"""
+
+
+def validate_arabic_template(template_text: str) -> Dict[str, Any]:
+ """
+ Validate an Arabic Wikipedia template.
+
+ Args:
+ template_text (str): Template text to validate
+
+ Returns:
+ dict: Validation results
+ """
+ errors = []
+ warnings = []
+
+ # Check basic structure
+ if not template_text.startswith('{{'):
+ errors.append("Template must start with '{{'")
+ if not template_text.endswith('}}'):
+ errors.append("Template must end with '}}'")
+
+ # Check for required fields (basic validation)
+ lines = template_text.split('\n')
+ field_count = 0
+
+ for line in lines:
+ line = line.strip()
+ if line.startswith('|') and '=' in line:
+ field_count += 1
+
+ if field_count == 0:
+ warnings.append("No fields found in template")
+
+ return {
+ 'valid': len(errors) == 0,
+ 'errors': errors,
+ 'warnings': warnings,
+ 'field_count': field_count,
+ 'template_length': len(template_text)
+ }
+
+
+def format_template_for_display(template_text: str) -> str:
+ """
+ Format template text for better display in logs or UI.
+
+ Args:
+ template_text (str): Raw template text
+
+ Returns:
+ str: Formatted template text
+ """
+ # Add line numbers for readability
+ lines = template_text.split('\n')
+ formatted_lines = []
+
+ for i, line in enumerate(lines, 1):
+ if line.strip():
+ formatted_lines.append(f"{i:2d}: {line}")
+ else:
+ formatted_lines.append("")
+
+ return '\n'.join(formatted_lines)
+
+
+def estimate_template_quality(template_text: str) -> Dict[str, Any]:
+ """
+ Estimate the quality of a generated template.
+
+ Args:
+ template_text (str): Template text to analyze
+
+ Returns:
+ dict: Quality metrics
+ """
+ # Basic quality metrics
+ field_count = template_text.count('|')
+ escaped_chars = template_text.count('{{!}}') + template_text.count('{{=}}')
+
+ # Check for common issues
+ issues = []
+ if '{{!}}' in template_text:
+ issues.append("Contains escaped pipes")
+ if '{{=}}' in template_text:
+ issues.append("Contains escaped equals signs")
+ if '\n\n\n' in template_text:
+ issues.append("Multiple consecutive empty lines")
+
+ # Calculate quality score (0-100)
+ base_score = min(100, field_count * 10) # 10 points per field, max 100
+ penalty = len(issues) * 10 # 10 point penalty per issue
+ quality_score = max(0, base_score - penalty)
+
+ return {
+ 'quality_score': quality_score,
+ 'field_count': field_count,
+ 'escaped_characters': escaped_chars,
+ 'issues': issues,
+ 'template_length': len(template_text)
+ }
\ No newline at end of file
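
For instance, the bundled sample passes the validator (a sketch; both helpers are defined in build.py above):

    from tasks.InfoboxSync.construct.build import (
        create_sample_arabic_template,
        validate_arabic_template,
    )

    report = validate_arabic_template(create_sample_arabic_template())
    print(report['valid'], report['field_count'])  # True 14
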
diff --git a/tasks/InfoboxSync/fetch/__init__.py b/tasks/InfoboxSync/fetch/__init__.py
new file mode 100644
index 00000000..48588eaf
--- /dev/null
+++ b/tasks/InfoboxSync/fetch/__init__.py
@@ -0,0 +1,63 @@
+"""Fetch stage module for Wikipedia infobox synchronization."""
+
+import logging
+from typing import Dict, Any
+
+from .sync_fetcher import WikipediaSyncFetcher
+from .models import PageInfo, SyncResult
+
+logger = logging.getLogger(__name__)
+
+# Main API functions
+def fetch_wikipedia_data(ar_page_title: str) -> Dict[str, Any]:
+ """
+ Main function to fetch Wikipedia data for sync operation.
+
+ Args:
+ ar_page_title: Arabic page title to sync
+
+ Returns:
+ Dictionary with Arabic and English page data
+ """
+ fetcher = WikipediaSyncFetcher()
+ return fetcher.fetch_arabic_and_english_pages(ar_page_title)
+
+
+def fetch_sync_result(ar_page_title: str) -> SyncResult:
+ """
+ Fetch synchronization result with structured return type.
+
+ Args:
+ ar_page_title: Title of the Arabic Wikipedia page
+
+ Returns:
+ SyncResult object with structured data
+ """
+ fetcher = WikipediaSyncFetcher()
+ return fetcher.fetch_sync_result(ar_page_title)
+
+
+# Legacy function for backward compatibility
+def fetch_data(url: str) -> dict:
+ """
+ Legacy function for backward compatibility.
+ Accepts a Wikipedia page URL and extracts the page title from it.
+ """
+ logger.warning("fetch_data(url) is deprecated. Use fetch_wikipedia_data(page_title) instead.")
+ # Extract page title from URL (simple implementation)
+ if 'wikipedia.org' in url:
+ page_title = url.split('/')[-1].replace('_', ' ')
+ return fetch_wikipedia_data(page_title)
+ else:
+ raise ValueError("URL must be a Wikipedia page URL")
+
+
+# Expose key classes for advanced usage
+__all__ = [
+ 'WikipediaSyncFetcher',
+ 'PageInfo',
+ 'SyncResult',
+ 'fetch_wikipedia_data',
+ 'fetch_sync_result',
+ 'fetch_data'
+]
\ No newline at end of file
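
A usage sketch of the public fetch API; running it requires a configured pywikibot installation with access to the ar/en wikis, and the page title is illustrative:

    from tasks.InfoboxSync.fetch import fetch_data, fetch_wikipedia_data

    data = fetch_wikipedia_data('بول أباسولو')
    if data['sync_possible']:
        print(data['english'].title)
    else:
        print(data['error'])

    # The deprecated entry point still works by stripping the title from a URL:
    data = fetch_data('https://ar.wikipedia.org/wiki/بول_أباسولو')
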
diff --git a/tasks/InfoboxSync/fetch/fetch.py b/tasks/InfoboxSync/fetch/fetch.py
new file mode 100644
index 00000000..3f027d00
--- /dev/null
+++ b/tasks/InfoboxSync/fetch/fetch.py
@@ -0,0 +1,241 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Any
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PageInfo:
+ """Data class for page information."""
+ title: str
+ exists: bool
+ content: Optional[str] = None
+ langlinks: Optional[Dict[str, str]] = None
+ error: Optional[str] = None
+
+
+class FetchObserver(ABC):
+ """Observer pattern for monitoring fetch operations."""
+
+ @abstractmethod
+ def on_page_check_start(self, page_title: str, site: str):
+ pass
+
+ @abstractmethod
+ def on_page_check_complete(self, page_info: PageInfo):
+ pass
+
+ @abstractmethod
+ def on_error(self, error: str):
+ pass
+
+
+class LoggingFetchObserver(FetchObserver):
+ """Logging implementation of fetch observer."""
+
+ def on_page_check_start(self, page_title: str, site: str):
+ logger.info(f"Starting page check for '{page_title}' on {site}")
+
+ def on_page_check_complete(self, page_info: PageInfo):
+ if page_info.exists:
+ logger.info(f"Page '{page_info.title}' found successfully")
+ else:
+ logger.warning(f"Page '{page_info.title}' not found")
+
+ def on_error(self, error: str):
+ logger.error(f"Fetch error: {error}")
+
+
+class WikipediaFetcher(ABC):
+ """Abstract base class for Wikipedia page fetchers using Template Method pattern."""
+
+ def __init__(self, observer: Optional[FetchObserver] = None):
+ self.observer = observer or LoggingFetchObserver()
+
+ def fetch_page_info(self, page_title: str) -> PageInfo:
+ """Template method for fetching page information."""
+ try:
+ self.observer.on_page_check_start(page_title, self.get_site_name())
+
+ page_info = self._check_page_exists(page_title)
+ if page_info.exists:
+ page_info = self._fetch_page_content(page_info)
+ page_info = self._fetch_langlinks(page_info)
+
+ self.observer.on_page_check_complete(page_info)
+ return page_info
+
+ except Exception as e:
+ error_msg = f"Error fetching page '{page_title}': {str(e)}"
+ self.observer.on_error(error_msg)
+ return PageInfo(title=page_title, exists=False, error=error_msg)
+
+ @abstractmethod
+ def get_site_name(self) -> str:
+ pass
+
+ @abstractmethod
+ def _check_page_exists(self, page_title: str) -> PageInfo:
+ pass
+
+ @abstractmethod
+ def _fetch_page_content(self, page_info: PageInfo) -> PageInfo:
+ pass
+
+ @abstractmethod
+ def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo:
+ pass
+
+
+class PywikibotFetcher(WikipediaFetcher):
+ """Pywikibot implementation of Wikipedia fetcher."""
+
+ def __init__(self, site_name: str, observer: Optional[FetchObserver] = None):
+ super().__init__(observer)
+ self.site_name = site_name
+ self.site = None
+ self._initialize_site()
+
+ def get_site_name(self) -> str:
+ return self.site_name
+
+ def _initialize_site(self):
+ """Initialize pywikibot site - lazy initialization."""
+ try:
+ import pywikibot
+ if self.site is None:
+ self.site = pywikibot.Site(self.site_name)
+ logger.info(f"Initialized pywikibot site: {self.site_name}")
+ except ImportError:
+ raise ImportError("pywikibot is required for Wikipedia operations. Install with: pip install pywikibot")
+
+ def _check_page_exists(self, page_title: str) -> PageInfo:
+ """Check if page exists on the wiki site."""
+ try:
+ import pywikibot
+ page = pywikibot.Page(self.site, page_title)
+ exists = page.exists()
+ return PageInfo(
+ title=page_title,
+ exists=exists,
+ content=page.text if exists else None
+ )
+ except Exception as e:
+ logger.error(f"Error checking page existence: {e}")
+ return PageInfo(title=page_title, exists=False, error=str(e))
+
+ def _fetch_page_content(self, page_info: PageInfo) -> PageInfo:
+ """Fetch full page content."""
+ # Content is already fetched in _check_page_exists for efficiency
+ return page_info
+
+ def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo:
+ """Fetch language links (interwiki links)."""
+ try:
+ import pywikibot
+ if page_info.exists:
+ page = pywikibot.Page(self.site, page_info.title)
+ langlinks = {}
+ for langlink in page.langlinks():
+ langlinks[langlink.site.code] = langlink.title
+ page_info.langlinks = langlinks
+ return page_info
+ except Exception as e:
+ logger.error(f"Error fetching langlinks: {e}")
+ page_info.langlinks = {}
+ return page_info
+
+
+class WikipediaSyncFetcher:
+ """Main fetcher class using Strategy pattern for different fetch strategies."""
+
+ def __init__(self, observer: Optional[FetchObserver] = None):
+ self.observer = observer or LoggingFetchObserver()
+ self.ar_fetcher = PywikibotFetcher('ar', self.observer)
+ self.en_fetcher = PywikibotFetcher('en', self.observer)
+
+ def fetch_arabic_and_english_pages(self, ar_page_title: str) -> Dict[str, Any]:
+ """
+ Fetch Arabic page and corresponding English page if it exists.
+
+ Args:
+ ar_page_title: Title of the Arabic Wikipedia page
+
+ Returns:
+ Dict containing both Arabic and English page information
+ """
+ logger.info(f"Starting sync fetch for Arabic page: {ar_page_title}")
+
+ # Step 1: Check Arabic page
+ ar_page_info = self.ar_fetcher.fetch_page_info(ar_page_title)
+
+ if not ar_page_info.exists:
+ return {
+ 'arabic': ar_page_info,
+ 'english': None,
+ 'sync_possible': False,
+ 'error': f"Arabic page '{ar_page_title}' does not exist"
+ }
+
+ # Step 2: Find corresponding English page
+ en_page_title = self._find_english_page_title(ar_page_info)
+
+ if not en_page_title:
+ return {
+ 'arabic': ar_page_info,
+ 'english': None,
+ 'sync_possible': False,
+ 'error': f"No corresponding English page found for '{ar_page_title}'"
+ }
+
+ # Step 3: Fetch English page
+ en_page_info = self.en_fetcher.fetch_page_info(en_page_title)
+
+ return {
+ 'arabic': ar_page_info,
+ 'english': en_page_info,
+ 'sync_possible': en_page_info.exists,
+ 'error': None if en_page_info.exists else f"English page '{en_page_title}' does not exist"
+ }
+
+ def _find_english_page_title(self, ar_page_info: PageInfo) -> Optional[str]:
+ """Find the corresponding English page title."""
+ # Method 1: Check langlinks from Arabic page
+ if ar_page_info.langlinks and 'en' in ar_page_info.langlinks:
+ return ar_page_info.langlinks['en']
+
+ # Method 2: Try direct title match (for pages with same name in both languages)
+ # This is a fallback - in reality you'd want more sophisticated matching
+ logger.warning(f"No direct English langlink found for '{ar_page_info.title}', trying direct match")
+ return ar_page_info.title
+
+
+def fetch_wikipedia_data(ar_page_title: str) -> Dict[str, Any]:
+ """
+ Main function to fetch Wikipedia data for sync operation.
+
+ Args:
+ ar_page_title: Arabic page title to sync
+
+ Returns:
+ Dictionary with Arabic and English page data
+ """
+ fetcher = WikipediaSyncFetcher()
+ return fetcher.fetch_arabic_and_english_pages(ar_page_title)
+
+
+# Legacy function for backward compatibility
+def fetch_data(url: str) -> dict:
+ """
+ Legacy function for backward compatibility.
+ Now expects a page title instead of URL.
+ """
+ logger.warning("fetch_data(url) is deprecated. Use fetch_wikipedia_data(page_title) instead.")
+ # Extract page title from URL (simple implementation)
+ if 'wikipedia.org' in url:
+ page_title = url.split('/')[-1].replace('_', ' ')
+ return fetch_wikipedia_data(page_title)
+ else:
+ raise ValueError("URL must be a Wikipedia page URL")
\ No newline at end of file
diff --git a/tasks/InfoboxSync/fetch/interfaces.py b/tasks/InfoboxSync/fetch/interfaces.py
new file mode 100644
index 00000000..99dede1a
--- /dev/null
+++ b/tasks/InfoboxSync/fetch/interfaces.py
@@ -0,0 +1,50 @@
+"""Abstract interfaces for the fetch stage."""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Optional
+from .models import PageInfo
+from .observers import FetchObserver, LoggingFetchObserver
+
+logger = logging.getLogger(__name__)
+
+
+class WikipediaFetcher(ABC):
+ """Abstract base class for Wikipedia page fetchers using Template Method."""
+
+ def __init__(self, observer: Optional[FetchObserver] = None):
+ self.observer = observer or LoggingFetchObserver()
+
+ def fetch_page_info(self, page_title: str) -> PageInfo:
+ """Template method for fetching page information."""
+ try:
+ self.observer.on_page_check_start(page_title, self.get_site_name())
+
+ page_info = self._check_page_exists(page_title)
+ if page_info.exists:
+ page_info = self._fetch_page_content(page_info)
+ page_info = self._fetch_langlinks(page_info)
+
+ self.observer.on_page_check_complete(page_info)
+ return page_info
+
+ except Exception as e:
+ error_msg = f"Error fetching page '{page_title}': {str(e)}"
+ self.observer.on_error(error_msg)
+ return PageInfo(title=page_title, exists=False, error=error_msg)
+
+ @abstractmethod
+ def get_site_name(self) -> str:
+ pass
+
+ @abstractmethod
+ def _check_page_exists(self, page_title: str) -> PageInfo:
+ pass
+
+ @abstractmethod
+ def _fetch_page_content(self, page_info: PageInfo) -> PageInfo:
+ pass
+
+ @abstractmethod
+ def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo:
+ pass
\ No newline at end of file
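
Because fetch_page_info is a Template Method, a test double only has to fill in the four hooks. A minimal in-memory sketch (not part of the shipped code):

    from tasks.InfoboxSync.fetch.interfaces import WikipediaFetcher
    from tasks.InfoboxSync.fetch.models import PageInfo


    class InMemoryFetcher(WikipediaFetcher):
        """Serves pages from a dict; handy for unit tests."""

        def __init__(self, pages: dict):
            super().__init__()
            self.pages = pages  # title -> wikitext

        def get_site_name(self) -> str:
            return 'memory'

        def _check_page_exists(self, page_title: str) -> PageInfo:
            return PageInfo(title=page_title,
                            exists=page_title in self.pages,
                            content=self.pages.get(page_title))

        def _fetch_page_content(self, page_info: PageInfo) -> PageInfo:
            return page_info  # content already loaded above

        def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo:
            page_info.langlinks = {}
            return page_info


    info = InMemoryFetcher({'Paul Abasolo': '...'}).fetch_page_info('Paul Abasolo')
    print(info.exists)  # True
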
diff --git a/tasks/InfoboxSync/fetch/models.py b/tasks/InfoboxSync/fetch/models.py
new file mode 100644
index 00000000..5665fb11
--- /dev/null
+++ b/tasks/InfoboxSync/fetch/models.py
@@ -0,0 +1,23 @@
+"""Data models for the fetch stage."""
+
+from dataclasses import dataclass
+from typing import Dict, Optional
+
+
+@dataclass
+class PageInfo:
+ """Data class for page information."""
+ title: str
+ exists: bool
+ content: Optional[str] = None
+ langlinks: Optional[Dict[str, str]] = None
+ error: Optional[str] = None
+
+
+@dataclass
+class SyncResult:
+ """Data class for synchronization results."""
+ arabic: PageInfo
+ english: Optional[PageInfo]
+ sync_possible: bool
+ error: Optional[str] = None
\ No newline at end of file
diff --git a/tasks/InfoboxSync/fetch/observers.py b/tasks/InfoboxSync/fetch/observers.py
new file mode 100644
index 00000000..ea0486c9
--- /dev/null
+++ b/tasks/InfoboxSync/fetch/observers.py
@@ -0,0 +1,67 @@
+"""Observer pattern implementation for monitoring fetch operations."""
+
+import logging
+from abc import ABC, abstractmethod
+from .models import PageInfo
+
+logger = logging.getLogger(__name__)
+
+
+class FetchObserver(ABC):
+ """Observer pattern for monitoring fetch operations."""
+
+ @abstractmethod
+ def on_page_check_start(self, page_title: str, site: str):
+ pass
+
+ @abstractmethod
+ def on_page_check_complete(self, page_info: PageInfo):
+ pass
+
+ @abstractmethod
+ def on_error(self, error: str):
+ pass
+
+
+class LoggingFetchObserver(FetchObserver):
+ """Logging implementation of fetch observer."""
+
+ def on_page_check_start(self, page_title: str, site: str):
+ logger.info(f"Starting page check for '{page_title}' on {site}")
+
+ def on_page_check_complete(self, page_info: PageInfo):
+ if page_info.exists:
+ logger.info(f"Page '{page_info.title}' found successfully")
+ else:
+ logger.warning(f"Page '{page_info.title}' not found")
+
+ def on_error(self, error: str):
+ logger.error(f"Fetch error: {error}")
+
+
+class MetricsFetchObserver(FetchObserver):
+ """Metrics collection implementation of fetch observer."""
+
+ def __init__(self):
+ self.metrics = {
+ 'pages_checked': 0,
+ 'pages_found': 0,
+ 'pages_not_found': 0,
+ 'errors': 0
+ }
+
+ def on_page_check_start(self, page_title: str, site: str):
+ self.metrics['pages_checked'] += 1
+
+ def on_page_check_complete(self, page_info: PageInfo):
+ if page_info.exists:
+ self.metrics['pages_found'] += 1
+ else:
+ self.metrics['pages_not_found'] += 1
+
+ def on_error(self, error: str):
+ self.metrics['errors'] += 1
+
+ def get_metrics(self) -> dict:
+ """Get current metrics."""
+ return self.metrics.copy()
\ No newline at end of file
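
Swapping observers changes what gets recorded without touching the fetchers. A sketch pairing MetricsFetchObserver with the sync fetcher defined later in this diff (needs pywikibot at runtime; the title is illustrative):

    from tasks.InfoboxSync.fetch.observers import MetricsFetchObserver
    from tasks.InfoboxSync.fetch.sync_fetcher import WikipediaSyncFetcher

    metrics = MetricsFetchObserver()
    fetcher = WikipediaSyncFetcher(observer=metrics)
    fetcher.fetch_arabic_and_english_pages('بول أباسولو')
    print(metrics.get_metrics())
    # e.g. {'pages_checked': 2, 'pages_found': 2, 'pages_not_found': 0, 'errors': 0}
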
diff --git a/tasks/InfoboxSync/fetch/pywikibot_fetcher.py b/tasks/InfoboxSync/fetch/pywikibot_fetcher.py
new file mode 100644
index 00000000..8c970773
--- /dev/null
+++ b/tasks/InfoboxSync/fetch/pywikibot_fetcher.py
@@ -0,0 +1,71 @@
+"""Pywikibot implementation of Wikipedia fetcher."""
+
+import logging
+from typing import Optional
+from .interfaces import WikipediaFetcher
+from .models import PageInfo
+from .observers import FetchObserver
+
+logger = logging.getLogger(__name__)
+
+
+class PywikibotFetcher(WikipediaFetcher):
+ """Pywikibot implementation of Wikipedia fetcher."""
+
+ def __init__(self, site_name: str,
+ observer: Optional[FetchObserver] = None):
+ super().__init__(observer)
+ self.site_name = site_name
+ self.site = None
+ self._initialize_site()
+
+ def get_site_name(self) -> str:
+ return self.site_name
+
+ def _initialize_site(self):
+ """Initialize pywikibot site - lazy initialization."""
+ try:
+ import pywikibot
+ if self.site is None:
+ self.site = pywikibot.Site(self.site_name)
+ logger.info(f"Initialized pywikibot site: {self.site_name}")
+ except ImportError:
+ msg = ("pywikibot is required for Wikipedia operations. "
+ "Install with: pip install pywikibot")
+ raise ImportError(msg)
+
+ def _check_page_exists(self, page_title: str) -> PageInfo:
+ """Check if page exists on the wiki site."""
+ try:
+ import pywikibot
+ page = pywikibot.Page(self.site, page_title)
+ exists = page.exists()
+ return PageInfo(
+ title=page_title,
+ exists=exists,
+ content=page.text if exists else None
+ )
+ except Exception as e:
+ logger.error(f"Error checking page existence: {e}")
+ return PageInfo(title=page_title, exists=False, error=str(e))
+
+ def _fetch_page_content(self, page_info: PageInfo) -> PageInfo:
+ """Fetch full page content."""
+ # Content is already fetched in _check_page_exists for efficiency
+ return page_info
+
+ def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo:
+ """Fetch language links (interwiki links)."""
+ try:
+ import pywikibot
+ if page_info.exists:
+ page = pywikibot.Page(self.site, page_info.title)
+ langlinks = {}
+ for langlink in page.langlinks():
+ langlinks[langlink.site.code] = langlink.title
+ page_info.langlinks = langlinks
+ return page_info
+ except Exception as e:
+ logger.error(f"Error fetching langlinks: {e}")
+ page_info.langlinks = {}
+ return page_info
\ No newline at end of file
diff --git a/tasks/InfoboxSync/fetch/sync_fetcher.py b/tasks/InfoboxSync/fetch/sync_fetcher.py
new file mode 100644
index 00000000..dad7f89a
--- /dev/null
+++ b/tasks/InfoboxSync/fetch/sync_fetcher.py
@@ -0,0 +1,87 @@
+"""Main synchronization fetcher using Strategy pattern."""
+
+import logging
+from typing import Dict, Any, Optional
+from .models import PageInfo, SyncResult
+from .observers import FetchObserver, LoggingFetchObserver
+from .pywikibot_fetcher import PywikibotFetcher
+
+logger = logging.getLogger(__name__)
+
+
+class WikipediaSyncFetcher:
+ """Main fetcher class using Strategy pattern."""
+
+ def __init__(self, observer: Optional[FetchObserver] = None):
+ self.observer = observer or LoggingFetchObserver()
+ self.ar_fetcher = PywikibotFetcher('ar', self.observer)
+ self.en_fetcher = PywikibotFetcher('en', self.observer)
+
+ def fetch_arabic_and_english_pages(self,
+ ar_page_title: str) -> Dict[str, Any]:
+ """Fetch Arabic page and corresponding English page."""
+ logger.info(f"Starting sync fetch for Arabic page: {ar_page_title}")
+
+ # Step 1: Check Arabic page
+ ar_page_info = self.ar_fetcher.fetch_page_info(ar_page_title)
+
+ if not ar_page_info.exists:
+ return {
+ 'arabic': ar_page_info,
+ 'english': None,
+ 'sync_possible': False,
+ 'error': f"Arabic page '{ar_page_title}' does not exist"
+ }
+
+ # Step 2: Find corresponding English page
+ en_page_title = self._find_english_page_title(ar_page_info)
+
+ if not en_page_title:
+ error_msg = (
+ f"No corresponding English page found for '{ar_page_title}'"
+ )
+ return {
+ 'arabic': ar_page_info,
+ 'english': None,
+ 'sync_possible': False,
+ 'error': error_msg
+ }
+
+ # Step 3: Fetch English page
+ en_page_info = self.en_fetcher.fetch_page_info(en_page_title)
+
+ error_msg = None
+ if not en_page_info.exists:
+ error_msg = f"English page '{en_page_title}' does not exist"
+
+ return {
+ 'arabic': ar_page_info,
+ 'english': en_page_info,
+ 'sync_possible': en_page_info.exists,
+ 'error': error_msg
+ }
+
+ def _find_english_page_title(self,
+ ar_page_info: PageInfo) -> Optional[str]:
+ """Find the corresponding English page title."""
+ # Method 1: Check langlinks from Arabic page
+ if ar_page_info.langlinks and 'en' in ar_page_info.langlinks:
+ return ar_page_info.langlinks['en']
+
+ # Method 2: Try direct title match
+ # This is a fallback - in reality you'd want more sophisticated
+ # matching
+ msg = f"No direct English langlink found for '{ar_page_info.title}'"
+ logger.warning(f"{msg}, trying direct match")
+ return ar_page_info.title
+
+ def fetch_sync_result(self, ar_page_title: str) -> SyncResult:
+ """Fetch synchronization result with structured return type."""
+ result = self.fetch_arabic_and_english_pages(ar_page_title)
+
+ return SyncResult(
+ arabic=result['arabic'],
+ english=result['english'],
+ sync_possible=result['sync_possible'],
+ error=result['error']
+ )
\ No newline at end of file
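
A sketch of the structured entry point (same pywikibot caveat and illustrative title as above):

    from tasks.InfoboxSync.fetch.sync_fetcher import WikipediaSyncFetcher

    result = WikipediaSyncFetcher().fetch_sync_result('بول أباسولو')
    if result.sync_possible:
        print(result.english.title)
    else:
        print(result.error)
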
diff --git a/tasks/InfoboxSync/map/__init__.py b/tasks/InfoboxSync/map/__init__.py
new file mode 100644
index 00000000..4bf46847
--- /dev/null
+++ b/tasks/InfoboxSync/map/__init__.py
@@ -0,0 +1 @@
+# Map stage package
\ No newline at end of file
diff --git a/tasks/InfoboxSync/map/field_mappers.py b/tasks/InfoboxSync/map/field_mappers.py
new file mode 100644
index 00000000..ae9f8e9b
--- /dev/null
+++ b/tasks/InfoboxSync/map/field_mappers.py
@@ -0,0 +1,440 @@
+"""
+Field mapping strategies for different data types in Wikipedia infoboxes.
+"""
+
+import logging
+import re
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional, List
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+
+
+class FieldMapper(ABC):
+ """
+ Abstract base class for field mapping strategies.
+ Each field type (text, number, image, link, mixed) has its own mapper.
+ """
+
+ def __init__(self, english_key: str, arabic_key: str, field_type: str):
+ """
+ Initialize the field mapper.
+
+ Args:
+ english_key (str): English field name from infobox
+ arabic_key (str): Corresponding Arabic field name
+ field_type (str): Type of field (text, number, image, link, mixed)
+ """
+ self.english_key = english_key
+ self.arabic_key = arabic_key
+ self.field_type = field_type
+
+ @abstractmethod
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """
+ Map a field value to the standardized format.
+
+ Args:
+ value (str): Raw field value from infobox
+
+ Returns:
+ Dict[str, Any]: Mapped field data with Arabic key
+ """
+ pass
+
+ def _clean_value(self, value: str) -> str:
+ """Clean and normalize field value."""
+ if not value:
+ return ""
+ return value.strip()
+
+
+class NumberedFieldMapper(FieldMapper):
+ """
+ Mapper for numbered fields that follow a pattern (field1, field2, field3, ...).
+ Groups related numbered fields into arrays/lists.
+ """
+
+ def __init__(self, base_english_key: str, arabic_key: str, field_type: str = "text"):
+ # Store the base key without number (e.g., "years" not "years1")
+ self.base_english_key = base_english_key
+ super().__init__(base_english_key, arabic_key, "numbered")
+ self.item_field_type = field_type
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """Map numbered field - this is handled by the template mapper."""
+ # This method is not used directly for numbered fields
+ # The template mapper handles the grouping logic
+ return {}
+
+ def map_numbered_fields(self, infobox_data: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Map all numbered fields for this base key.
+
+ Args:
+ infobox_data: All infobox fields
+
+ Returns:
+ Dict with Arabic key containing array of numbered field values
+ """
+ numbered_values = []
+
+ # Find all fields that match the pattern: base_key + number
+ for key, value in infobox_data.items():
+ if key.startswith(self.base_english_key):
+ # Extract the number from the key
+ number_part = key[len(self.base_english_key):]
+ if number_part.isdigit():
+ number = int(number_part)
+ numbered_values.append({
+ "number": number,
+ "value": value,
+ "original_key": key
+ })
+
+ # Sort by number
+ numbered_values.sort(key=lambda x: x["number"])
+
+ # Extract just the values in order
+ values_only = [item["value"] for item in numbered_values]
+
+ return {
+ self.arabic_key: {
+ "value": values_only,
+ "type": "numbered",
+ "item_type": self.item_field_type,
+ "count": len(values_only),
+ "original_keys": [item["original_key"] for item in numbered_values]
+ }
+ }
+
+
+class TextFieldMapper(FieldMapper):
+ """
+ Mapper for text fields (names, descriptions, etc.).
+ """
+
+ def __init__(self, english_key: str, arabic_key: str):
+ super().__init__(english_key, arabic_key, "text")
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """Map text field value."""
+ clean_value = self._clean_value(value)
+
+ return {
+ self.arabic_key: {
+ "value": clean_value,
+ "type": "text",
+ "original_key": self.english_key,
+ "validation": self._validate_text(clean_value)
+ }
+ }
+
+ def _validate_text(self, value: str) -> Dict[str, Any]:
+ """Validate text field."""
+ return {
+ "is_valid": len(value) > 0,
+ "length": len(value),
+ "has_special_chars": bool(re.search(r'[^\w\s]', value))
+ }
+
+
+class NumberFieldMapper(FieldMapper):
+ """
+ Mapper for numeric fields (ages, years, counts, etc.).
+ """
+
+ def __init__(self, english_key: str, arabic_key: str):
+ super().__init__(english_key, arabic_key, "number")
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """Map numeric field value."""
+ clean_value = self._clean_value(value)
+ numeric_value = self._extract_number(clean_value)
+
+ return {
+ self.arabic_key: {
+ "value": numeric_value,
+ "type": "number",
+ "original_key": self.english_key,
+ "validation": self._validate_number(clean_value),
+ "numeric_value": numeric_value
+ }
+ }
+
+ def _extract_number(self, value: str) -> Optional[float]:
+ """Extract numeric value from string."""
+ if not value:
+ return None
+
+ # Remove common wiki formatting
+ value = re.sub(r'\[\[|\]\]', '', value)
+ value = re.sub(r'<[^>]+>', '', value)
+
+ # Find first number (integer or decimal)
+ match = re.search(r'(\d+(?:\.\d+)?)', value)
+ if match:
+ return float(match.group(1))
+ return None
+
+ def _validate_number(self, value: str) -> Dict[str, Any]:
+ """Validate numeric field."""
+ numeric_value = self._extract_number(value)
+ return {
+ "is_valid": numeric_value is not None,
+ "numeric_value": numeric_value,
+ "has_units": bool(re.search(r'\d+\s*\w+', value))
+ }
+
+
+class ImageFieldMapper(FieldMapper):
+ """
+ Mapper for image fields.
+ """
+
+ def __init__(self, english_key: str, arabic_key: str):
+ super().__init__(english_key, arabic_key, "image")
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """Map image field value."""
+ clean_value = self._clean_value(value)
+ image_info = self._parse_image(clean_value)
+
+ return {
+ self.arabic_key: {
+ "value": image_info["filename"],
+ "type": "image",
+ "original_key": self.english_key,
+ "validation": self._validate_image(clean_value),
+ "image_info": image_info
+ }
+ }
+
+ def _parse_image(self, value: str) -> Dict[str, Any]:
+ """Parse image field to extract filename and caption."""
+ if not value:
+ return {"filename": "", "caption": ""}
+
+ # Handle wiki image syntax [[File:filename.jpg|caption]]
+ file_match = re.search(r'\[\[File:([^|\]]+)(?:\|([^]]+))?\]\]', value, re.IGNORECASE)
+ if file_match:
+ return {
+ "filename": file_match.group(1),
+ "caption": file_match.group(2) or ""
+ }
+
+ # Handle simple filename
+ return {"filename": value, "caption": ""}
+
+ def _validate_image(self, value: str) -> Dict[str, Any]:
+ """Validate image field."""
+ image_info = self._parse_image(value)
+ return {
+ "is_valid": bool(image_info["filename"]),
+ "has_caption": bool(image_info["caption"]),
+ "filename": image_info["filename"]
+ }
+
+
+class LinkFieldMapper(FieldMapper):
+ """
+ Mapper for link fields (internal/external links).
+ """
+
+ def __init__(self, english_key: str, arabic_key: str):
+ super().__init__(english_key, arabic_key, "link")
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """Map link field value."""
+ clean_value = self._clean_value(value)
+ link_info = self._parse_link(clean_value)
+
+ return {
+ self.arabic_key: {
+ "value": link_info["url"],
+ "type": "link",
+ "original_key": self.english_key,
+ "validation": self._validate_link(clean_value),
+ "link_info": link_info
+ }
+ }
+
+ def _parse_link(self, value: str) -> Dict[str, Any]:
+ """Parse link to extract URL and display text."""
+ if not value:
+ return {"url": "", "display_text": "", "is_external": False}
+
+ # Handle wiki internal links [[Page|Display Text]]
+ internal_match = re.search(r'\[\[([^|\]]+)(?:\|([^]]+))?\]\]', value)
+ if internal_match:
+ return {
+ "url": internal_match.group(1),
+ "display_text": internal_match.group(2) or internal_match.group(1),
+ "is_external": False
+ }
+
+ # Handle external links [http://example.com Display Text]
+ external_match = re.search(r'\[([^\s]+)(?:\s([^]]+))?\]', value)
+ if external_match:
+ return {
+ "url": external_match.group(1),
+ "display_text": external_match.group(2) or external_match.group(1),
+ "is_external": True
+ }
+
+ # Plain text that might be a URL
+ if value.startswith(('http://', 'https://')):
+ return {
+ "url": value,
+ "display_text": value,
+ "is_external": True
+ }
+
+ return {"url": value, "display_text": value, "is_external": False}
+
+ def _validate_link(self, value: str) -> Dict[str, Any]:
+ """Validate link field."""
+ link_info = self._parse_link(value)
+ is_valid_url = False
+
+ if link_info["is_external"]:
+ try:
+ parsed = urlparse(link_info["url"])
+ is_valid_url = bool(parsed.netloc)
+ except Exception:
+ is_valid_url = False
+
+ return {
+ "is_valid": bool(link_info["url"]),
+ "is_external": link_info["is_external"],
+ "is_valid_url": is_valid_url,
+ "has_display_text": link_info["display_text"] != link_info["url"]
+ }
+
+
+class MixedFieldMapper(FieldMapper):
+ """
+ Mapper for mixed content fields (containing multiple data types).
+ """
+
+ def __init__(self, english_key: str, arabic_key: str):
+ super().__init__(english_key, arabic_key, "mixed")
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """Map mixed field value."""
+ clean_value = self._clean_value(value)
+ parsed_content = self._parse_mixed_content(clean_value)
+
+ return {
+ self.arabic_key: {
+ "value": clean_value,
+ "type": "mixed",
+ "original_key": self.english_key,
+ "validation": self._validate_mixed(clean_value),
+ "parsed_content": parsed_content
+ }
+ }
+
+ def _parse_mixed_content(self, value: str) -> Dict[str, Any]:
+ """Parse mixed content to identify different elements."""
+ if not value:
+ return {"text_parts": [], "links": [], "images": [], "numbers": []}
+
+ text_parts = []
+ links = []
+ images = []
+ numbers = []
+
+ # Find links
+ link_matches = re.findall(r'\[\[[^\]]+\]\]', value)
+ links.extend(link_matches)
+
+ # Find images
+ image_matches = re.findall(r'\[\[File:[^\]]+\]\]', value, re.IGNORECASE)
+ images.extend(image_matches)
+
+ # Find numbers
+ number_matches = re.findall(r'\d+(?:\.\d+)?', value)
+ numbers.extend(number_matches)
+
+ # Remove wiki markup for clean text
+ clean_text = re.sub(r'\[\[[^\]]+\]\]', '', value)
+ clean_text = re.sub(r'<[^>]+>', '', clean_text)
+ text_parts = [part.strip() for part in clean_text.split() if part.strip()]
+
+ return {
+ "text_parts": text_parts,
+ "links": links,
+ "images": images,
+ "numbers": numbers
+ }
+
+ def _validate_mixed(self, value: str) -> Dict[str, Any]:
+ """Validate mixed field."""
+ parsed = self._parse_mixed_content(value)
+ return {
+ "is_valid": len(value) > 0,
+ "has_links": len(parsed["links"]) > 0,
+ "has_images": len(parsed["images"]) > 0,
+ "has_numbers": len(parsed["numbers"]) > 0,
+ "text_parts_count": len(parsed["text_parts"])
+ }
+
+
+class RawFieldMapper(FieldMapper):
+ """
+ Mapper for raw fields; passes the value through as-is, without preprocessing.
+ """
+
+ def __init__(self, english_key: str, arabic_key: str):
+ super().__init__(english_key, arabic_key, "raw")
+
+ def map_field(self, value: str) -> Dict[str, Any]:
+ """Map raw field value without any processing."""
+ return {
+ self.arabic_key: {
+ "value": value,
+ "type": "raw",
+ "original_key": self.english_key,
+ "validation": {"is_valid": True}
+ }
+ }
+
+
+class FieldMapperFactory:
+ """
+ Factory for creating appropriate field mappers.
+ """
+
+ @staticmethod
+ def create_mapper(english_key: str, arabic_key: str, field_type: str) -> FieldMapper:
+ """
+ Create appropriate field mapper based on type.
+
+ Args:
+ english_key (str): English field name
+ arabic_key (str): Arabic field name
+ field_type (str): Type of field mapper to create
+
+ Returns:
+ FieldMapper: Appropriate field mapper instance
+ """
+ field_type = field_type.lower()
+ if field_type == "text":
+ return TextFieldMapper(english_key, arabic_key)
+ elif field_type == "number":
+ return NumberFieldMapper(english_key, arabic_key)
+ elif field_type == "image":
+ return ImageFieldMapper(english_key, arabic_key)
+ elif field_type == "link":
+ return LinkFieldMapper(english_key, arabic_key)
+ elif field_type == "mixed":
+ return MixedFieldMapper(english_key, arabic_key)
+ elif field_type == "numbered":
+ return NumberedFieldMapper(english_key, arabic_key)
+ elif field_type == "raw":
+ return RawFieldMapper(english_key, arabic_key)
+ else:
+ # Default to text mapper
+ return TextFieldMapper(english_key, arabic_key)
\ No newline at end of file
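
The grouping behaviour of NumberedFieldMapper is the heart of the career tables: it collects base-key-plus-digit fields, sorts them by number, and emits one Arabic key holding the ordered list. A sketch with illustrative values:

    from tasks.InfoboxSync.map.field_mappers import NumberedFieldMapper

    mapper = NumberedFieldMapper('clubs', 'أندية', field_type='raw')
    result = mapper.map_numbered_fields({
        'clubs2': '[[Barakaldo CF|Barakaldo]]',
        'clubs1': '[[CD Basconia|Basconia]]',
        'name': 'Paul Abasolo',  # ignored: not base key + digits
    })
    print(result['أندية']['value'])
    # ['[[CD Basconia|Basconia]]', '[[Barakaldo CF|Barakaldo]]']
    print(result['أندية']['original_keys'])  # ['clubs1', 'clubs2']
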
diff --git a/tasks/InfoboxSync/map/map.py b/tasks/InfoboxSync/map/map.py
new file mode 100644
index 00000000..ee01ae48
--- /dev/null
+++ b/tasks/InfoboxSync/map/map.py
@@ -0,0 +1,131 @@
+"""
+Map stage for Wikipedia infobox synchronization using Strategy Pattern.
+"""
+
+import logging
+from .template_mapper import TemplateMapperFactory
+
+logger = logging.getLogger(__name__)
+
+
+def map_data(parsed_data: dict,
+ template_type: str = 'football_biography') -> dict:
+ """
+ Map the parsed data to a standardized format with Arabic field names.
+
+ Args:
+ parsed_data (dict): The parsed data from the parse stage.
+ template_type (str): Type of template ('football_biography',
+ 'person', etc.)
+
+ Returns:
+ dict: Mapped data in standardized format with Arabic field names.
+ """
+ msg = "Starting data mapping for template type: {}".format(template_type)
+ logger.info(msg)
+
+ try:
+ page_title = parsed_data.get('title', '')
+ infobox_data = parsed_data.get('infobox', {})
+
+ # Create appropriate template mapper
+ template_mapper = TemplateMapperFactory.create_mapper(template_type)
+
+ # Map the infobox data using the template mapper
+ mapped_infobox = template_mapper.map_infobox(infobox_data)
+
+ # Build the final mapped data structure
+ mapped_data = {
+ 'page_title': page_title,
+ 'template_type': template_type,
+ 'arabic_fields': mapped_infobox['mapped_fields'],
+ 'metadata': {
+ 'categories': parsed_data.get('categories', []),
+ 'links': parsed_data.get('links', []),
+ 'template_name': mapped_infobox['template_name'],
+ 'total_mapped_fields': mapped_infobox['total_mapped_fields'],
+ 'original_field_count': mapped_infobox['original_field_count']
+ },
+ 'raw_content': parsed_data.get('raw_content', ''),
+ 'arabic_title': parsed_data.get('arabic_title', '')
+ }
+
+ logger.info("Successfully mapped data for: {}".format(page_title))
+ msg = ("Mapped {} fields out of {} original fields").format(
+ mapped_infobox['total_mapped_fields'],
+ mapped_infobox['original_field_count'])
+ logger.info(msg)
+
+ return mapped_data
+
+ except Exception as e:
+ logger.error("Error mapping data: {}".format(e))
+ raise
+
+
+def get_supported_template_types() -> list:
+ """
+ Get list of supported template types for mapping.
+
+ Returns:
+ list: List of supported template type strings
+ """
+ return TemplateMapperFactory.get_supported_templates()
+
+
+def create_field_demo(template_type: str = 'football_biography') -> dict:
+ """
+ Create a demo showing different field types for a template.
+
+ Args:
+ template_type (str): Type of template to create demo for
+
+ Returns:
+ dict: Demo data showing different field types
+ """
+ if template_type == 'football_biography':
+ return {
+ "name": "Lionel Messi", # text field
+ "height": "1.70 m", # number field
+ # image field
+ "image": "[[File:Messi_vs_Nigeria_2018.jpg|Messi playing]]",
+ # link field
+ "website": "[http://www.messi.com Official Website]",
+ # mixed field
+ "position": "[[Forward (association football)|Forward]]",
+ "clubnumber": "10", # number field
+ "caps1": "520", # number field
+ "goals1": "474" # number field
+ }
+
+ return {}
+
+
+def demonstrate_field_types():
+ """
+ Demonstrate how different field types are mapped.
+ """
+ logger.info("Demonstrating field type mapping...")
+
+ # Create demo data
+ demo_data = create_field_demo('football_biography')
+
+ # Map the demo data
+ try:
+ mapped_result = map_data({
+ 'title': 'Demo Football Player',
+ 'infobox': demo_data,
+ 'categories': ['Football players'],
+ 'links': ['Argentina national football team'],
+ 'arabic_title': 'لاعب كرة قدم تجريبي'
+ }, 'football_biography')
+
+ logger.info("Demo mapping completed successfully")
+ arabic_fields = list(mapped_result['arabic_fields'].keys())
+ logger.info("Arabic fields: {}".format(arabic_fields))
+
+ return mapped_result
+
+ except Exception as e:
+ logger.error("Demo mapping failed: {}".format(e))
+ return {}
\ No newline at end of file
diff --git a/tasks/InfoboxSync/map/template_mapper.py b/tasks/InfoboxSync/map/template_mapper.py
new file mode 100644
index 00000000..e612401c
--- /dev/null
+++ b/tasks/InfoboxSync/map/template_mapper.py
@@ -0,0 +1,279 @@
+"""
+Template mapper classes for mapping English infobox fields to Arabic equivalents.
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Any, List
+from .field_mappers import FieldMapperFactory, FieldMapper, NumberedFieldMapper
+
+logger = logging.getLogger(__name__)
+
+
+class TemplateMapper(ABC):
+ """
+ Abstract base class for template-specific field mapping.
+ Each template type (football biography, person, etc.) has its own mapper.
+ """
+
+ def __init__(self, template_name: str):
+ """
+ Initialize the template mapper.
+
+ Args:
+ template_name (str): Name of the template being mapped
+ """
+ self.template_name = template_name
+ self.field_mappings = self._get_field_mappings()
+
+ @abstractmethod
+ def _get_field_mappings(self) -> Dict[str, Dict[str, Any]]:
+ """
+ Get field mappings for this template type.
+
+ Returns:
+ Dict[str, Dict[str, Any]]: Mapping configuration with format:
+ {
+ "english_field_name": {
+ "arabic_key": "الاسم",
+ "field_type": "text|number|image|link|mixed|numbered"
+ }
+ }
+ """
+ pass
+
+ def map_infobox(self, infobox_data: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Map all infobox fields using the configured field mappers.
+
+ Args:
+ infobox_data (Dict[str, Any]): Raw infobox data from parser
+
+ Returns:
+ Dict[str, Any]: Mapped data with Arabic field names
+ """
+ logger.info("Mapping infobox fields for template: {}".format(
+ self.template_name))
+
+ mapped_data = {}
+ mapped_fields = {}
+
+ # Handle numbered fields first (they need access to all data)
+ numbered_mappings = {}
+ for english_key, mapping_config in self.field_mappings.items():
+ if mapping_config["field_type"] == "numbered":
+ numbered_mappings[english_key] = mapping_config
+
+ for base_key, mapping_config in numbered_mappings.items():
+ arabic_key = mapping_config["arabic_key"]
+ item_type = mapping_config.get("item_type", "text")
+
+ # Create numbered field mapper
+ numbered_mapper = NumberedFieldMapper(base_key, arabic_key, item_type)
+
+ # Map all numbered fields for this base key
+ try:
+ mapped_field = numbered_mapper.map_numbered_fields(infobox_data)
+ mapped_fields.update(mapped_field)
+
+ logger.debug("Mapped numbered field '{}' -> '{}'".format(
+ base_key, arabic_key))
+
+ except Exception as e:
+ logger.warning("Failed to map numbered field '{}': {}".format(
+ base_key, e))
+
+ # Handle regular fields
+ for english_key, value in infobox_data.items():
+ # Skip keys already consumed by the numbered-field pass: a match
+ # requires the base key followed only by digits (e.g. "caps3"), so
+ # non-numbered prefixes like "nationalteam_update" still map below
+ is_numbered_field = any(
+ english_key.startswith(base_key)
+ and english_key[len(base_key):].isdigit()
+ for base_key in numbered_mappings)
+
+ if is_numbered_field:
+ continue
+
+ # Normalize the key
+ normalized_key = english_key.lower().replace(' ', '_').replace('-', '_')
+
+ # Check if we have a mapping for this field
+ if normalized_key in self.field_mappings:
+ mapping_config = self.field_mappings[normalized_key]
+ arabic_key = mapping_config["arabic_key"]
+ field_type = mapping_config["field_type"]
+
+ # Create appropriate field mapper
+ field_mapper = FieldMapperFactory.create_mapper(
+ english_key, arabic_key, field_type
+ )
+
+ # Map the field
+ try:
+ mapped_field = field_mapper.map_field(str(value))
+ mapped_fields.update(mapped_field)
+
+ logger.debug("Mapped field '{}' -> '{}' (type: {})".format(
+ english_key, arabic_key, field_type))
+
+ except Exception as e:
+ logger.warning("Failed to map field '{}': {}".format(
+ english_key, e))
+ # Fall back to text mapping
+ text_mapper = FieldMapperFactory.create_mapper(
+ english_key, arabic_key, "text"
+ )
+ mapped_field = text_mapper.map_field(str(value))
+ mapped_fields.update(mapped_field)
+
+ else:
+ logger.debug("No mapping found for field '{}', skipping".format(
+ english_key))
+
+ mapped_data["mapped_fields"] = mapped_fields
+ mapped_data["template_name"] = self.template_name
+ mapped_data["total_mapped_fields"] = len(mapped_fields)
+ mapped_data["original_field_count"] = len(infobox_data)
+
+ logger.info("Successfully mapped {} fields from {} original fields".format(
+ len(mapped_fields), len(infobox_data)))
+
+ return mapped_data
+
+ def get_supported_fields(self) -> List[str]:
+ """
+ Get list of supported English field names.
+
+ Returns:
+ List[str]: List of supported field names
+ """
+ return list(self.field_mappings.keys())
+
+ def get_field_info(self, english_key: str) -> Dict[str, Any]:
+ """
+ Get information about a specific field mapping.
+
+ Args:
+ english_key (str): English field name
+
+ Returns:
+ Dict[str, Any]: Field mapping information or empty dict if not found
+ """
+ normalized_key = english_key.lower().replace(' ', '_').replace('-', '_')
+ return self.field_mappings.get(normalized_key, {})
+
+
+class FootballBiographyMapper(TemplateMapper):
+ """
+ Mapper for football biography infobox templates.
+ Maps English fields to Arabic equivalents with appropriate field types.
+ Handles both regular fields and numbered sequences (years1, clubs1, etc.).
+ """
+
+ def __init__(self):
+ super().__init__("football_biography")
+
+ def _get_field_mappings(self) -> Dict[str, Dict[str, Any]]:
+ """Get field mappings for football biography template."""
+ return {
+ # Personal Information
+ "name": {"arabic_key": "اسم", "field_type": "text"},
+ "fullname": {"arabic_key": "الاسم الكامل", "field_type": "text"},
+ "full_name": {"arabic_key": "الاسم الكامل", "field_type": "text"},
+ "image": {"arabic_key": "صورة", "field_type": "image"},
+ "upright": {"arabic_key": "حجم الصورة", "field_type": "number"},
+ "caption": {"arabic_key": "تعليق الصورة", "field_type": "raw"},
+ "birth_date": {"arabic_key": "تاريخ الولادة", "field_type": "raw"},
+ "birth_place": {"arabic_key": "مكان الولادة", "field_type": "raw"},
+ "death_date": {"arabic_key": "تاريخ الوفاة", "field_type": "raw"},
+ "death_place": {"arabic_key": "مكان الوفاة", "field_type": "raw"},
+ "height": {"arabic_key": "الطول", "field_type": "number"},
+ "position": {"arabic_key": "المركز", "field_type": "raw"},
+ # Club Career
+ "clubnumber": {"arabic_key": "الرقم بالنادي", "field_type": "number"},
+ "youthclubs": {"arabic_key": "أندية_الشباب", "field_type": "numbered", "item_type": "raw"},
+ "youthyears": {"arabic_key": "سنوات_الشباب", "field_type": "numbered", "item_type": "raw"},
+ "clubs": {"arabic_key": "أندية", "field_type": "numbered", "item_type": "raw"},
+ "years": {"arabic_key": "سنوات", "field_type": "numbered", "item_type": "raw"},
+ "caps": {"arabic_key": "مباريات", "field_type": "numbered", "item_type": "number"},
+ "goals": {"arabic_key": "أهداف", "field_type": "numbered", "item_type": "number"},
+ "totalcaps": {"arabic_key": "مجموع_مباريات", "field_type": "number"},
+ "totalgoals": {"arabic_key": "إجمالي الأهداف", "field_type": "number"},
+ "club-update": {"arabic_key": "تحديث الأندية", "field_type": "raw"},
+ "pcupdate": {"arabic_key": "تحديث الأندية", "field_type": "raw"},
+ # National Team Career
+ "nationalteam": {"arabic_key": "منتخب_وطني", "field_type": "numbered", "item_type": "raw"},
+ "nationalyears": {"arabic_key": "سنوات_وطنية", "field_type": "numbered", "item_type": "raw"},
+ "nationalcaps": {"arabic_key": "مباريات_وطنية", "field_type": "numbered", "item_type": "number"},
+ "nationalgoals": {"arabic_key": "أهداف_وطنية", "field_type": "numbered", "item_type": "number"},
+ "nationalteam-update": {"arabic_key": "تحديث المنتخب", "field_type": "raw"},
+ "ntupdate": {"arabic_key": "تحديث المنتخب", "field_type": "raw"},
+ # Managerial Career
+ "managerclubs": {"arabic_key": "أندية_مدرب", "field_type": "numbered", "item_type": "raw"},
+ "manageryears": {"arabic_key": "سنوات_مدرب", "field_type": "numbered", "item_type": "raw"},
+ # Honors
+ "medaltemplates": {"arabic_key": "ميداليات", "field_type": "mixed"},
+ }
+
+
+class GenericTemplateMapper(TemplateMapper):
+ """
+ Generic mapper for templates without specific field mappings.
+ Falls back to text mapping for all fields.
+ """
+
+ def __init__(self, template_name: str):
+ self.custom_template_name = template_name
+ super().__init__(template_name)
+
+ def _get_field_mappings(self) -> Dict[str, Dict[str, Any]]:
+ """
+ Generic mapper returns empty dict - all fields will be skipped
+ unless custom mappings are provided.
+ """
+ # This could be extended to load mappings from config files
+ return {}
+
+
+class TemplateMapperFactory:
+ """
+ Factory for creating appropriate template mappers.
+ """
+
+ @staticmethod
+ def create_mapper(template_type: str) -> TemplateMapper:
+ """
+ Create appropriate template mapper based on type.
+
+ Args:
+ template_type (str): Type of template ('football_biography', etc.)
+
+ Returns:
+ TemplateMapper: Appropriate template mapper instance
+ """
+ template_type = template_type.lower()
+
+ if template_type == 'football_biography':
+ return FootballBiographyMapper()
+ elif template_type == 'person':
+ return GenericTemplateMapper("person")
+ elif template_type == 'biography':
+ return GenericTemplateMapper("biography")
+ else:
+ # For custom template names, create generic mapper
+ return GenericTemplateMapper(template_type)
+
+ @staticmethod
+ def get_supported_templates() -> List[str]:
+ """
+ Get list of supported template types.
+
+ Returns:
+ List[str]: List of supported template type strings
+ """
+ return [
+ 'football_biography',
+ 'person',
+ 'biography'
+ ]
\ No newline at end of file
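For orientation, a minimal usage sketch of the mapper factory above (module path assumed from the repo layout; only the inspection helpers shown in this excerpt are used):

```python
from map.mappers import TemplateMapperFactory  # path assumed

mapper = TemplateMapperFactory.create_mapper("football_biography")

# Numbered sequences such as caps1..capsN share one base mapping entry
print(mapper.get_field_info("caps"))
# {'arabic_key': 'مباريات', 'field_type': 'numbered', 'item_type': 'number'}

# Unknown types fall back to a GenericTemplateMapper with no mappings
print(TemplateMapperFactory.create_mapper("album").get_supported_fields())  # []
```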
diff --git a/tasks/InfoboxSync/parse/__init__.py b/tasks/InfoboxSync/parse/__init__.py
new file mode 100644
index 00000000..1746fc21
--- /dev/null
+++ b/tasks/InfoboxSync/parse/__init__.py
@@ -0,0 +1 @@
+# Parse stage package
\ No newline at end of file
diff --git a/tasks/InfoboxSync/parse/base_parser.py b/tasks/InfoboxSync/parse/base_parser.py
new file mode 100644
index 00000000..83450980
--- /dev/null
+++ b/tasks/InfoboxSync/parse/base_parser.py
@@ -0,0 +1,84 @@
+"""
+Abstract base class for infobox parsers using Strategy Pattern.
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Any
+import wikitextparser as wtp
+
+logger = logging.getLogger(__name__)
+
+
+class InfoboxParser(ABC):
+ """
+ Abstract base class for infobox parsers using Strategy Pattern.
+
+ This class defines the interface for parsing different types of
+ Wikipedia infobox templates using wikitextparser.
+ """
+
+ def __init__(self, template_name: str):
+ """
+ Initialize the parser with the target template name.
+
+ Args:
+ template_name (str): Name of the infobox template to parse
+ """
+ self.template_name = template_name.lower()
+
+ @abstractmethod
+ def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+ """
+ Parse the infobox from wikitext.
+
+ Args:
+ wikitext (str): The raw Wikipedia page content
+
+ Returns:
+ Dict[str, Any]: Extracted infobox fields
+ """
+ pass
+
+ def _find_template(self, parsed_wikitext: wtp.WikiText) -> wtp.Template:
+ """
+ Find the target template in the parsed wikitext.
+
+ Args:
+ parsed_wikitext: Parsed wikitext object
+
+ Returns:
+ wtp.Template: The found template object, or None
+ """
+ templates = parsed_wikitext.templates
+
+ for template in templates:
+ template_name = template.name.strip().lower()
+ if template_name == self.template_name:
+ return template
+
+ return None
+
+ def _extract_template_arguments(self, template: wtp.Template) -> Dict[str, str]:
+ """
+ Extract arguments from a template object.
+
+ Args:
+ template: The template object to extract from
+
+ Returns:
+ Dict[str, str]: Dictionary of template arguments
+ """
+ infobox_data = {}
+
+ for argument in template.arguments:
+ key = argument.name.strip()
+ value = argument.value.strip()
+
+            # Keep wiki markup verbatim; plain-text stripping via
+            # wtp.parse(value).plain_text() is deliberately left to later stages
+            clean_value = value
+ if key and clean_value:
+ infobox_data[key] = clean_value
+
+ return infobox_data
\ No newline at end of file
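A quick illustration of the name normalization that `_find_template` relies on: wikitextparser reports a template's name with the author's original spacing and case, so both sides are lowercased and stripped before comparison (snippet invented for illustration):

```python
import wikitextparser as wtp

text = "{{ Infobox Football Biography \n| name = Paul Abasolo\n}}"

for template in wtp.parse(text).templates:
    raw = template.name                  # keeps the author's spacing/case
    normalized = raw.strip().lower()     # what _find_template compares against
    print(repr(raw), "->", repr(normalized))
```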
diff --git a/tasks/InfoboxSync/parse/football_parser.py b/tasks/InfoboxSync/parse/football_parser.py
new file mode 100644
index 00000000..b39bc8de
--- /dev/null
+++ b/tasks/InfoboxSync/parse/football_parser.py
@@ -0,0 +1,59 @@
+"""
+Football biography infobox parser implementation.
+"""
+
+import logging
+from typing import Dict, Any
+from .base_parser import InfoboxParser
+
+logger = logging.getLogger(__name__)
+
+
+class FootballBiographyParser(InfoboxParser):
+ """
+ Parser for Infobox football biography template.
+ """
+
+ def __init__(self):
+ super().__init__("infobox football biography")
+
+ def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+ """
+ Parse football biography infobox from wikitext.
+
+ Args:
+ wikitext (str): The raw Wikipedia page content
+
+ Returns:
+ Dict[str, Any]: Extracted football biography fields
+ """
+ infobox_data = {}
+
+ try:
+ # Parse wikitext using wikitextparser
+ import wikitextparser as wtp
+ parsed = wtp.parse(wikitext)
+
+ # Find the football biography template
+ football_bio_template = self._find_template(parsed)
+
+ if football_bio_template:
+ logger.info("Found Infobox football biography template")
+
+ # Extract arguments from the template
+ infobox_data = self._extract_template_arguments(
+ football_bio_template)
+
+ count = len(infobox_data)
+ msg = "Extracted {} fields from football biography infobox"
+ logger.info(msg.format(count))
+ else:
+ msg = ("No Infobox football biography template found in the "
+ "page")
+ logger.warning(msg)
+
+ except Exception as e:
+ msg = "Error extracting football biography infobox: {}"
+ logger.error(msg.format(e))
+
+ return infobox_data
\ No newline at end of file
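A usage sketch for the parser on a trimmed, invented wikitext snippet (run from the `tasks/InfoboxSync` directory, as `test.py` is):

```python
from parse.football_parser import FootballBiographyParser

wikitext = """{{Infobox football biography
| name   = Example Player
| clubs1 = [[Example FC]]
| caps1  = 35
| goals1 = 5
}}"""

parser = FootballBiographyParser()
fields = parser.parse_infobox(wikitext)
print(fields["clubs1"], fields["caps1"], fields["goals1"])
# [[Example FC]] 35 5
```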
diff --git a/tasks/InfoboxSync/parse/parse.py b/tasks/InfoboxSync/parse/parse.py
new file mode 100644
index 00000000..d4948025
--- /dev/null
+++ b/tasks/InfoboxSync/parse/parse.py
@@ -0,0 +1,112 @@
+"""
+Parse stage for Wikipedia infobox synchronization using Strategy Pattern.
+"""
+
+import logging
+from .parser_factory import InfoboxParserFactory
+
+logger = logging.getLogger(__name__)
+
+
+def parse_data(data: dict, template_type: str = 'football_biography') -> dict:
+ """
+ Parse the fetched Wikipedia data to extract infobox information.
+
+ Args:
+ data (dict): The raw Wikipedia data with page content.
+ template_type (str): Type of template to parse ('football_biography',
+ 'person', etc.)
+
+ Returns:
+ dict: Parsed infobox data.
+ """
+ logger.info("Starting Wikipedia data parsing for template: {}".format(
+ template_type))
+
+ try:
+ page_content = data.get('content', '')
+ page_title = data.get('title', '')
+ arabic_title = data.get('arabic_title', '')
+
+ # Create parser using Strategy Pattern
+ parser = InfoboxParserFactory.create_parser(template_type)
+
+ # Parse infobox from Wikipedia content
+ infobox_data = parser.parse_infobox(page_content)
+
+ # Extract categories
+ categories = extract_categories_from_wikitext(page_content)
+
+ # Extract links (simplified - could be enhanced)
+ links = extract_links_from_wikitext(page_content)
+
+ parsed_data = {
+ 'title': page_title,
+ 'arabic_title': arabic_title,
+ 'infobox': infobox_data,
+ 'categories': categories,
+ 'links': links,
+ 'raw_content': page_content
+ }
+
+ logger.info("Successfully parsed data for title: {}".format(page_title))
+ return parsed_data
+
+ except Exception as e:
+ logger.error("Error parsing Wikipedia data: {}".format(e))
+ raise
+
+
+def extract_categories_from_wikitext(wikitext: str) -> list:
+ """
+ Extract categories from Wikipedia wikitext.
+
+ Args:
+ wikitext (str): The raw Wikipedia page content.
+
+ Returns:
+ list: List of category names.
+ """
+ import re
+ categories = []
+
+ try:
+        # Pattern to match category links; sort keys after '|' are dropped
+        category_pattern = r'\[\[Category:([^\]|]+)(?:\|[^\]]*)?\]\]'
+ matches = re.findall(category_pattern, wikitext, re.IGNORECASE)
+
+ categories = [match.strip() for match in matches]
+
+ except Exception as e:
+ logger.warning("Error extracting categories: {}".format(e))
+
+ return categories
+
+
+def extract_links_from_wikitext(wikitext: str) -> list:
+ """
+ Extract internal links from Wikipedia wikitext.
+
+ Args:
+ wikitext (str): The raw Wikipedia page content.
+
+ Returns:
+ list: List of linked page titles.
+ """
+ import re
+ links = []
+
+ try:
+ # Pattern to match internal links [[Link|Display]]
+ link_pattern = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]'
+ matches = re.findall(link_pattern, wikitext)
+
+ # Filter out special links (File:, Category:, etc.)
+ special_prefixes = ('File:', 'Category:', 'Image:', 'Template:')
+ links = [match.strip() for match in matches
+ if not match.startswith(special_prefixes)]
+
+ except Exception as e:
+ logger.warning("Error extracting links: {}".format(e))
+
+ return links
\ No newline at end of file
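The two regex helpers can be exercised in isolation; note that the link pattern's capture group `[^\]|]+` stops at the pipe, so display text is dropped (sample text invented):

```python
from parse.parse import (extract_categories_from_wikitext,
                         extract_links_from_wikitext)

text = ("He played for [[SD Eibar|Eibar]] and [[Real Unión]].\n"
        "[[File:Example.jpg|thumb]]\n"
        "[[Category:Spanish footballers]]")

print(extract_links_from_wikitext(text))
# ['SD Eibar', 'Real Unión']  -- File:/Category: links are filtered out
print(extract_categories_from_wikitext(text))
# ['Spanish footballers']
```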
diff --git a/tasks/InfoboxSync/parse/parser_factory.py b/tasks/InfoboxSync/parse/parser_factory.py
new file mode 100644
index 00000000..9344c088
--- /dev/null
+++ b/tasks/InfoboxSync/parse/parser_factory.py
@@ -0,0 +1,54 @@
+"""
+Factory class for creating infobox parsers using Factory Pattern.
+"""
+
+from .base_parser import InfoboxParser
+from .football_parser import FootballBiographyParser
+
+
+class InfoboxParserFactory:
+ """
+ Factory class to create appropriate parsers based on template type.
+ """
+
+ @staticmethod
+ def create_parser(template_type: str) -> InfoboxParser:
+ """
+ Create the appropriate parser for the given template type.
+
+ Args:
+ template_type (str): Type of template ('football_biography',
+ 'person', etc.)
+
+ Returns:
+ InfoboxParser: The appropriate parser instance
+
+        Note:
+            Unknown template types fall back to a GenericInfoboxParser
+            instead of raising.
+        """
+ if template_type.lower() == 'football_biography':
+ return FootballBiographyParser()
+ elif template_type.lower() == 'person':
+ from .generic_parser import GenericInfoboxParser
+ return GenericInfoboxParser("infobox person")
+ elif template_type.lower() == 'biography':
+ from .generic_parser import GenericInfoboxParser
+ return GenericInfoboxParser("infobox biography")
+ else:
+ # For custom template names, create generic parser
+ from .generic_parser import GenericInfoboxParser
+ return GenericInfoboxParser(template_type)
+
+ @staticmethod
+ def get_supported_types() -> list:
+ """
+ Get list of supported template types.
+
+ Returns:
+ list: List of supported template type strings
+ """
+ return [
+ 'football_biography',
+ 'person',
+ 'biography'
+ ]
\ No newline at end of file
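Dispatch is by normalized type string, and any unrecognized string is treated as a literal template name for the generic parser (sketch):

```python
from parse.parser_factory import InfoboxParserFactory

print(type(InfoboxParserFactory.create_parser("football_biography")).__name__)
# FootballBiographyParser

parser = InfoboxParserFactory.create_parser("infobox military person")
print(parser.template_name)  # 'infobox military person'
```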
diff --git a/tasks/InfoboxSync/parse/parsers.py b/tasks/InfoboxSync/parse/parsers.py
new file mode 100644
index 00000000..b899165d
--- /dev/null
+++ b/tasks/InfoboxSync/parse/parsers.py
@@ -0,0 +1,203 @@
+"""
+Infobox parsers using Strategy Pattern for different template types.
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Any
+import wikitextparser as wtp
+
+logger = logging.getLogger(__name__)
+
+
+class InfoboxParser(ABC):
+ """
+ Abstract base class for infobox parsers using Strategy Pattern.
+
+ This class defines the interface for parsing different types of
+ Wikipedia infobox templates using wikitextparser.
+ """
+
+ def __init__(self, template_name: str):
+ """
+ Initialize the parser with the target template name.
+
+ Args:
+ template_name (str): Name of the infobox template to parse
+ """
+ self.template_name = template_name.lower()
+
+ @abstractmethod
+ def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+ """
+ Parse the infobox from wikitext.
+
+ Args:
+ wikitext (str): The raw Wikipedia page content
+
+ Returns:
+ Dict[str, Any]: Extracted infobox fields
+ """
+ pass
+
+ def _find_template(self, parsed_wikitext: wtp.WikiText) -> wtp.Template:
+ """
+ Find the target template in the parsed wikitext.
+
+ Args:
+ parsed_wikitext: Parsed wikitext object
+
+ Returns:
+ wtp.Template: The found template object, or None
+ """
+ templates = parsed_wikitext.templates
+
+ for template in templates:
+ template_name = template.name.strip().lower()
+ if template_name == self.template_name:
+ return template
+
+ return None
+
+ def _extract_template_arguments(self, template: wtp.Template) -> Dict[str, str]:
+ """
+ Extract arguments from a template object.
+
+ Args:
+ template: The template object to extract from
+
+ Returns:
+ Dict[str, str]: Dictionary of template arguments
+ """
+ infobox_data = {}
+
+ for argument in template.arguments:
+ key = argument.name.strip()
+ value = argument.value.strip()
+
+ # Clean up the value by removing markup if needed
+ clean_value = wtp.parse(value).plain_text()
+
+ if key and clean_value:
+ infobox_data[key] = clean_value
+
+ return infobox_data
+
+
+class FootballBiographyParser(InfoboxParser):
+ """
+ Parser for Infobox football biography template.
+ """
+
+ def __init__(self):
+ super().__init__("infobox football biography")
+
+ def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+ """
+ Parse football biography infobox from wikitext.
+
+ Args:
+ wikitext (str): The raw Wikipedia page content
+
+ Returns:
+ Dict[str, Any]: Extracted football biography fields
+ """
+ infobox_data = {}
+
+ try:
+ # Parse wikitext using wikitextparser
+ parsed = wtp.parse(wikitext)
+
+ # Find the football biography template
+ football_bio_template = self._find_template(parsed)
+
+ if football_bio_template:
+ logger.info("Found Infobox football biography template")
+
+ # Extract arguments from the template
+ infobox_data = self._extract_template_arguments(football_bio_template)
+
+ logger.info("Extracted {} fields from football biography infobox".format(
+ len(infobox_data)))
+ else:
+ logger.warning("No Infobox football biography template "
+ "found in the page")
+
+ except Exception as e:
+ logger.error("Error extracting football biography infobox: {}".format(e))
+
+ return infobox_data
+
+
+class GenericInfoboxParser(InfoboxParser):
+ """
+ Generic parser for any infobox template type.
+ """
+
+ def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+ """
+ Parse generic infobox from wikitext.
+
+ Args:
+ wikitext (str): The raw Wikipedia page content
+
+ Returns:
+ Dict[str, Any]: Extracted infobox fields
+ """
+ infobox_data = {}
+
+ try:
+ # Parse wikitext using wikitextparser
+ parsed = wtp.parse(wikitext)
+
+ # Find the target template
+ template = self._find_template(parsed)
+
+ if template:
+ logger.info("Found {} template".format(self.template_name))
+
+ # Extract arguments from the template
+ infobox_data = self._extract_template_arguments(template)
+
+ logger.info("Extracted {} fields from {} template".format(
+ len(infobox_data), self.template_name))
+ else:
+ logger.warning("No {} template found in the page".format(
+ self.template_name))
+
+ except Exception as e:
+ logger.error("Error extracting {} infobox: {}".format(
+ self.template_name, e))
+
+ return infobox_data
+
+
+class InfoboxParserFactory:
+ """
+ Factory class to create appropriate parsers based on template type.
+ """
+
+ @staticmethod
+ def create_parser(template_type: str) -> InfoboxParser:
+ """
+ Create the appropriate parser for the given template type.
+
+ Args:
+ template_type (str): Type of template ('football_biography',
+ 'person', etc.)
+
+ Returns:
+ InfoboxParser: The appropriate parser instance
+
+        Note:
+            Unknown template types fall back to a GenericInfoboxParser
+            instead of raising.
+        """
+ if template_type.lower() == 'football_biography':
+ return FootballBiographyParser()
+ elif template_type.lower() == 'person':
+ return GenericInfoboxParser("infobox person")
+ elif template_type.lower() == 'biography':
+ return GenericInfoboxParser("infobox biography")
+ else:
+ # For custom template names, create generic parser
+ return GenericInfoboxParser(template_type)
\ No newline at end of file
diff --git a/tasks/InfoboxSync/publish/__init__.py b/tasks/InfoboxSync/publish/__init__.py
new file mode 100644
index 00000000..5761d6ad
--- /dev/null
+++ b/tasks/InfoboxSync/publish/__init__.py
@@ -0,0 +1 @@
+# Publish stage for publishing Arabic templates to Wikipedia
\ No newline at end of file
diff --git a/tasks/InfoboxSync/publish/publish.py b/tasks/InfoboxSync/publish/publish.py
new file mode 100644
index 00000000..51b9c8f1
--- /dev/null
+++ b/tasks/InfoboxSync/publish/publish.py
@@ -0,0 +1,265 @@
+"""
+Publish stage for publishing Arabic templates to Wikipedia.
+"""
+
+import logging
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PublishResult:
+ """Result of a publish operation."""
+ success: bool
+ page_title: str
+ edit_summary: str
+ revision_id: Optional[int] = None
+    errors: Optional[list] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+ def __post_init__(self):
+ if self.errors is None:
+ self.errors = []
+ if self.metadata is None:
+ self.metadata = {}
+
+
+def publish_arabic_template(translated_data: Dict[str, Any],
+ arabic_page_title: str,
+ edit_summary: str = "تحديث قالب السيرة الذاتية باستخدام InfoboxSync") -> PublishResult:
+ """
+ Publish an Arabic Wikipedia template to the specified page.
+
+ Args:
+ translated_data (Dict[str, Any]): Data from previous stages including 'arabic_template'
+ arabic_page_title (str): Title of the Arabic Wikipedia page to publish to
+ edit_summary (str): Edit summary for the Wikipedia edit
+
+ Returns:
+ PublishResult: Result of the publish operation
+ """
+ logger.info(f"Starting publish operation for page: {arabic_page_title}")
+
+ try:
+ # Check if arabic_template exists in the data
+ if 'arabic_template' not in translated_data:
+ error_msg = "No arabic_template found in translated_data"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ template_text = translated_data['arabic_template']
+ if not template_text or not template_text.strip():
+ error_msg = "Arabic template is empty or invalid"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ # Import pywikibot
+ try:
+ import pywikibot
+ except ImportError:
+ error_msg = "pywikibot is required for publishing. Install with: pip install pywikibot"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ # Initialize Arabic Wikipedia site
+ try:
+ site = pywikibot.Site('ar', 'wikipedia')
+ logger.info("Connected to Arabic Wikipedia")
+ except Exception as e:
+ error_msg = f"Failed to connect to Arabic Wikipedia: {e}"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ # Create page object
+ try:
+ page = pywikibot.Page(site, arabic_page_title)
+ logger.info(f"Created page object for: {arabic_page_title}")
+ except Exception as e:
+ error_msg = f"Failed to create page object: {e}"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ # Check if page exists
+ if not page.exists():
+ error_msg = f"Page '{arabic_page_title}' does not exist on Arabic Wikipedia"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ # Get current page content
+ try:
+ current_content = page.text
+ logger.info(f"Retrieved current page content (length: {len(current_content)})")
+ except Exception as e:
+ error_msg = f"Failed to retrieve current page content: {e}"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ # Smart template insertion/replacement using wikitextparser
+ try:
+ import wikitextparser as wtp
+
+ # Parse the current page content
+ parsed_content = wtp.parse(current_content)
+
+ # Find existing infobox templates
+ existing_infoboxes = []
+ for template in parsed_content.templates:
+ template_name = template.name.strip()
+                # Heuristic: match common Arabic and English infobox name fragments
+ if any(infobox_name in template_name.lower() for infobox_name in [
+ 'صندوق', 'infobox', 'سيرة', 'biography', 'person', 'football'
+ ]):
+ existing_infoboxes.append(template)
+
+ if existing_infoboxes:
+ # Remove existing infoboxes
+ logger.info(f"Found {len(existing_infoboxes)} existing infobox(es), removing them")
+ for infobox in existing_infoboxes:
+ # Remove the template from the parsed content
+ infobox.string = ''
+
+                # Drop whitespace-only lines left behind by the removed
+                # templates; truly empty lines are kept to preserve spacing
+                new_content = str(parsed_content)
+                new_content = '\n'.join(line for line in new_content.split('\n') if line.strip() or line == '')
+
+ # Insert new template at the beginning
+ final_content = template_text + '\n\n' + new_content.strip()
+ logger.info("Replaced existing infobox with new template")
+ else:
+ # No existing infobox, add template at the beginning
+ final_content = template_text + '\n\n' + current_content.strip()
+ logger.info("Added new template at the beginning of the page")
+
+ # Set the final content
+ page.text = final_content
+ logger.info(f"Set new page content (length: {len(final_content)})")
+
+ # Save the page
+ page.save(summary=edit_summary, minor=False)
+ revision_id = page.latest_revision_id
+
+ logger.info(f"Successfully published template to: {arabic_page_title}")
+ logger.info(f"Revision ID: {revision_id}")
+
+ return PublishResult(
+ success=True,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ revision_id=revision_id,
+ metadata={
+ 'template_length': len(template_text),
+ 'site': 'ar.wikipedia.org',
+ 'published_at': page.editTime().isoformat() if hasattr(page, 'editTime') else None
+ }
+ )
+
+ except Exception as e:
+ error_msg = f"Failed to save page: {e}"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+ except Exception as e:
+ error_msg = f"Unexpected error during publish operation: {e}"
+ logger.error(error_msg)
+ return PublishResult(
+ success=False,
+ page_title=arabic_page_title,
+ edit_summary=edit_summary,
+ errors=[error_msg]
+ )
+
+
+def publish_data(translated_data: Dict[str, Any],
+ arabic_page_title: str,
+ edit_summary: str = "تحديث قالب السيرة الذاتية باستخدام InfoboxSync") -> PublishResult:
+ """
+ Convenience function to publish translated data to Arabic Wikipedia.
+
+ Args:
+ translated_data (Dict[str, Any]): Translated data with arabic_template
+ arabic_page_title (str): Arabic page title to publish to
+ edit_summary (str): Edit summary for the edit
+
+ Returns:
+ PublishResult: Publish operation result
+ """
+ return publish_arabic_template(translated_data, arabic_page_title, edit_summary)
+
+
+def validate_publish_data(translated_data: Dict[str, Any], arabic_page_title: str) -> Dict[str, Any]:
+ """
+ Validate data before publishing.
+
+ Args:
+ translated_data (Dict[str, Any]): Data to validate
+ arabic_page_title (str): Target page title
+
+ Returns:
+ Dict with validation results
+ """
+ errors = []
+ warnings = []
+
+ # Check arabic_template
+ if 'arabic_template' not in translated_data:
+ errors.append("Missing arabic_template in translated_data")
+ elif not translated_data['arabic_template'] or not translated_data['arabic_template'].strip():
+ errors.append("arabic_template is empty")
+ elif not translated_data['arabic_template'].startswith('{{'):
+ warnings.append("Template doesn't start with '{{' - may not be a valid wiki template")
+
+ # Check arabic_page_title
+ if not arabic_page_title or not arabic_page_title.strip():
+ errors.append("Arabic page title is empty")
+ elif len(arabic_page_title) > 255:
+ errors.append("Arabic page title is too long (>255 characters)")
+
+ return {
+ 'valid': len(errors) == 0,
+ 'errors': errors,
+ 'warnings': warnings,
+ 'arabic_page_title': arabic_page_title,
+ 'has_template': 'arabic_template' in translated_data
+ }
\ No newline at end of file
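A sketch of the intended call order — validate first, publish only on a clean result (page title and template text are placeholders; a real run additionally needs configured pywikibot credentials):

```python
from publish.publish import validate_publish_data, publish_data

translated_data = {"arabic_template": "{{Example template\n| اسم = ...\n}}"}
title = "Example page"  # hypothetical target page

check = validate_publish_data(translated_data, title)
if not check["valid"]:
    print("Refusing to publish:", check["errors"])
else:
    for warning in check["warnings"]:
        print("Warning:", warning)
    result = publish_data(translated_data, title)
    print(result.success, result.revision_id)
```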
diff --git a/tasks/InfoboxSync/save/__init__.py b/tasks/InfoboxSync/save/__init__.py
new file mode 100644
index 00000000..446de7ff
--- /dev/null
+++ b/tasks/InfoboxSync/save/__init__.py
@@ -0,0 +1 @@
+# Save stage package
\ No newline at end of file
diff --git a/tasks/InfoboxSync/save/save.py b/tasks/InfoboxSync/save/save.py
new file mode 100644
index 00000000..fb2bb3ce
--- /dev/null
+++ b/tasks/InfoboxSync/save/save.py
@@ -0,0 +1,37 @@
+import logging
+import json
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def save_data(translated_data: dict, output_dir: str = 'output') -> str:
+ """
+ Save the translated data to a file.
+
+ Args:
+ translated_data (dict): The translated data from the translate stage.
+ output_dir (str): Directory to save the data (default: 'output').
+
+ Returns:
+ str: Path to the saved file.
+ """
+ logger.info(f"Starting data save to {output_dir}")
+ try:
+ # Create output directory if it doesn't exist
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Generate filename based on page title
+ title = translated_data.get('page_title', 'unknown')
+ filename = f"{title.replace(' ', '_').lower()}.json"
+ filepath = os.path.join(output_dir, filename)
+
+ # Save data as JSON
+ with open(filepath, 'w', encoding='utf-8') as f:
+ json.dump(translated_data, f, indent=2, ensure_ascii=False)
+
+ logger.info(f"Successfully saved data to: {filepath}")
+ return filepath
+ except Exception as e:
+ logger.error(f"Error saving data: {e}")
+ raise
\ No newline at end of file
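The filename is derived from `page_title` by lowercasing and replacing spaces with underscores, so output paths are predictable (values invented):

```python
from save.save import save_data

path = save_data({"page_title": "Test Player", "arabic_title": "..."})
print(path)  # output/test_player.json
```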
diff --git a/tasks/InfoboxSync/test.py b/tasks/InfoboxSync/test.py
new file mode 100644
index 00000000..a99dfd70
--- /dev/null
+++ b/tasks/InfoboxSync/test.py
@@ -0,0 +1,181 @@
+import logging
+from fetch import fetch_wikipedia_data
+from parse.parse import parse_data
+from map.map import map_data
+from translate.translate import translate_data
+from construct.build import construct_arabic_template
+from publish.publish import publish_data
+from save.save import save_data
+from wikilocalize.integrator import process_construct_to_publish
+
+# Configure logging
+logging.basicConfig(
+ level=logging.DEBUG,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+
+logger = logging.getLogger(__name__)
+
+
+def run_wikipedia_pipeline(ar_page_title: str, target_lang: str = 'ar',
+ output_dir: str = 'output',
+ template_type: str = 'football_biography') -> str:
+ """
+ Run the complete Wikipedia infobox sync pipeline.
+
+ Args:
+ ar_page_title (str): Arabic Wikipedia page title to sync.
+ target_lang (str): Target language for translation (default: 'ar').
+ output_dir (str): Directory to save the processed data.
+ template_type (str): Type of template to parse and map.
+
+ Returns:
+ str: Path to the saved file.
+ """
+ msg = f"Starting Wikipedia InfoboxSync pipeline for: {ar_page_title}"
+ logger.info(msg)
+
+ try:
+ # Stage 1: Fetch Wikipedia data
+ logger.info("Pipeline stage: Fetch Wikipedia data")
+ wiki_data = fetch_wikipedia_data(ar_page_title)
+
+ if not wiki_data['sync_possible']:
+ error_msg = wiki_data.get('error', 'Unknown error occurred')
+ logger.error(f"Cannot proceed with pipeline: {error_msg}")
+ raise ValueError(error_msg)
+
+ # Extract English page content for processing
+ en_page_info = wiki_data['english']
+ if not en_page_info or not en_page_info.content:
+ msg = "No English page content available for processing"
+ raise ValueError(msg)
+
+ # Convert page info to dictionary format expected by parse stage
+ raw_data = {
+ 'title': en_page_info.title,
+ 'content': en_page_info.content,
+ 'arabic_title': wiki_data['arabic'].title,
+ 'langlinks': en_page_info.langlinks or {}
+ }
+
+ # Stage 2: Parse
+ logger.info("Pipeline stage: Parse")
+ parsed_data = parse_data(raw_data, template_type)
+
+ # Stage 3: Map
+ logger.info("Pipeline stage: Map")
+ mapped_data = map_data(parsed_data, template_type)
+
+ # Stage 4: Translate
+ logger.info("Pipeline stage: Translate")
+ translated_data = translate_data(mapped_data, target_lang)
+
+ # Stage 5: Build Arabic Template
+ logger.info("Pipeline stage: Construct Arabic Template")
+ build_result = construct_arabic_template(translated_data, template_type)
+
+ if not build_result.success:
+ error_msg = f"Template construction failed: {build_result.errors}"
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ # Add the constructed template to the translated data for saving
+ translated_data['arabic_template'] = build_result.template_text
+ translated_data['construct_metadata'] = {
+ 'template_type': build_result.template_type,
+ 'field_count': build_result.field_count,
+ 'builder_name': build_result.metadata.get('builder_name', 'unknown'),
+ 'template_name': build_result.metadata.get('template_name', 'unknown')
+ }
+ # Stage 6: Wiki Localization - Localize links and templates to Arabic equivalents
+ logger.info("Pipeline stage: Wiki Localization")
+ localization_result = process_construct_to_publish(
+ translated_data, # Contains arabic_template from previous step
+ enable_local_link_replacement=True,
+ enable_template_localization=True
+ )
+
+ if not localization_result.success:
+ error_msg = f"Wiki localization failed: {localization_result.errors}"
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ # Use the localized data for publishing
+ translated_data = localization_result.localized_data
+
+ # Add localization metadata to the translated data
+ translated_data['localization_metadata'] = {
+ 'links_replaced': localization_result.localization_info.original_links_replaced,
+ 'templates_localized': localization_result.localization_info.templates_localized,
+ 'waou_templates_inserted': localization_result.localization_info.waou_templates_inserted,
+ 'localization_errors': localization_result.localization_info.errors
+ }
+
+        # Stage 7: Publish to Arabic Wikipedia
+ logger.info("Pipeline stage: Publish to Arabic Wikipedia")
+ arabic_page_title = wiki_data['arabic'].title
+ edit_summary = f"تحديث قالب السيرة الذاتية باستخدام InfoboxSync - {template_type}"
+
+ publish_result = publish_data(translated_data, arabic_page_title, edit_summary)
+
+ if not publish_result.success:
+ error_msg = f"Publishing failed: {publish_result.errors}"
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ # Add publish metadata to the translated data
+ translated_data['publish_metadata'] = {
+ 'page_title': publish_result.page_title,
+ 'edit_summary': publish_result.edit_summary,
+ 'revision_id': publish_result.revision_id,
+ 'publish_success': publish_result.success,
+ 'published_at': publish_result.metadata.get('published_at')
+ }
+
+        # Stage 8: Save
+ logger.info("Pipeline stage: Save")
+ saved_path = save_data(translated_data, output_dir)
+
+ msg = f"Data saved to: {saved_path}"
+ logger.info(f"Pipeline completed successfully. {msg}")
+ return saved_path
+
+ except Exception as e:
+ logger.error(f"Pipeline failed: {e}")
+ raise
+
+
+def run_pipeline(url: str, target_lang: str = 'ar', output_dir: str = 'output') -> str:
+ """
+ Legacy function for backward compatibility.
+ Now extracts page title from Wikipedia URL and calls new pipeline.
+ """
+ msg = ("run_pipeline(url) is deprecated. Use "
+ "run_wikipedia_pipeline(page_title) instead.")
+ logger.warning(msg)
+
+    if 'wikipedia.org' in url and '/wiki/' in url:
+        from urllib.parse import unquote
+        # Decode percent-encoded titles before restoring spaces
+        page_title = unquote(url.split('/wiki/')[-1]).replace('_', ' ')
+ return run_wikipedia_pipeline(page_title, target_lang, output_dir)
+ else:
+ msg = ("URL must be a Wikipedia page URL "
+ "(e.g., https://en.wikipedia.org/wiki/Page_Title)")
+ raise ValueError(msg)
+
+
+if __name__ == "__main__":
+ # Example usage with Arabic page title
+ example_arabic_page = "خير الدين مضوي" # Football player in Arabic
+ try:
+ result_path = run_wikipedia_pipeline(example_arabic_page, target_lang='ar')
+ print(f"Pipeline result saved to: {result_path}")
+ except Exception as e:
+ print(f"Pipeline execution failed: {e}")
+
+ # Alternative: Example with English page title (for testing)
+ # example_english_page = "Egypt"
+ # try:
+ # result_path = run_wikipedia_pipeline(example_english_page)
+ # print(f"Pipeline result saved to: {result_path}")
+ # except Exception as e:
+ # print(f"Pipeline execution failed: {e}")
\ No newline at end of file
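For reference, the `/wiki/` title extraction used by the legacy wrapper behaves like this (percent-encoded titles are decoded first):

```python
from urllib.parse import unquote

url = "https://en.wikipedia.org/wiki/Example_Page"
page_title = unquote(url.split("/wiki/")[-1]).replace("_", " ")
print(page_title)  # 'Example Page'
```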
diff --git a/tasks/InfoboxSync/translate/README.md b/tasks/InfoboxSync/translate/README.md
new file mode 100644
index 00000000..975146c5
--- /dev/null
+++ b/tasks/InfoboxSync/translate/README.md
@@ -0,0 +1,360 @@
+# Translation Stage - LiteLLM + Google Gemini AI
+
+This directory contains the translation stage implementation for the InfoboxSync pipeline, featuring AI-powered translation using LiteLLM and Google Gemini AI.
+
+## Overview
+
+The translation stage translates English Wikipedia infobox data to Arabic using advanced AI models. It follows the Strategy Pattern for extensibility and includes comprehensive error handling and fallback mechanisms.
+
+## Architecture
+
+### Core Components
+
+1. **`base_translator.py`** - Abstract base classes and factory pattern
+2. **`gemini_translator.py`** - Google Gemini AI implementation
+3. **`config.py`** - Configuration management for API keys and settings
+4. **`translate.py`** - Main translation interface and pipeline integration
+
+### Design Patterns Used
+
+- **Strategy Pattern**: Different translation services (Gemini, future services)
+- **Factory Pattern**: Creation of appropriate translation services
+- **Template Method**: Consistent translation workflow across services
+
+## Features
+
+### AI-Powered Translation
+- Uses Google Gemini AI via LiteLLM for high-quality translations
+- Supports both template-level and field-by-field translation
+- Intelligent handling of different field types (text, numbers, links, images)
+
+### Smart Field Handling
+- **Text Fields**: Translated naturally while preserving meaning
+- **Number Fields**: Kept in original form (heights, statistics, etc.)
+- **Link Fields**: Preserved as-is with proper formatting
+- **Image Fields**: Maintained without translation
+- **Numbered Fields**: Translated individually while maintaining sequence
+
+### Error Handling & Fallbacks
+- Graceful degradation when API is unavailable
+- Automatic fallback to field-by-field translation
+- Comprehensive error logging and metadata
+- Service availability checking
+
+### Configuration Management
+- Environment variable support for API keys
+- Flexible configuration system
+- Support for multiple API key sources
+
+## Installation
+
+1. Install LiteLLM:
+```bash
+pip install litellm
+```
+
+2. Set up your Google AI API key:
+```bash
+export GEMINI_API_KEY="your-google-ai-api-key-here"
+# OR
+export GOOGLE_AI_API_KEY="your-google-ai-api-key-here"
+```
+
+## Usage
+
+### Basic Usage
+
+```python
+from translate.translate import translate_data
+
+# Your mapped data from the map stage
+mapped_data = {
+ 'page_title': 'Player Name',
+ 'arabic_fields': {
+ 'الاسم': {'value': 'John Doe', 'type': 'text'},
+ 'الطول': {'value': '1.80 m', 'type': 'number'},
+ # ... more fields
+ }
+}
+
+# Translate to Arabic (default)
+result = translate_data(mapped_data)
+
+if result['translation_metadata']['success']:
+ translated_fields = result['translated_fields']
+ print(f"Translated {result['translation_metadata']['translated_fields']} fields")
+else:
+ print(f"Translation failed: {result['translation_metadata']['error']}")
+```
+
+### Advanced Usage
+
+```python
+# Specify translation service
+result = translate_data(mapped_data, service_name='gemini', target_lang='ar')
+
+# Use field-by-field translation (alternative method)
+from translate.translate import translate_field_by_field
+result = translate_field_by_field(mapped_data, target_lang='ar')
+```
+
+### Service Management
+
+```python
+from translate.translate import get_available_translation_services, test_translation_service
+
+# List available services
+services = get_available_translation_services()
+print(f"Available: {services}")
+
+# Test if a service is working
+is_working = test_translation_service('gemini')
+print(f"Gemini available: {is_working}")
+```
+
+## Configuration
+
+### Environment Variables
+
+- `GEMINI_API_KEY` - Google AI API key (preferred)
+- `GOOGLE_AI_API_KEY` - Alternative Google AI API key
+- `TRANSLATION_DEFAULT_SERVICE` - Default translation service ('gemini')
+- `TRANSLATION_ENABLE_CACHING` - Enable/disable caching ('true'/'false')
+- `TRANSLATION_CACHE_MAX_SIZE` - Maximum cache size (default: 1000)
+
+### Configuration File
+
+You can also use a JSON configuration file:
+
+```json
+{
+ "gemini": {
+ "model": "gemini/gemini-1.5-flash",
+ "temperature": 0.3,
+ "api_key": "your-api-key-here"
+ },
+ "default_service": "gemini"
+}
+```
+
+```python
+from translate.config import setup_translation_config
+config = setup_translation_config('/path/to/config.json')
+```
+
+## Data Flow
+
+### Input Data Structure
+```python
+{
+ 'page_title': 'English Title',
+ 'arabic_fields': {
+ 'arabic_field_name': {
+ 'value': 'English value',
+ 'type': 'text|number|link|image|numbered',
+ 'validation': {...}
+ }
+ },
+ 'arabic_title': 'Arabic Title'
+}
+```
+
+### Output Data Structure
+```python
+{
+ 'page_title': 'English Title',
+ 'arabic_fields': {...}, # Original fields
+ 'translated_fields': {
+ 'arabic_field_name': {
+ 'value': 'English value',
+ 'translated_value': 'Arabic translation',
+ 'translation_confidence': 0.9,
+ 'type': 'text'
+ }
+ },
+ 'translation_metadata': {
+ 'service': 'Google Gemini AI',
+ 'target_language': 'ar',
+ 'translation_method': 'template_translation',
+ 'total_fields': 10,
+ 'translated_fields': 8,
+ 'success': True
+ },
+ 'translated_title': 'Arabic Title'
+}
+```
+
+## Translation Methods
+
+### 1. Template Translation (Default)
+- Sends entire infobox as context to AI
+- Maintains relationships between fields
+- More accurate for complex templates
+- Better handling of numbered sequences
+
+### 2. Field-by-Field Translation
+- Translates each field individually
+- Faster for simple cases
+- Easier to debug
+- Good fallback when template translation fails
+
+## Prompt Engineering
+
+The Gemini translator uses carefully crafted prompts. The example below shows
+a JSON-output variant; the default template shipped in `gemini_translator.py`
+uses `[index]`-marker lines instead (see its `prompt_template.txt` fallback):
+
+### Infobox Translation Prompt
+```python
+prompt = f"""You are a professional translator specializing in Wikipedia infobox content.
+
+Please translate the following infobox data from English to Arabic. The data contains field names in Arabic and their corresponding values in English.
+
+INSTRUCTION:
+- Translate ONLY the VALUES (not the Arabic field names)
+- Maintain the exact structure and format
+- For numbered fields (arrays), translate each item individually
+- Keep technical terms, proper names, and numbers in their original form when appropriate
+- Ensure the translation is natural and appropriate for Wikipedia content
+
+FIELDS TO TRANSLATE:
+{fields_text}
+
+Please provide the translated infobox in the following JSON format:
+{{
+ "translated_fields": {{
+ "field_name_1": "translated_value_1",
+ "field_name_2": "translated_value_2",
+ ...
+ }},
+ "translation_metadata": {{
+ "total_fields": number,
+ "translated_fields": number,
+ "skipped_fields": number
+ }}
+}}
+
+IMPORTANT: Only output valid JSON, no additional text or explanations."""
+```
+
+## Error Handling
+
+### Common Error Scenarios
+
+1. **Missing API Key**
+ - Returns error metadata
+ - Logs warning message
+ - Doesn't crash the pipeline
+
+2. **API Rate Limiting**
+ - Automatic retry with exponential backoff
+ - Graceful degradation to field-by-field translation
+
+3. **Invalid JSON Response**
+ - Fallback to field-by-field translation
+ - Logs parsing errors for debugging
+
+4. **Network Issues**
+ - Timeout handling
+ - Retry mechanisms
+ - Error metadata for pipeline continuation
+
+### Fallback Strategy
+
+1. **Primary**: Template-level translation with Gemini
+2. **Fallback 1**: Field-by-field translation with Gemini
+3. **Fallback 2**: Return original data with error metadata
+
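+A minimal sketch of this cascade (function shape and return keys are
+illustrative; the real wiring lives in `translate.py`):
+
+```python
+def translate_with_fallback(service, mapped_fields):
+    """Template-level translation first, then per-field, then fail soft."""
+    try:
+        return service.translate_infobox(mapped_fields)
+    except Exception:
+        pass  # fall through to field-by-field
+    try:
+        translated = {}
+        for name, data in mapped_fields.items():
+            result = service.translate_field(name, data.get('value'))
+            translated[name] = {**data, 'translated_value': result.translated_text}
+        return {'translated_infobox': translated}
+    except Exception as e:
+        return {'translated_infobox': mapped_fields,
+                'translation_metadata': {'success': False, 'error': str(e)}}
+```
+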
+## Testing
+
+Run the test script to verify functionality:
+
+```bash
+python test_translation.py
+```
+
+The test script demonstrates:
+- Service availability checking
+- Error handling without API keys
+- Full translation workflow with API keys
+- Field-by-field translation comparison
+
+## Performance Considerations
+
+### Caching
+- Translation results can be cached to reduce API calls
+- Configurable cache size and TTL
+- Cache keys based on field content
+
+### Optimization
+- Batch translation for multiple fields
+- Intelligent field type detection
+- Minimal API calls for unchanged content
+
+## Future Enhancements
+
+### Additional Services
+- OpenAI GPT models
+- Microsoft Translator
+- DeepL Pro
+- Custom fine-tuned models
+
+### Advanced Features
+- Translation memory for repeated phrases
+- Glossary support for domain-specific terms
+- Quality scoring and confidence metrics
+- Multi-language support
+
+### Integration Improvements
+- Async translation for better performance
+- Streaming responses for large infoboxes
+- Cost optimization and usage tracking
+
+## Troubleshooting
+
+### Common Issues
+
+1. **"litellm not installed"**
+ ```bash
+ pip install litellm
+ ```
+
+2. **"No API key provided"**
+ ```bash
+ export GEMINI_API_KEY="your-key-here"
+ ```
+
+3. **"Translation service not available"**
+ - Check API key validity
+ - Verify network connectivity
+ - Check API quota/limits
+
+4. **JSON parsing errors**
+ - Usually indicates AI response format issues
+ - Automatically falls back to field-by-field translation
+ - Check logs for response content
+
+### Debug Mode
+
+Enable detailed logging:
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+```
+
+## Contributing
+
+To add new translation services:
+
+1. Create new translator class inheriting from `TranslationService`
+2. Implement required abstract methods
+3. Register service in factory: `TranslationServiceFactory.register_service(name, class)`
+4. Add service configuration in `config.py`
+
+Example:
+```python
+class CustomTranslator(TranslationService):
+ def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+ # Your implementation
+ pass
+
+# Register
+TranslationServiceFactory.register_service("custom", CustomTranslator)
+```
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/__init__.py b/tasks/InfoboxSync/translate/__init__.py
new file mode 100644
index 00000000..47c3df3a
--- /dev/null
+++ b/tasks/InfoboxSync/translate/__init__.py
@@ -0,0 +1,25 @@
+# Translate stage package
+
+# Import base classes and factory
+from .base_translator import TranslationService, TranslationServiceFactory, TranslationResult
+
+# Import configuration
+from .config import get_translation_config, setup_translation_config
+
+# Import translation services (this ensures they are registered)
+from . import gemini_translator
+
+# Import main translation function
+from .translate import translate_data, translate_field_by_field, get_available_translation_services, test_translation_service
+
+__all__ = [
+ 'TranslationService',
+ 'TranslationServiceFactory',
+ 'TranslationResult',
+ 'get_translation_config',
+ 'setup_translation_config',
+ 'translate_data',
+ 'translate_field_by_field',
+ 'get_available_translation_services',
+ 'test_translation_service'
+]
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/base_translator.py b/tasks/InfoboxSync/translate/base_translator.py
new file mode 100644
index 00000000..ba4ad115
--- /dev/null
+++ b/tasks/InfoboxSync/translate/base_translator.py
@@ -0,0 +1,126 @@
+"""
+Base translation service interface following Strategy Pattern.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, List, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class TranslationResult:
+ """Result of a translation operation."""
+
+ def __init__(self,
+ translated_text: str,
+ original_text: str,
+ confidence: float = 1.0,
+ metadata: Optional[Dict[str, Any]] = None):
+ self.translated_text = translated_text
+ self.original_text = original_text
+ self.confidence = confidence
+ self.metadata = metadata or {}
+
+
+class TranslationService(ABC):
+ """Abstract base class for translation services."""
+
+ def __init__(self, source_lang: str = 'en', target_lang: str = 'ar'):
+ self.source_lang = source_lang
+ self.target_lang = target_lang
+
+ @abstractmethod
+ def translate_text(self, text: str, **kwargs) -> TranslationResult:
+ """
+ Translate a single text string.
+
+ Args:
+ text (str): Text to translate
+ **kwargs: Additional parameters for translation
+
+ Returns:
+ TranslationResult: Translation result
+ """
+ pass
+
+ @abstractmethod
+ def translate_field(self, field_name: str, field_value: Any, **kwargs) -> TranslationResult:
+ """
+ Translate a field name and value pair.
+
+ Args:
+ field_name (str): Name of the field
+ field_value (Any): Value of the field
+ **kwargs: Additional parameters
+
+ Returns:
+ TranslationResult: Translation result
+ """
+ pass
+
+ @abstractmethod
+ def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+ """
+ Translate an entire infobox template.
+
+ Args:
+ infobox_data (Dict[str, Any]): Infobox data with Arabic field names
+ **kwargs: Additional parameters
+
+ Returns:
+ Dict[str, Any]: Translated infobox data
+ """
+ pass
+
+ @abstractmethod
+ def is_available(self) -> bool:
+ """Check if the translation service is available and properly configured."""
+ pass
+
+ @abstractmethod
+ def get_service_name(self) -> str:
+ """Get the name of this translation service."""
+ pass
+
+
+class TranslationServiceFactory:
+ """Factory for creating translation services."""
+
+ _services = {}
+
+ @classmethod
+ def register_service(cls, service_name: str, service_class):
+ """Register a new translation service."""
+ cls._services[service_name] = service_class
+
+ @classmethod
+ def create_service(cls, service_name: str, **kwargs) -> TranslationService:
+ """
+ Create a translation service instance.
+
+ Args:
+ service_name (str): Name of the service to create
+ **kwargs: Parameters for service initialization
+
+ Returns:
+ TranslationService: Service instance
+
+ Raises:
+ ValueError: If service is not registered or creation fails
+ """
+ if service_name not in cls._services:
+ available_services = list(cls._services.keys())
+ raise ValueError(f"Unknown translation service: {service_name}. "
+ f"Available services: {available_services}")
+
+ service_class = cls._services[service_name]
+ try:
+ return service_class(**kwargs)
+ except Exception as e:
+ raise ValueError(f"Failed to create {service_name} service: {e}")
+
+ @classmethod
+ def get_available_services(cls) -> List[str]:
+ """Get list of available translation services."""
+ return list(cls._services.keys())
\ No newline at end of file
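A compact sketch of plugging a new backend into this factory — an echo "translator" stands in for a real API client:

```python
from translate.base_translator import (TranslationService, TranslationResult,
                                       TranslationServiceFactory)

class EchoTranslator(TranslationService):
    """Toy service returning input unchanged; handy for offline tests."""

    def translate_text(self, text, **kwargs):
        return TranslationResult(text, text, confidence=0.0)

    def translate_field(self, field_name, field_value, **kwargs):
        return self.translate_text(str(field_value))

    def translate_infobox(self, infobox_data, **kwargs):
        return {'translated_infobox': infobox_data}

    def is_available(self):
        return True

    def get_service_name(self):
        return "echo"

TranslationServiceFactory.register_service("echo", EchoTranslator)
service = TranslationServiceFactory.create_service("echo")
print(service.translate_text("hello").translated_text)  # 'hello'
```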
diff --git a/tasks/InfoboxSync/translate/config.py b/tasks/InfoboxSync/translate/config.py
new file mode 100644
index 00000000..8c402337
--- /dev/null
+++ b/tasks/InfoboxSync/translate/config.py
@@ -0,0 +1,120 @@
+"""
+Configuration for translation services.
+"""
+
+import copy
+import os
+import logging
+from typing import Optional, Dict, Any
+
+logger = logging.getLogger(__name__)
+
+
+class TranslationConfig:
+ """Configuration manager for translation services."""
+
+ # Default configuration
+ DEFAULT_CONFIG = {
+ 'gemini': {
+ 'model': 'gemini/gemini-2.0-flash',
+ 'temperature': 0.3,
+ # 'max_tokens': 2000,
+ 'api_key_env_vars': ['GEMINI_API_KEY', 'GOOGLE_AI_API_KEY']
+ },
+ 'default_service': 'gemini',
+ 'fallback_service': None,
+ 'enable_caching': True,
+ 'cache_max_size': 1000,
+ 'request_timeout': 30,
+ 'retry_attempts': 3,
+ 'retry_delay': 1.0
+ }
+
+ def __init__(self, config_file: Optional[str] = None):
+ """
+ Initialize configuration.
+
+ Args:
+ config_file (Optional[str]): Path to configuration file
+ """
+        # Deep copy so per-instance overrides never mutate the shared DEFAULT_CONFIG
+        self.config = copy.deepcopy(self.DEFAULT_CONFIG)
+ self._load_from_env()
+ if config_file and os.path.exists(config_file):
+ self._load_from_file(config_file)
+
+ def _load_from_env(self):
+ """Load configuration from environment variables."""
+ # API Keys
+ for service, service_config in self.config.items():
+ if isinstance(service_config, dict) and 'api_key_env_vars' in service_config:
+ for env_var in service_config['api_key_env_vars']:
+ api_key = os.getenv(env_var)
+ if api_key:
+ self.config[service]['api_key'] = api_key
+ logger.info(f"Loaded API key for {service} from {env_var}")
+ break
+
+ # Other environment variables
+ if os.getenv('TRANSLATION_DEFAULT_SERVICE'):
+ self.config['default_service'] = os.getenv('TRANSLATION_DEFAULT_SERVICE')
+
+        if (os.getenv('TRANSLATION_ENABLE_CACHING') or '').lower() == 'false':
+ self.config['enable_caching'] = False
+
+ if os.getenv('TRANSLATION_CACHE_MAX_SIZE'):
+ try:
+ self.config['cache_max_size'] = int(os.getenv('TRANSLATION_CACHE_MAX_SIZE'))
+ except ValueError:
+ pass
+
+ def _load_from_file(self, config_file: str):
+ """Load configuration from file."""
+ try:
+ import json
+ with open(config_file, 'r', encoding='utf-8') as f:
+ file_config = json.load(f)
+ self._merge_config(file_config)
+ logger.info(f"Loaded configuration from {config_file}")
+ except Exception as e:
+ logger.warning(f"Failed to load configuration from {config_file}: {e}")
+
+ def _merge_config(self, new_config: Dict[str, Any]):
+ """Merge new configuration with existing."""
+ for key, value in new_config.items():
+ if isinstance(value, dict) and key in self.config:
+ self.config[key].update(value)
+ else:
+ self.config[key] = value
+
+ def get_service_config(self, service_name: str) -> Dict[str, Any]:
+ """Get configuration for a specific service."""
+ return self.config.get(service_name, {})
+
+ def get_default_service(self) -> str:
+ """Get default translation service."""
+ return self.config['default_service']
+
+ def has_api_key(self, service_name: str) -> bool:
+ """Check if API key is available for service."""
+ service_config = self.get_service_config(service_name)
+        return bool(service_config.get('api_key'))
+
+ def get_api_key(self, service_name: str) -> Optional[str]:
+ """Get API key for service."""
+ service_config = self.get_service_config(service_name)
+ return service_config.get('api_key')
+
+
+# Global configuration instance
+translation_config = TranslationConfig()
+
+
+def get_translation_config() -> TranslationConfig:
+ """Get global translation configuration."""
+ return translation_config
+
+
+def setup_translation_config(config_file: Optional[str] = None) -> TranslationConfig:
+ """Setup translation configuration."""
+ global translation_config
+ translation_config = TranslationConfig(config_file)
+ return translation_config
\ No newline at end of file
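How the environment-driven lookup behaves end to end (key value is a placeholder):

```python
import os
from translate.config import TranslationConfig

os.environ["GEMINI_API_KEY"] = "placeholder-key"  # hypothetical key

config = TranslationConfig()
print(config.get_default_service())   # 'gemini'
print(config.has_api_key("gemini"))   # True
print(config.get_api_key("gemini"))   # 'placeholder-key'
```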
diff --git a/tasks/InfoboxSync/translate/gemini_translator.py b/tasks/InfoboxSync/translate/gemini_translator.py
new file mode 100644
index 00000000..ff23f4ea
--- /dev/null
+++ b/tasks/InfoboxSync/translate/gemini_translator.py
@@ -0,0 +1,332 @@
+"""
+Google Gemini AI translation service using LiteLLM.
+"""
+
+import os
+import json
+import logging
+from typing import Dict, Any, List, Optional
+from .base_translator import TranslationService, TranslationResult, TranslationServiceFactory
+
+logger = logging.getLogger(__name__)
+
+
+class GeminiTranslator(TranslationService):
+ """Google Gemini AI translation service using LiteLLM."""
+
+ def __init__(self,
+ api_key: Optional[str] = None,
+ model: str = "gemini/gemini-2.0-flash",
+ source_lang: str = 'en',
+ target_lang: str = 'ar',
+ temperature: float = 0.3,
+ max_tokens: int = 5000):
+ """
+ Initialize Gemini translator.
+
+ Args:
+ api_key (Optional[str]): Google AI API key. If None, uses GEMINI_API_KEY env var
+ model (str): Gemini model to use
+ source_lang (str): Source language code
+ target_lang (str): Target language code
+ temperature (float): Sampling temperature
+ max_tokens (int): Maximum tokens in response
+ """
+ super().__init__(source_lang, target_lang)
+ self.api_key = api_key or os.getenv('GEMINI_API_KEY') or os.getenv('GOOGLE_AI_API_KEY')
+ self.model = model
+ self.temperature = temperature
+ self.max_tokens = max_tokens
+
+ if not self.api_key:
+ logger.warning("No API key provided for Gemini translator")
+
+ # Import litellm here to avoid import errors if not installed
+ try:
+ import litellm
+ self.litellm = litellm
+ except ImportError:
+ logger.error("litellm not installed. Install with: pip install litellm")
+ raise ImportError("litellm is required for GeminiTranslator")
+
+ def _load_prompt_template(self) -> str:
+ """Load the prompt template from file."""
+ template_path = os.path.join(os.path.dirname(__file__), 'prompt_template.txt')
+ try:
+ with open(template_path, 'r', encoding='utf-8') as f:
+ return f.read()
+ except FileNotFoundError:
+ logger.warning(f"Prompt template not found at {template_path}, using default template")
+ return self._get_default_prompt_template()
+ except Exception as e:
+ logger.warning(f"Error loading prompt template: {e}, using default template")
+ return self._get_default_prompt_template()
+
+ def _get_default_prompt_template(self) -> str:
+ """Get default prompt template if file is not available."""
+ return """You are a professional translator specializing in Wikipedia infobox content.
+
+Translate ALL the following field values from English to Arabic in ONE SINGLE REQUEST. Each field is marked with [index] for identification.
+
+INSTRUCTION:
+- Translate EVERY field value to Arabic
+- Keep the [index] markers in your response
+- Translate naturally while maintaining meaning
+- Keep technical terms, proper names, and numbers in original form when appropriate
+- For numbered field items, translate each one individually
+- Output in the SAME format with [index] markers
+
+FIELDS TO TRANSLATE:
+{{FIELDS_TEXT}}
+
+RESPONSE FORMAT:
+[{{START_INDEX}}]: translated_value_1
+[{{START_INDEX+1}}]: translated_value_2
+[{{START_INDEX+2}}]: translated_value_3
+...continue for all fields...
+
+IMPORTANT: Respond with ALL translated fields using the SAME [index] markers."""
+
+ def _build_prompt_from_template(self, template: str, fields_text: str, start_index: int = 0) -> str:
+ """Build prompt by replacing placeholders in template."""
+ # Replace placeholders
+ prompt = template.replace('{{FIELDS_TEXT}}', fields_text)
+ prompt = prompt.replace('{{START_INDEX}}', str(start_index))
+ prompt = prompt.replace('{{START_INDEX+1}}', str(start_index + 1))
+ prompt = prompt.replace('{{START_INDEX+2}}', str(start_index + 2))
+
+ return prompt
+
+ def _get_infobox_translation_prompt(self, infobox_data: Dict[str, Any]) -> tuple[str, dict]:
+ """Generate prompt for single-request infobox translation and return field mapping."""
+ # Extract field information and prepare for single translation request
+ fields_list = []
+ field_mapping = {} # Map field index to arabic key
+
+ for idx, (arabic_key, field_data) in enumerate(infobox_data.items()):
+ if isinstance(field_data, dict) and 'value' in field_data:
+ value = field_data['value']
+ field_type = field_data.get('type', 'text')
+
+ # Handle different field types
+ if field_type == 'numbered' and isinstance(value, list):
+ # For numbered fields, prepare each item for translation
+ for i, item in enumerate(value):
+ fields_list.append(f"[{idx}_{i}]: {item}")
+ field_mapping[f"{idx}_{i}"] = (arabic_key, i)
+ elif field_type in ['number', 'link', 'image']:
+ # Skip translation for these field types, but keep mapping for reference
+ field_mapping[str(idx)] = (arabic_key, None)
+ else:
+ fields_list.append(f"[{idx}]: {value}")
+ field_mapping[str(idx)] = (arabic_key, None)
+
+ fields_text = '\n'.join(fields_list)
+
+ # Load template and build prompt
+ template = self._load_prompt_template()
+ prompt = self._build_prompt_from_template(template, fields_text, start_index=0)
+
+ return prompt, field_mapping
+
+ def _parse_single_request_response(self, response_text: str, field_mapping: dict) -> Dict[str, Any]:
+ """Parse the single-request translation response and map back to fields."""
+ translated_fields = {}
+
+ # Parse response line by line
+ lines = response_text.strip().split('\n')
+
+ for line in lines:
+ line = line.strip()
+ if not line:
+ continue
+
+ # Look for [index]: translated_value pattern
+ if line.startswith('[') and ']:' in line:
+ try:
+ index_end = line.find(']:')
+ index = line[1:index_end].strip()
+ translated_value = line[index_end + 2:].strip()
+
+ if index in field_mapping:
+ arabic_key, item_index = field_mapping[index]
+
+ if arabic_key not in translated_fields:
+ translated_fields[arabic_key] = {}
+
+ if item_index is not None:
+ # This is part of a numbered field
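+ # NOTE: items are appended in the order the model returns them; an out-of-order
+ # reply (e.g., [3_1] before [3_0]) would scramble the reconstructed list.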
+ if 'value' not in translated_fields[arabic_key]:
+ translated_fields[arabic_key]['value'] = []
+ translated_fields[arabic_key]['value'].append(translated_value)
+ else:
+ # This is a single field
+ translated_fields[arabic_key]['value'] = translated_value
+
+ except (ValueError, IndexError) as e:
+ logger.warning(f"Failed to parse response line: {line} - {e}")
+ continue
+
+ return translated_fields
+
+ def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+ """Translate an entire infobox template in ONE SINGLE REQUEST."""
+ try:
+ logger.info(f"Starting single-request infobox translation with {len(infobox_data)} fields")
+
+ # Generate single-request prompt and field mapping
+ prompt, field_mapping = self._get_infobox_translation_prompt(infobox_data)
+
+ # Make single API call for all fields
+ response_text = self._call_gemini(prompt)
+
+ # Parse the single response
+ translated_fields = self._parse_single_request_response(response_text, field_mapping)
+
+ # Merge translated fields back into original structure
+ translated_infobox = {}
+ for arabic_key, field_data in infobox_data.items():
+ if arabic_key in translated_fields:
+ # Create new field data with translated value
+ new_field_data = field_data.copy()
+ new_field_data['translated_value'] = translated_fields[arabic_key]['value']
+ new_field_data['translation_confidence'] = 0.9
+ translated_infobox[arabic_key] = new_field_data
+ else:
+ # Keep original if not translated
+ translated_infobox[arabic_key] = field_data
+
+ logger.info(f"Successfully translated infobox with {len(translated_fields)} fields in ONE request")
+
+ return {
+ 'translated_infobox': translated_infobox,
+ 'translation_metadata': {
+ 'method': 'single_request',
+ 'api_calls': 1,
+ 'total_fields': len(infobox_data),
+ 'translated_fields': len(translated_fields)
+ },
+ 'original_field_count': len(infobox_data),
+ 'translated_field_count': len(translated_fields)
+ }
+
+ except Exception as e:
+ logger.error(f"Single-request infobox translation failed: {e}")
+ # Return original data with error metadata
+ return {
+ 'translated_infobox': infobox_data,
+ 'translation_metadata': {
+ 'method': 'single_request_failed',
+ 'error': str(e),
+ 'api_calls': 0
+ },
+ 'original_field_count': len(infobox_data),
+ 'translated_field_count': 0
+ }
+
+ def _call_gemini(self, prompt: str) -> str:
+ """Make API call to Gemini via LiteLLM."""
+ try:
+ response = self.litellm.completion(
+ model=self.model,
+ messages=[{"role": "user", "content": prompt}],
+ temperature=self.temperature,
+ max_tokens=self.max_tokens,
+ api_key=self.api_key
+ )
+ return response.choices[0].message.content
+ except Exception as e:
+ logger.error(f"Gemini API call failed: {e}")
+ raise
+
+ def translate_text(self, text: str, **kwargs) -> TranslationResult:
+ """Translate a single text string."""
+ try:
+ prompt = f"Translate the following text from {self.source_lang} to {self.target_lang}:\n\n{text}\n\nTranslation:"
+ translated_text = self._call_gemini(prompt).strip()
+
+ return TranslationResult(
+ translated_text=translated_text,
+ original_text=text,
+ confidence=0.9,
+ metadata={"model": self.model, "method": "single_text"}
+ )
+ except Exception as e:
+ logger.error(f"Text translation failed: {e}")
+ return TranslationResult(
+ translated_text=text,
+ original_text=text,
+ confidence=0.0,
+ metadata={"error": str(e)}
+ )
+
+ def translate_field(self, field_name: str, field_value: Any, **kwargs) -> TranslationResult:
+ """Translate a field name and value pair."""
+ try:
+ # Skip translation for certain field types
+ if isinstance(field_value, dict):
+ field_type = field_value.get('type', 'text')
+ value = field_value.get('value', '')
+
+ # Don't translate numbers, links, or images
+ if field_type in ['number', 'link', 'image']:
+ return TranslationResult(
+ translated_text=str(value),
+ original_text=str(value),
+ confidence=1.0,
+ metadata={"skipped": True, "reason": f"field_type_{field_type}"}
+ )
+ else:
+ value = field_value
+
+ prompt = f"""Translate the following field value to Arabic:
+
+Field: {field_name}
+Value: {value}
+Type: text
+
+INSTRUCTION:
+- Translate naturally and maintain meaning
+- Keep technical terms and proper names in original form when appropriate
+- Output only the translated text, no explanations
+
+Translated value:"""
+
+ translated_text = self._call_gemini(prompt).strip()
+
+ return TranslationResult(
+ translated_text=translated_text,
+ original_text=str(value),
+ confidence=0.9,
+ metadata={"model": self.model, "method": "field_translation"}
+ )
+ except Exception as e:
+ logger.error(f"Field translation failed for {field_name}: {e}")
+ return TranslationResult(
+ translated_text=str(field_value),
+ original_text=str(field_value),
+ confidence=0.0,
+ metadata={"error": str(e)}
+ )
+
+ def is_available(self) -> bool:
+ """Check if Gemini service is available."""
+ if not self.api_key:
+ return False
+
+ try:
+ # Try a simple test call
+ test_prompt = "Say 'OK' if you can understand this message."
+ response = self._call_gemini(test_prompt)
+ return 'OK' in response.upper()
+ except Exception:
+ return False
+
+ def get_service_name(self) -> str:
+ """Get service name."""
+ return "Google Gemini AI"
+
+
+# Register the service
+TranslationServiceFactory.register_service("gemini", GeminiTranslator)
+TranslationServiceFactory.register_service("google_gemini", GeminiTranslator)
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/prompt_template.txt b/tasks/InfoboxSync/translate/prompt_template.txt
new file mode 100644
index 00000000..15f68029
--- /dev/null
+++ b/tasks/InfoboxSync/translate/prompt_template.txt
@@ -0,0 +1,125 @@
+You are a professional translator specializing in Wikipedia infobox content.
+
+STRICT TRANSLATION RULES - MUST FOLLOW WITHOUT EXCEPTION:
+
+CONTENT TYPE HANDLING:
+
+* PLAIN TEXT:
+ - DO: Translate descriptively and naturally
+ - DON'T: Don't skip or ignore any text
+ Examples:
+ "Professional footballer" -> "لاعب كرة قدم محترف"
+ "American actor and comedian" -> "ممثل وكوميدي أمريكي"
+ "Award-winning journalist" -> "صحفي حاصل على جوائز"
+ "Environmental consultant" -> "مستشار بيئي"
+
+* EXTERNAL LINKS:
+ - DO: Keep the exact URL format unchanged, translate only the display text
+ - DON'T: Never modify the URL or format
+ Examples:
+ [http://www.example.com Football website] -> [http://www.example.com موقع كرة قدم]
+ [https://news.bbc.co.uk Football news] -> [https://news.bbc.co.uk أخبار كرة قدم]
+ [http://football.com/transfers Latest transfers] -> [http://football.com/transfers أحدث الانتقالات]
+ [http://wikipedia.org Wikipedia] -> [http://wikipedia.org ويكيبيديا]
+
+* WIKI LINKS:
+ - DO: Keep the link target exactly as is, translate ONLY the display text
+ - DON'T: Don't change the link target/URL or syntax
+ Examples:
+ [[Manchester_United|Manchester United F.C.]] -> [[Manchester_United|مانشستر يونايتد]]
+ [[FC_Bayern_Munich|Bayern Munich]] -> [[FC_Bayern_Munich|بايرن ميونخ]]
+ [[Barcelona_SC|Club Atlético Barcelona]] -> [[Barcelona_SC|برشلونة الرياضي]]
+ [[Premier_League|English Premier League]] -> [[Premier_League|الدوري الإنجليزي الممتاز]]
+
+* TEMPLATES:
+ - DO: Keep template name and syntax intact, translate ONLY human-readable text parameters
+ - DON'T: Don't change template structure, numbers, or technical parameters
+ - IMPORTANT: Template NAMES (like 'birth date', 'convert') must NEVER be translated
+ - CRITICAL: Only translate template parameter VALUES if they are human-readable text
+ - NOTE: When the target language has an equivalent template, use its name (e.g., English 'flag' templates may become Arabic 'علم' templates)
+ Examples:
+ {{birth date|1990|5|15}} -> {{birth date|1990|5|15}}
+ {{convert|175|cm|ft}} -> {{convert|175|cm|ft}}
+ {{cite web|title=News article}} -> {{cite web|title=مقالة إخبارية}}
+ {{flagicon|USA}} -> {{علم الولايات المتحدة}}
+
+* NUMBERS & MEASURES:
+ - DO: Keep ALL numbers, decimals, and symbols unchanged, translate ONLY units and suffixes
+ - DON'T: Don't modify any numerical values or punctuation
+ Examples:
+ 1.84 m -> 1.84 متر
+ 25 years old -> 25 عامًا
+ 150 kg -> 150 كيلوغرام
+ $100,000 -> 100,000 دولار أمريكي
+
+* RAW TEXT:
+ - DO: Treat entirely as plain text and translate all contents
+ - DON'T: Don't leave any part untranslated
+ Examples:
+ "Barcelona, Spain" -> "برشلونة، إسبانيا"
+ "born in Madrid" -> "ولد في مدريد"
+ "New York City" -> "مدينة نيويورك"
+ "Los Angeles, California" -> "لوس أنجلوس، كاليفورنيا"
+
+
+Translate ALL the following field values from English to Arabic in ONE SINGLE REQUEST. Each field is marked with [index] for identification.
+
+INSTRUCTION:
+
+COMPOUND/COMPLEX TEXT HANDLING:
+- DO: When text contains multiple content types, process EACH PART based on the basic content type rules
+- DON'T: Don't treat compound text as a single unit - break it down and handle each element according to its type
+
+COMPOUND TEXT EXAMPLES:
+"[[Manchester United]] is a football club founded in [[1902]]"
+-> Break down: "[[Manchester United]]" translated using WIKI LINKS rule (translate display, keep target)
++ " is a football club founded in " translated as PLAIN TEXT
++ "1902" translated using NUMBERS rule (keep unchanged)
+-> Result: "[[Manchester United|مانشستر يونايتد]] هو نادي كرة قدم تأسس في [[1902]]"
+
+Compound text that mixes links, plain text, and templates must follow all the above rules simultaneously.
+
+
+FOOTBALL/MANAGERIAL TERMS TRANSLATION:
+- DO: Use these exact translations for common football positions and roles
+- DON'T: Don't improvise translations for these standard terms
+
+STANDARD FOOTBALL TRANSLATIONS:
+loan = إعارة
+manager = مدرب
+head coach = مدرب
+on loan from = معارًا من
+interim/caretaker = مؤقت
+scout = كشاف
+football director = مدير رياضي
+assistant = مساعد
+goalkeeping coach = مدرب حراس
+fitness coach = معد بدني
+coordinator = منسق
+player and individual coach = لاعب ومدرب
+assistant coach = مساعد مدرب
+
+EXAMPLES:
+- "Head Coach: John Smith" -> "المدرب: جون سميث"
+- "Goalkeeper Coach: Mike Johnson" -> "مدرب الحراس: مايك جونسون"
+- "Fitness Coach: David Brown" -> "المعد البدني: ديفيد براون"
+- "On loan from Manchester United" -> "معارًا من مانشستر يونايتد"
+- "Assistant Coach: Sarah Wilson" -> "المساعد المدرب: سارة ويلسون"
+
+GENERAL OUTPUT RULES:
+- Translate EVERY field value to Arabic
+- Keep the [index] markers in your response
+- Translate naturally while maintaining meaning
+- Keep technical terms, proper names, and numbers in original form when appropriate
+- For numbered field items, translate each one individually
+- Output in the SAME format with [index] markers
+
+FIELDS TO TRANSLATE:
+{{FIELDS_TEXT}}
+
+RESPONSE FORMAT:
+[{{START_INDEX}}]: translated_value_1
+[{{START_INDEX+1}}]: translated_value_2
+[{{START_INDEX+2}}]: translated_value_3
+...continue for all fields...
+
+IMPORTANT: Respond with ALL translated fields using the SAME [index] markers.
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/translate.py b/tasks/InfoboxSync/translate/translate.py
new file mode 100644
index 00000000..475194fe
--- /dev/null
+++ b/tasks/InfoboxSync/translate/translate.py
@@ -0,0 +1,230 @@
+import logging
+from typing import Dict, Any, Optional
+from .base_translator import TranslationServiceFactory
+from .config import get_translation_config
+
+logger = logging.getLogger(__name__)
+
+
+def translate_data(mapped_data: dict, target_lang: str = 'ar',
+ service_name: Optional[str] = None) -> dict:
+ """
+ Translate the mapped data to the target language using AI translation services.
+
+ Args:
+ mapped_data (dict): The mapped data from the map stage with Arabic field names.
+ target_lang (str): Target language code (default: 'ar' for Arabic).
+ service_name (Optional[str]): Translation service to use. If None, uses default.
+
+ Returns:
+ dict: Translated data with additional translation metadata.
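+
+ Example (sketch; assumes a configured translation service and mapped input):
+ >>> translated = translate_data(mapped_data, target_lang='ar', service_name='gemini')
+ >>> translated['translation_metadata']['success']
+ True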
+ """
+ logger.info(f"Starting data translation to {target_lang}")
+
+ try:
+ # Get configuration
+ config = get_translation_config()
+
+ # Determine which service to use
+ if not service_name:
+ service_name = config.get_default_service()
+
+ logger.info(f"Using translation service: {service_name}")
+
+ # Create translation service
+ try:
+ translator = TranslationServiceFactory.create_service(
+ service_name,
+ source_lang='en',
+ target_lang=target_lang
+ )
+ except Exception as e:
+ logger.error(f"Failed to create translation service {service_name}: {e}")
+ # Return original data with error metadata
+ return _add_translation_error(mapped_data, str(e))
+
+ # Check if service is available
+ if not translator.is_available():
+ error_msg = f"Translation service {service_name} is not available"
+ logger.error(error_msg)
+ return _add_translation_error(mapped_data, error_msg)
+
+ # Extract infobox data for translation
+ arabic_fields = mapped_data.get('arabic_fields', {})
+ if not arabic_fields:
+ logger.warning("No Arabic fields found in mapped data")
+ return _add_translation_metadata(mapped_data, {}, "no_fields")
+
+ logger.info(f"Translating {len(arabic_fields)} fields")
+
+ # Translate the infobox data
+ translation_result = translator.translate_infobox(arabic_fields)
+
+ # Process translation results
+ translated_infobox = translation_result.get('translated_infobox', {})
+ translation_metadata = translation_result.get('translation_metadata', {})
+
+ # Build the final translated data structure
+ translated_data = mapped_data.copy()
+ translated_data['translated_fields'] = translated_infobox
+ translated_data['translation_metadata'] = {
+ 'service': translator.get_service_name(),
+ 'target_language': target_lang,
+ 'translation_method': translation_metadata.get('method', 'unknown'),
+ 'total_fields': translation_result.get('original_field_count', 0),
+ 'translated_fields': translation_result.get('translated_field_count', 0),
+ 'success': True
+ }
+
+ # Carry over the Arabic page title from the map stage, when available
+ if 'arabic_title' in mapped_data and mapped_data['arabic_title']:
+ translated_data['translated_title'] = mapped_data['arabic_title']
+
+ logger.info(f"Successfully translated data for: {mapped_data.get('page_title', 'Unknown')}")
+ logger.info(f"Translation stats: {translation_result.get('translated_field_count', 0)}/"
+ f"{translation_result.get('original_field_count', 0)} fields translated")
+
+ return translated_data
+
+ except Exception as e:
+ logger.error(f"Error translating data: {e}")
+ return _add_translation_error(mapped_data, str(e))
+
+
+def _add_translation_metadata(mapped_data: dict, translation_metadata: dict,
+ method: str = "unknown") -> dict:
+ """Add translation metadata to mapped data."""
+ translated_data = mapped_data.copy()
+ translated_data['translation_metadata'] = {
+ 'service': 'unknown',
+ 'target_language': 'ar',
+ 'translation_method': method,
+ 'success': True,
+ **translation_metadata
+ }
+ return translated_data
+
+
+def _add_translation_error(mapped_data: dict, error_message: str) -> dict:
+ """Add translation error metadata to mapped data."""
+ translated_data = mapped_data.copy()
+ translated_data['translation_metadata'] = {
+ 'service': 'unknown',
+ 'target_language': 'ar',
+ 'success': False,
+ 'error': error_message
+ }
+ return translated_data
+
+
+def get_available_translation_services() -> list:
+ """
+ Get list of available translation services.
+
+ Returns:
+ list: List of available service names
+ """
+ try:
+ return TranslationServiceFactory.get_available_services()
+ except Exception as e:
+ logger.error(f"Error getting available services: {e}")
+ return []
+
+
+def test_translation_service(service_name: str = 'gemini') -> bool:
+ """
+ Test if a translation service is available and working.
+
+ Args:
+ service_name (str): Name of the service to test
+
+ Returns:
+ bool: True if service is available and working
+ """
+ try:
+ config = get_translation_config()
+ if not config.has_api_key(service_name):
+ logger.warning(f"No API key available for {service_name}")
+ return False
+
+ translator = TranslationServiceFactory.create_service(service_name)
+ return translator.is_available()
+ except Exception as e:
+ logger.error(f"Error testing translation service {service_name}: {e}")
+ return False
+
+
+def translate_field_by_field(mapped_data: dict, target_lang: str = 'ar',
+ service_name: Optional[str] = None) -> dict:
+ """
+ Translate data field by field (alternative to template-based translation).
+
+ Args:
+ mapped_data (dict): The mapped data from the map stage.
+ target_lang (str): Target language code.
+ service_name (Optional[str]): Translation service to use.
+
+ Returns:
+ dict: Translated data with field-by-field results.
+ """
+ logger.info(f"Starting field-by-field translation to {target_lang}")
+
+ try:
+ # Get configuration and create translator (same as main function)
+ config = get_translation_config()
+ if not service_name:
+ service_name = config.get_default_service()
+
+ translator = TranslationServiceFactory.create_service(
+ service_name,
+ source_lang='en',
+ target_lang=target_lang
+ )
+
+ if not translator.is_available():
+ return _add_translation_error(mapped_data, f"Service {service_name} not available")
+
+ arabic_fields = mapped_data.get('arabic_fields', {})
+ translated_fields = {}
+
+ # Translate each field individually
+ for arabic_key, field_data in arabic_fields.items():
+ if isinstance(field_data, dict) and 'value' in field_data:
+ field_type = field_data.get('type', 'text')
+ value = field_data.get('value', '')
+
+ # Skip certain field types
+ if field_type in ['number', 'link', 'image']:
+ translated_fields[arabic_key] = field_data
+ continue
+
+ # Translate the field value
+ translation_result = translator.translate_field(arabic_key, value)
+
+ if translation_result.confidence > 0:
+ new_field_data = field_data.copy()
+ new_field_data['translated_value'] = translation_result.translated_text
+ new_field_data['translation_confidence'] = translation_result.confidence
+ translated_fields[arabic_key] = new_field_data
+ else:
+ translated_fields[arabic_key] = field_data
+
+ # Build final result
+ translated_data = mapped_data.copy()
+ translated_data['translated_fields'] = translated_fields
+ translated_data['translation_metadata'] = {
+ 'service': translator.get_service_name(),
+ 'target_language': target_lang,
+ 'translation_method': 'field_by_field',
+ 'total_fields': len(arabic_fields),
+ 'translated_fields': len([k for k, v in translated_fields.items()
+ if isinstance(v, dict) and 'translated_value' in v]),
+ 'success': True
+ }
+
+ logger.info("Field-by-field translation completed")
+ return translated_data
+
+ except Exception as e:
+ logger.error(f"Error in field-by-field translation: {e}")
+ return _add_translation_error(mapped_data, str(e))
\ No newline at end of file
diff --git a/tasks/InfoboxSync/wikilocalize/README.md b/tasks/InfoboxSync/wikilocalize/README.md
new file mode 100644
index 00000000..33c16173
--- /dev/null
+++ b/tasks/InfoboxSync/wikilocalize/README.md
@@ -0,0 +1,45 @@
+# Wiki Localization Stage
+
+This stage processes Arabic Wikipedia templates to localize English wiki links and template names to their Arabic equivalents.
+
+## Purpose
+
+- **Wiki Link Localization**: Converts `[[English Page]]` to `[[Arabic Page]]` when Arabic equivalent exists
+- **Template Localization**: Converts template names like `{{Infobox}}` to Arabic equivalents like `{{صندوق}}` (currently disabled in `WikiLocalizer`)
+- **Fallback Handling**: Uses the `{{وإو}}` template for English links that don't have an Arabic equivalent
+- **Interlanguage Link Support**: Uses Wikipedia API to find Arabic versions via langlinks
+
+## Features
+
+✅ **Wiki Link Processing**: Extract and replace `[[link|text]]` patterns
+✅ **Template Processing**: Extract and replace `{{template|params}}` patterns (currently disabled in code)
+✅ **Arabic Wikipedia API**: Check page existence on Arabic Wikipedia
+✅ **Interlanguage Retrieval**: Get Arabic equivalents from English wiki langlinks
+✅ **وإو Template Fallback**: Automatically insert `{{وإو}}` for links without an Arabic equivalent
+✅ **Error Handling**: Comprehensive error reporting and logging
+
+## Usage
+
+```python
+from wikilocalize.wikilocalize import WikiLocalizer
+
+# Process Arabic content with English links
+localizer = WikiLocalizer()
+result = localizer.localize_content(arabic_template_text)
+print(f"Replaced {result.original_links_replaced} links")
+print(f"Inserted {result.waou_templates_inserted} وإو templates")
+```
+
+## Pipeline Integration
+
+This stage sits between **construct** (template building) and **publish** (publish to Wikipedia):
+
+1. **Construct** builds Arabic template from translated data
+2. **WikiLocalize** processes links/templates to Arabic equivalents
+3. **Publish** sends the localized template to Arabic Wikipedia
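+
+A minimal wiring sketch (assuming `construct_result` is the construct-stage output dict carrying `arabic_template`; the variable names here are hypothetical):
+
+```python
+from wikilocalize.integrator import process_construct_to_publish
+
+result = process_construct_to_publish(construct_result)
+if result.success:
+    template_for_publish = result.localized_data['arabic_template']
+else:
+    # Fall back to the unlocalized template; failures are listed in result.errors
+    template_for_publish = construct_result['arabic_template']
+```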
+
+## API Integration
+
+- Uses pywikibot against Arabic Wikipedia to check page existence and resolve redirects
+- Uses pywikibot langlinks on English Wikipedia to retrieve Arabic equivalents
+- Handles API errors gracefully with fallback behavior
+- Repeated lookups can be memoized to minimize API calls (not yet implemented; see the sketch in `wikilocalize.py`)
\ No newline at end of file
diff --git a/tasks/InfoboxSync/wikilocalize/__init__.py b/tasks/InfoboxSync/wikilocalize/__init__.py
new file mode 100644
index 00000000..88f6106a
--- /dev/null
+++ b/tasks/InfoboxSync/wikilocalize/__init__.py
@@ -0,0 +1,3 @@
+"""
+Wiki localization stage for converting English wiki links and templates to Arabic.
+"""
\ No newline at end of file
diff --git a/tasks/InfoboxSync/wikilocalize/integrator.py b/tasks/InfoboxSync/wikilocalize/integrator.py
new file mode 100644
index 00000000..c902ee7a
--- /dev/null
+++ b/tasks/InfoboxSync/wikilocalize/integrator.py
@@ -0,0 +1,175 @@
+"""
+Integration functions for embedding wiki localization into the InfoboxSync pipeline.
+"""
+
+import logging
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
+from tasks.InfoboxSync.wikilocalize.wikilocalize import WikiLocalizeResult, WikiLocalizer
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class LocalizationProcessingResult:
+ """Result of localization processing in the pipeline."""
+ success: bool
+ localized_data: Dict[str, Any]
+ localization_info: WikiLocalizeResult
+ processing_time: float
+ errors: Optional[list] = None
+
+ def __post_init__(self):
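+ # dataclasses disallow mutable defaults, so errors starts as None and is normalized to [] here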
+ if self.errors is None:
+ self.errors = []
+
+
+def process_construct_to_publish(
+ construct_result: Dict[str, Any],
+ enable_local_link_replacement: bool = True,
+ enable_template_localization: bool = True
+) -> LocalizationProcessingResult:
+ """
+ Process data from construct stage through wiki localization for publishing.
+
+ This function sits between construct and publish stages, taking the
+ constructed Arabic template and localizing any English wiki links
+ and templates to their Arabic equivalents.
+
+ Args:
+ construct_result (Dict[str, Any]): Data from construct stage containing 'arabic_template'
+ enable_local_link_replacement (bool): Whether to replace English wiki links with Arabic
+ enable_template_localization (bool): Whether to localize template names
+
+ Returns:
+ LocalizationProcessingResult: Processed data ready for publishing
+ """
+ import time
+ start_time = time.time()
+
+ logger.info("Starting wiki localization processing")
+
+ try:
+ # Check if we have the required input
+ if 'arabic_template' not in construct_result:
+ error_msg = "No arabic_template found in construct_result"
+ logger.error(error_msg)
+ return LocalizationProcessingResult(
+ success=False,
+ localized_data=construct_result,
+ localization_info=WikiLocalizeResult(
+ localized_content="",
+ original_links_replaced=0,
+ templates_localized=0,
+ waou_templates_inserted=0,
+ wow_templates=[],
+ errors=[error_msg]
+ ),
+ processing_time=time.time() - start_time,
+ errors=[error_msg]
+ )
+
+ arabic_content = construct_result['arabic_template']
+ if not arabic_content or not arabic_content.strip():
+ error_msg = "Arabic template is empty"
+ logger.error(error_msg)
+ return LocalizationProcessingResult(
+ success=False,
+ localized_data=construct_result,
+ localization_info=WikiLocalizeResult(
+ localized_content=arabic_content,
+ original_links_replaced=0,
+ templates_localized=0,
+ waou_templates_inserted=0,
+ wow_templates=[],
+ errors=[error_msg]
+ ),
+ processing_time=time.time() - start_time,
+ errors=[error_msg]
+ )
+
+ # Initialize localizer
+ localizer = WikiLocalizer()
+
+ # Perform localization if enabled
+ if enable_local_link_replacement or enable_template_localization:
+ localization_result = localizer.localize_content(arabic_content)
+
+ # Update the construct result with localized content
+ localized_data = construct_result.copy()
+ localized_data['arabic_template'] = localization_result.localized_content
+ localized_data['localization_metadata'] = {
+ 'links_replaced': localization_result.original_links_replaced,
+ 'templates_localized': localization_result.templates_localized,
+ 'waou_templates_inserted': localization_result.waou_templates_inserted,
+ 'localization_errors': localization_result.errors
+ }
+
+ processing_time = time.time() - start_time
+
+ logger.info("Wiki localization completed successfully")
+ logger.info(f"- Links replaced: {localization_result.original_links_replaced}")
+ logger.info(f"- Templates localized: {localization_result.templates_localized}")
+ logger.info(f"- واو templates inserted: {localization_result.waou_templates_inserted}")
+
+ if localization_result.errors:
+ logger.warning(f"Localization errors: {localization_result.errors}")
+
+ return LocalizationProcessingResult(
+ success=len(localization_result.errors) == 0,
+ localized_data=localized_data,
+ localization_info=localization_result,
+ processing_time=processing_time
+ )
+ else:
+ # Localization disabled, just pass through
+ logger.info("Wiki localization disabled, passing through data unchanged")
+ return LocalizationProcessingResult(
+ success=True,
+ localized_data=construct_result,
+ localization_info=WikiLocalizeResult(
+ localized_content=arabic_content,
+ original_links_replaced=0,
+ templates_localized=0,
+ waou_templates_inserted=0,
+ wow_templates=[],
+ errors=[]
+ ),
+ processing_time=time.time() - start_time
+ )
+
+ except Exception as e:
+ error_msg = f"Unexpected error during localization processing: {e}"
+ logger.error(error_msg)
+ processing_time = time.time() - start_time
+
+ return LocalizationProcessingResult(
+ success=False,
+ localized_data=construct_result,
+ localization_info=WikiLocalizeResult(
+ localized_content=construct_result.get('arabic_template', ''),
+ original_links_replaced=0,
+ templates_localized=0,
+ waou_templates_inserted=0,
+ wow_templates=[],
+ errors=[error_msg]
+ ),
+ processing_time=processing_time,
+ errors=[error_msg]
+ )
+
+
+def get_localization_statistics(localization_result: WikiLocalizeResult) -> Dict[str, Any]:
+ """
+ Extract useful statistics from localization results for reporting.
+
+ Args:
+ localization_result (WikiLocalizeResult): Localization result
+
+ Returns:
+ Dict[str, Any]: Statistics dictionary
+ """
+ return {
+ 'total_links_processed': localization_result.original_links_replaced + localization_result.waou_templates_inserted,
+ 'links_successfully_replaced': localization_result.original_links_replaced,
+ 'waou_fallback_templates': localization_result.waou_templates_inserted,
+ 'templates_localized': localization_result.templates_localized,
+ 'localization_errors': len(localization_result.errors),
+ 'success_rate': 'High' if not localization_result.errors else 'Medium' if localization_result.original_links_replaced > 0 else 'Low'
+ }
\ No newline at end of file
diff --git a/tasks/InfoboxSync/wikilocalize/wikilocalize.py b/tasks/InfoboxSync/wikilocalize/wikilocalize.py
new file mode 100644
index 00000000..0b1d95e6
--- /dev/null
+++ b/tasks/InfoboxSync/wikilocalize/wikilocalize.py
@@ -0,0 +1,317 @@
+import logging
+from typing import List, Optional, Any, Tuple
+from dataclasses import dataclass
+import wikitextparser as wtp
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class WOWTemplateItem:
+ """Information about a واو template replacement."""
+ link: Any # Wikilink object from wikitextparser
+ localization_result: 'LangLinkResult' # Full localization result object
+
+@dataclass
+class WikiLocalizeResult:
+ """Result of wiki localization process."""
+ localized_content: str
+ original_links_replaced: int
+ templates_localized: int
+ waou_templates_inserted: int
+ wow_templates: List[WOWTemplateItem]
+ errors: List[str]
+
+
+@dataclass
+class LangLinkResult:
+ """Result of language link retrieval."""
+ lang: Optional[str] = None
+ ar_page: Optional[str] = None
+ en_page: Optional[str] = None
+
+ def is_empty(self) -> bool:
+ """Check if the result is empty."""
+ return (self.lang is None and self.ar_page is None
+ and self.en_page is None)
+
+
+class WikipediaAPI:
+ """Interface to Wikipedia APIs using pywikibot."""
+
+ @staticmethod
+ def check_arabic_page_exists(page_title: str) -> Optional[str]:
+ """
+ Check if a page exists on Arabic Wikipedia using pywikibot.
+ If it's a redirect, it resolves to the target page.
+
+ Args:
+ page_title (str): Page title to check
+
+ Returns:
+ Optional[str]: The resolved page title if it exists, None otherwise
+ """
+ try:
+ import pywikibot
+
+ # Create Arabic Wikipedia site
+ arabic_site = pywikibot.Site('ar', 'wikipedia')
+
+ # Create page object
+ page = pywikibot.Page(arabic_site, page_title)
+
+ # Resolve redirects recursively
+ seen_titles = set()
+ while page.isRedirectPage():
+ if page.title() in seen_titles:
+ logger.warning(
+ f"Circular redirect detected for '{page_title}'")
+ return None # Return None for circular redirects
+ seen_titles.add(page.title())
+ page = page.getRedirectTarget()
+
+ if page.exists():
+ return page.title().replace('_', ' ')
+ return None
+
+ except ImportError:
+ logger.warning("pywikibot not available for Arabic page check")
+ return None
+ except Exception as e:
+ logger.error(f"Error checking Arabic page existence: {e}")
+ return None
+
+ @staticmethod
+ def get_arabic_langlink(en_page_title: str) -> Optional[str]:
+ """
+ Get the Arabic language link for an English Wikipedia page.
+
+ Args:
+ en_page_title (str): English page title
+
+ Returns:
+ Optional[str]: Arabic page title if exists, None otherwise
+ """
+ try:
+ import pywikibot
+
+ # Create English Wikipedia site and get page
+ english_site = pywikibot.Site('en', 'wikipedia')
+
+ # Clean up the page title
+ clean_title = en_page_title.strip()
+ if clean_title.startswith('[[') and clean_title.endswith(']]'):
+ clean_title = clean_title[2:-2]
+ if '|' in clean_title:
+ clean_title = clean_title.split('|')[0]  # keep the link target, drop the display text
+
+ page = pywikibot.Page(english_site, clean_title)
+
+ # Check if page exists on English Wikipedia
+ if not page.exists():
+ logger.debug(
+ f"Page '{clean_title}' does not exist on EN Wikipedia")
+ return None
+
+ # Get langlinks and find Arabic version
+ langlinks = page.langlinks()
+ for langlink in langlinks:
+ if langlink.site.code == 'ar':
+ return langlink.title.replace('_', ' ')
+
+ logger.debug(f"No Arabic langlink found for: {clean_title}")
+ return None
+
+ except ImportError:
+ logger.warning("pywikibot not available for langlink retrieval")
+ return None
+ except Exception as e:
+ logger.error(
+ f"Error getting Arabic langlink for '{en_page_title}': {e}")
+ return None
+
+ @staticmethod
+ def get_arabic_langlink_detailed(en_page_title: str) -> LangLinkResult:
+ """
+ Get the Arabic language link for an English Wikipedia page with
+ detailed results.
+
+ Args:
+ en_page_title (str): English page title
+
+ Returns:
+ LangLinkResult: Object with language and page information
+ - If Arabic found: {lang='ar', ar_page=arabic_title}
+ - If English exists: {lang='en', en_page=english_title}
+ - If not found: empty object {}
+ """
+ try:
+ import pywikibot
+
+ # Create English Wikipedia site and get page
+ english_site = pywikibot.Site('en', 'wikipedia')
+
+ # Clean up the page title
+ clean_title = en_page_title.strip()
+ if clean_title.startswith('[[') and clean_title.endswith(']]'):
+ clean_title = clean_title[2:-2]
+ if '|' in clean_title:
+ clean_title = clean_title.split('|')[0] # Take first part
+
+ page = pywikibot.Page(english_site, clean_title)
+
+ # Check if page exists on English Wikipedia
+ if not page.exists():
+ logger.debug(
+ f"Page '{clean_title}' does not exist on EN Wikipedia")
+ return LangLinkResult() # Return empty object
+
+ # Get langlinks and find Arabic version
+ langlinks = page.langlinks()
+ for langlink in langlinks:
+ if langlink.site.code == 'ar':
+ return LangLinkResult(
+ lang='ar',
+ ar_page=langlink.title.replace('_', ' ')
+ )
+
+ # No Arabic link found, but English page exists
+ logger.debug(f"No Arabic langlink found for: {clean_title}")
+ return LangLinkResult(lang='en', en_page=clean_title)
+
+ except ImportError:
+ logger.warning("pywikibot not available for langlink retrieval")
+ return LangLinkResult()
+ except Exception as e:
+ logger.error(
+ f"Error getting Arabic langlink for '{en_page_title}': {e}")
+ return LangLinkResult()
+
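+# Repeated lookups for the same title currently hit the network every time.
+# A minimal memoization sketch (hypothetical helper, not wired into the
+# pipeline; assumes titles are already normalized):
+#
+# from functools import lru_cache
+#
+# @lru_cache(maxsize=1024)
+# def cached_arabic_page_exists(page_title: str) -> Optional[str]:
+#     return WikipediaAPI.check_arabic_page_exists(page_title)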
+
+class WikiLocalizer:
+ """
+ Localizes wiki links and templates within a given wikitext.
+ """
+
+ def localize_content(self, content: str) -> WikiLocalizeResult:
+ """
+ Localizes wiki links and templates in the provided wikitext content.
+
+ Args:
+ content (str): The wikitext content to localize.
+
+ Returns:
+ WikiLocalizeResult: The result of the localization process.
+ """
+ localized_content = content
+ original_links_replaced = 0
+ templates_localized = 0
+ waou_templates_inserted = 0
+ wow_templates = []
+ errors = []
+
+ parsed_content = wtp.parse(content)
+
+ # Localize wikilinks
+ for link in parsed_content.wikilinks:
+ original_target = link.target
+ localization_result = self._localize_wikilink(original_target, errors)
+ if not localization_result.is_empty():
+ # Use the localized page based on language
+ if localization_result.lang == 'ar' and localization_result.ar_page:
+ if localization_result.ar_page != original_target:
+ link.target = localization_result.ar_page
+ original_links_replaced += 1
+ elif (localization_result.lang == 'en' and localization_result.en_page):
+ # Use واو template for English pages without Arabic equivalent
+ wow_templates.append(WOWTemplateItem(
+ link=link,
+ localization_result=localization_result
+ ))
+
+ # Localize templates
+ # for template in parsed_content.templates:
+ # original_name = template.name
+ # localized_name, is_waou = \
+ # self._localize_template(original_name, errors)
+ # if localized_name != original_name:
+ # template.name = localized_name
+ # templates_localized += 1
+ # if is_waou:
+ # waou_templates_inserted += 1
+
+ localized_content = parsed_content.string
+
+ # Handle WOW templates after link localization
+ for wow_template in wow_templates:
+ en_page = wow_template.localization_result.en_page
+ # Fall back to the link target when the link has no display text
+ ar_text = wow_template.link.text or wow_template.link.target
+ temp_template = f"{{{{وإو|{ar_text}|{en_page}}}}}"
+ localized_content = localized_content.replace(wow_template.link.string, temp_template)
+ waou_templates_inserted += 1
+
+ return WikiLocalizeResult(
+ localized_content=localized_content,
+ original_links_replaced=original_links_replaced,
+ templates_localized=templates_localized,
+ waou_templates_inserted=waou_templates_inserted,
+ wow_templates=wow_templates,
+ errors=errors
+ )
+
+ def _localize_wikilink(self, target: str, errors: List[str]) -> LangLinkResult:
+ """
+ Localizes a single wikilink target.
+
+ Returns:
+ LangLinkResult: Object with lang and page info
+ - If Arabic page found: {lang='ar', ar_page=arabic_title}
+ - If English exists: {lang='en', en_page=target}
+ - If not found: empty object {}
+ """
+ # 1. Check in ar wiki directly first
+ arabic_page_title = WikipediaAPI.check_arabic_page_exists(target)
+ if arabic_page_title:
+ return LangLinkResult(lang='ar', ar_page=arabic_page_title)
+
+ # 2. Check in en wiki with detailed results
+ langlink_result = WikipediaAPI.get_arabic_langlink_detailed(target)
+ if not langlink_result.is_empty():
+ return langlink_result
+
+ # If not found, return empty result
+ return LangLinkResult()
+
+ def _localize_template(self, template_name: str, errors: List[str]) -> Tuple[str, bool]:
+ """
+ Localizes a single template name.
+ Returns (localized_name, is_waou_template)
+ """
+ is_waou = False
+ # 1. Check in ar wiki, use if found (and resolved)
+ arabic_template_page_title = \
+ WikipediaAPI.check_arabic_page_exists(template_name)
+ if arabic_template_page_title:
+ return arabic_template_page_title, is_waou
+
+ # 2. Check in en wiki with detailed results
+ langlink_result = (WikipediaAPI
+ .get_arabic_langlink_detailed(template_name))
+ if not langlink_result.is_empty():
+ if langlink_result.lang == 'ar' and langlink_result.ar_page:
+ return langlink_result.ar_page, is_waou
+ elif langlink_result.lang == 'en' and langlink_result.en_page:
+ return langlink_result.en_page, is_waou
+
+ # If not found in en wiki, use واو template
+ is_waou = True
+ return f"واو|{template_name}", is_waou
\ No newline at end of file