diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..29dda20c --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,18 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Run pwb.py with test.py", + "type": "python", + "request": "launch", + "program": "/home/lokas/PycharmProjects/pythonProject3/core_stable/pwb.py", + "args": [ + "-dir:/home/lokas/PycharmProjects/pythonProject3/core_stable", + "/home/lokas/PycharmProjects/pythonProject3/code/tasks/InfoboxSync/test.py" + ], + "console": "integratedTerminal", + "justMyCode": false, + "python": "/usr/bin/python3.9" + } + ] +} diff --git a/output/paul_abasolo.json b/output/paul_abasolo.json new file mode 100644 index 00000000..66ef445b --- /dev/null +++ b/output/paul_abasolo.json @@ -0,0 +1,800 @@ +{ + "page_title": "Paul Abasolo", + "template_type": "football_biography", + "arabic_fields": { + "أندية_الشباب": { + "value": [ + "Lauaxeta Ikastola", + "[[Athletic Bilbao]]" + ], + "type": "numbered", + "item_type": "raw", + "count": 2, + "original_keys": [ + "youthclubs1", + "youthclubs2" + ] + }, + "سنوات_الشباب": { + "value": [ + "1995–1996", + "1996–2002" + ], + "type": "numbered", + "item_type": "raw", + "count": 2, + "original_keys": [ + "youthyears1", + "youthyears2" + ] + }, + "أندية": { + "value": [ + "[[CD Basconia|Basconia]]", + "[[Barakaldo CF|Barakaldo]]", + "[[SD Eibar|Eibar]]", + "→ [[SD Lemona|Lemona]] (loan)", + "→ [[Logroñés CF|Logroñés]] (loan)", + "[[Logroñés CF|Logroñés]]", + "[[Real Unión]]", + "Iurretako", + "[[SD Lemona|Lemona]]", + "[[Real Oviedo|Oviedo]]", + "[[Sestao River Club|Sestao]]", + "[[Amurrio Club|Amurrio]]", + "[[Zamudio SD|Zamudio]]", + "[[Club Portugalete|Portugalete]]", + "Batea" + ], + "type": "numbered", + "item_type": "raw", + "count": 15, + "original_keys": [ + "clubs1", + "clubs2", + "clubs3", + "clubs4", + "clubs5", + "clubs6", + "clubs7", + "clubs8", + "clubs9", + "clubs10", + "clubs11", + "clubs12", + "clubs13", + "clubs14", + "clubs15" + ] + }, + "سنوات": { + "value": [ + "2002–2003", + "2003–2004", + "2004–2006", + "2005", + "2005–2006", + "2006–2007", + "2007–2010", + "2010", + "2011", + "2011–2012", + "2012–2013", + "2014", + "2015–2016", + "2016–2017", + "2018–2021" + ], + "type": "numbered", + "item_type": "raw", + "count": 15, + "original_keys": [ + "years1", + "years2", + "years3", + "years4", + "years5", + "years6", + "years7", + "years8", + "years9", + "years10", + "years11", + "years12", + "years13", + "years14", + "years15" + ] + }, + "مباريات": { + "value": [ + "35", + "24", + "2", + "16", + "24", + "29", + "82", + "11", + "21", + "26", + "13", + "45", + "12", + "41" + ], + "type": "numbered", + "item_type": "number", + "count": 14, + "original_keys": [ + "caps1", + "caps2", + "caps3", + "caps4", + "caps5", + "caps6", + "caps7", + "caps9", + "caps10", + "caps11", + "caps12", + "caps13", + "caps14", + "caps15" + ] + }, + "أهداف": { + "value": [ + "5", + "1", + "0", + "4", + "2", + "8", + "12", + "1", + "2", + "0", + "5", + "17", + "8", + "10" + ], + "type": "numbered", + "item_type": "number", + "count": 14, + "original_keys": [ + "goals1", + "goals2", + "goals3", + "goals4", + "goals5", + "goals6", + "goals7", + "goals9", + "goals10", + "goals11", + "goals12", + "goals13", + "goals14", + "goals15" + ] + }, + "منتخب_وطني": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, + "سنوات_وطنية": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, 
+ "مباريات_وطنية": { + "value": [], + "type": "numbered", + "item_type": "number", + "count": 0, + "original_keys": [] + }, + "أهداف_وطنية": { + "value": [], + "type": "numbered", + "item_type": "number", + "count": 0, + "original_keys": [] + }, + "أندية_مدرب": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, + "سنوات_مدرب": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, + "اسم": { + "value": "Paul Abasolo", + "type": "text", + "original_key": "name", + "validation": { + "is_valid": true, + "length": 12, + "has_special_chars": false + } + }, + "الاسم الكامل": { + "value": "Paul Abasolo Amantegi", + "type": "text", + "original_key": "fullname", + "validation": { + "is_valid": true, + "length": 21, + "has_special_chars": false + } + }, + "تاريخ الولادة": { + "value": "{{birth date and age|1984|6|29|df=yes}}", + "type": "raw", + "original_key": "birth_date", + "validation": { + "is_valid": true + } + }, + "مكان الولادة": { + "value": "[[Durango, Spain]]", + "type": "raw", + "original_key": "birth_place", + "validation": { + "is_valid": true + } + }, + "الطول": { + "value": 1.84, + "type": "number", + "original_key": "height", + "validation": { + "is_valid": true, + "numeric_value": 1.84, + "has_units": true + }, + "numeric_value": 1.84 + }, + "المركز": { + "value": "[[Forward (association football)|Forward]]", + "type": "raw", + "original_key": "position", + "validation": { + "is_valid": true + } + }, + "مجموع_مباريات": { + "value": 381.0, + "type": "number", + "original_key": "totalcaps", + "validation": { + "is_valid": true, + "numeric_value": 381.0, + "has_units": true + }, + "numeric_value": 381.0 + }, + "إجمالي الأهداف": { + "value": 75.0, + "type": "number", + "original_key": "totalgoals", + "validation": { + "is_valid": true, + "numeric_value": 75.0, + "has_units": true + }, + "numeric_value": 75.0 + } + }, + "metadata": { + "categories": [ + "1984 births", + "Living people", + "Footballers from Durango, Biscay", + "Spanish men's footballers", + "Men's association football forwards", + "Segunda División players", + "Segunda División B players", + "Tercera División players", + "Divisiones Regionales de Fútbol players", + "CD Basconia footballers", + "Athletic Bilbao footballers", + "Barakaldo CF footballers", + "SD Eibar footballers", + "SD Lemona footballers", + "Logroñés CF footballers", + "Real Unión footballers", + "Real Oviedo players", + "Sestao River Club footballers", + "Amurrio Club footballers", + "Zamudio SD players", + "Club Portugalete players", + "People convicted of sexual assault", + "21st-century Spanish sportsmen" + ], + "links": [ + "Durango, Spain", + "Forward (association football)", + "Athletic Bilbao", + "CD Basconia", + "Barakaldo CF", + "SD Eibar", + "SD Lemona", + "Logroñés CF", + "Logroñés CF", + "Real Unión", + "SD Lemona", + "Real Oviedo", + "Sestao River Club", + "Amurrio Club", + "Zamudio SD", + "Club Portugalete", + "Association football", + "Forward (association football)", + "Durango, Spain", + "Biscay", + "Athletic Bilbao", + "farm team", + "CD Basconia", + "Mundo Deportivo", + "Segunda División B", + "Basque Country (autonomous community)", + "SD Eibar", + "2004–05 Segunda División", + "Segunda División", + "SD Lemona", + "Logroñés CF", + "El Correo", + "2009–10 Segunda División", + "Football in Spain", + "Marca (newspaper)", + "El Mundo (Spain)", + "Real Unión", + "2008–09 Segunda División B", + "ABC (newspaper)", + "Real Oviedo", + "Sestao 
River Club", + "sexual assault", + "Government of Spain", + "La Nueva España", + "Argia (magazine)" + ], + "template_name": "football_biography", + "total_mapped_fields": 20, + "original_field_count": 70 + }, + "raw_content": "{{short description|Spanish footballer}}\n{{family name hatnote|Abasolo|Amantegi|lang=Spanish}}\n{{Use dmy dates|date=January 2024}}\n{{Infobox football biography\n| name = Paul Abasolo\n| image = \n| fullname = Paul Abasolo Amantegi\n| birth_date = {{birth date and age|1984|6|29|df=yes}} \n| birth_place = [[Durango, Spain]]\n| height = {{height|m=1.84}}\n| position = [[Forward (association football)|Forward]]\n| currentclub = \n| clubnumber = \n| youthyears1 = 1995–1996 | youthclubs1 = Lauaxeta Ikastola\n| youthyears2 = 1996–2002 | youthclubs2 = [[Athletic Bilbao]]\n| years1 = 2002–2003 | clubs1 = [[CD Basconia|Basconia]] | caps1 = 35 | goals1 = 5\n| years2 = 2003–2004 | clubs2 = [[Barakaldo CF|Barakaldo]] | caps2 = 24 | goals2 = 1\n| years3 = 2004–2006 | clubs3 = [[SD Eibar|Eibar]] | caps3 = 2 | goals3 = 0\n| years4 = 2005 | clubs4 = → [[SD Lemona|Lemona]] (loan) | caps4 = 16 | goals4 = 4\n| years5 = 2005–2006 | clubs5 = → [[Logroñés CF|Logroñés]] (loan) | caps5 = 24 | goals5 = 2\n| years6 = 2006–2007 | clubs6 = [[Logroñés CF|Logroñés]] | caps6 = 29 | goals6 = 8\n| years7 = 2007–2010 | clubs7 = [[Real Unión]] | caps7 = 82 | goals7 = 12\n| years8 = 2010 | clubs8 = Iurretako | caps8 = | goals8 =\n| years9 = 2011 | clubs9 = [[SD Lemona|Lemona]] | caps9 = 11 | goals9 = 1\n| years10 = 2011–2012 | clubs10 = [[Real Oviedo|Oviedo]] | caps10 = 21 | goals10 = 2\n| years11 = 2012–2013 | clubs11 = [[Sestao River Club|Sestao]] | caps11 = 26 | goals11 = 0\n| years12 = 2014 | clubs12 = [[Amurrio Club|Amurrio]] | caps12 = 13 | goals12 = 5 \n| years13 = 2015–2016 | clubs13 = [[Zamudio SD|Zamudio]] | caps13 = 45 | goals13 = 17\n| years14 = 2016–2017 | clubs14 = [[Club Portugalete|Portugalete]] | caps14 = 12 | goals14 = 8\n| years15 = 2018–2021 | clubs15 = Batea | caps15 = 41 | goals15 = 10\n| totalcaps = 381 | totalgoals = 75\n| club-update =\n| nationalteam-update =\n}}\n'''Paul Abasolo Amantegi''' ({{IPA|es|pawl aβaˈsolo amanˈtexi}}; born 29 June 1984) is a Spanish former [[Association football|footballer]] who played as a [[Forward (association football)|forward]].\n\n==Club career==\nBorn in [[Durango, Spain|Durango]], [[Biscay]], Abasolo spent seven years connected with [[Athletic Bilbao]], six in the youth system and one with the [[farm team]], [[CD Basconia]].[https://www.mundodeportivo.com/20111207/athletic-bilbao/entrevista-abasolo-gozada-jugar-athletic_54239906805.html Abasolo: \"Para mí es una gozada ver jugar a este Athletic\" (Abasolo: \"I'm having a blast watching this Athletic play\")]; [[Mundo Deportivo]], 7 December 2011 (in Spanish) Released in 2003, he played the better part of the following six years in the [[Segunda División B]] and in his native [[Basque Country (autonomous community)|Basque Country]], the sole exception being [[SD Eibar]] in the first part of the [[2004–05 Segunda División|2004–05 season]] in the [[Segunda División]], with that club loaning him consecutively to two other teams in division three, [[SD Lemona]][http://hemeroteca.mundodeportivo.com/preview/2005/02/01/pagina-28/1348145/pdf.html#&mode=fullScreen Cuatro fichajes sobre la bocina (Four signings at the buzzer)]; Mundo Deportivo, 1 February 2005 (in Spanish) and [[Logroñés CF]].[https://www.elcorreo.com/vizcaya/20070716/deportes/la-rioja/logrones-inicia-trabajo-jugadores-20070716.html 
El Logroñés CF inicia hoy el trabajo con 20 jugadores (Logroñés CF start working today with 20 players)]; [[El Correo]], 23 July 2007 (in Spanish)\n\nIn the [[2009–10 Segunda División|2009–10 campaign]], Abasolo competed for the second time in the second tier of [[Football in Spain|Spanish football]], scoring four goals[http://www.marca.com/2009/10/03/futbol/2adivision/1254596582.html El Real Unión se aprovecha de un Castellón que no levanta cabeza (Real Unión take advantage of sunken Castellón)] {{Webarchive|url=https://web.archive.org/web/20140821065415/http://www.marca.com/2009/10/03/futbol/2adivision/1254596582.html|date=21 August 2014}}; [[Marca (newspaper)|Marca]], 3 October 2009 (in Spanish)[http://www.marca.com/2009/10/11/futbol/2adivision/1255290535.html Un gran Real Unión dejó sin dos puntos al Betis (Great Real Unión rob Betis of two points)] {{Webarchive|url=https://web.archive.org/web/20140821065450/http://www.marca.com/2009/10/11/futbol/2adivision/1255290535.html|date=21 August 2014}}; Marca, 11 October 2009 (in Spanish)[http://www.elmundo.es/elmundo/2009/10/25/paisvasco/1256496810.html El Real Unión cae 2–1 ante el Cádiz en el Carranza con un gol de Ogbeche (Real Unión fall 2–1 against Cádiz at the Carranza with Ogbeche goal)] {{Webarchive|url=https://web.archive.org/web/20160609194820/http://www.elmundo.es/elmundo/2009/10/25/paisvasco/1256496810.html|date=9 June 2016}}; [[El Mundo (Spain)|El Mundo]], 25 October 2009 (in Spanish)[http://www.marca.com/2010/05/22/futbol/2adivision/1274555627.html El Real Unión cree en la salvación ante un 'novato' Numancia (Real Unión believe in survival against 'rookie' Numancia)] {{Webarchive|url=https://web.archive.org/web/20171118222012/http://www.marca.com/2010/05/22/futbol/2adivision/1274555627.html|date=18 November 2017}}; Marca, 22 May 2010 (in Spanish) in 34 games for [[Real Unión]][http://www.marca.com/2009/07/15/futbol/equipos/real_union/1247674204.html Paul Abasolo no jugará con el Athletic (Paul Abasolo will not play for Athletic)] {{Webarchive|url=https://web.archive.org/web/20120110114532/http://www.marca.com/2009/07/15/futbol/equipos/real_union/1247674204.html|date=10 January 2012}}; Marca, 15 July 2009 (in Spanish) as they suffered relegation one year after [[2008–09 Segunda División B|being promoted]].[https://www.abc.es/deportes/futbol/hercules-primera-201006190000_noticia.html El Hércules vuelve a Primera catorce años después (Hércules return to ''Primera'' fourteen years later)]; [[ABC (newspaper)|ABC]], 19 June 2010 (in Spanish) After a few months playing with a regional league side, he resumed his career in the third division with Lemona, [[Real Oviedo]] and [[Sestao River Club]].[https://www.eldesmarque.com/noticias/pais-vasco/20160602/neira-abasolo-y-zarrabeitia-mas-calidad-para-el-portugalete_60053537.html Neira, Abasolo y Zarrabeitia, más calidad para el Portugalete (Neira, Abasolo and Zarrabeitia, more skill for Portugalete)]; El Desmarque, 2 June 2016 (in Spanish)\n\n==Conviction==\nConvicted of [[sexual assault]] in July 2010 for having attacked three young women, Abasolo was acquitted on a fourth charge due to doubts of the alleged victim.[http://www.elmundo.es/elmundo/2010/07/06/paisvasco/1278430901.html Condenan a un ex futbolista del Real Unión a 3 años de cárcel por abusos sexuales (Real Unión footballer sentenced to 3 years in jail for sexual assault)] {{Webarchive|url=https://web.archive.org/web/20180620153200/http://www.elmundo.es/elmundo/2010/07/06/paisvasco/1278430901.html|date=20 June 2018}}; El Mundo, 
6 July 2010 (in Spanish) He was eventually pardoned by the [[Government of Spain]], but this fact prevented him from being hired by his former club Athletic Bilbao.[http://www.lne.es/deportes/2012/01/21/abasolo-indultado-tres-delitos-agresion-sexual/1187384.html Abasolo, indultado de tres delitos de agresión sexual (Abasolo, pardoned on three sexual assault charges)] {{Webarchive|url=https://web.archive.org/web/20160602031215/http://www.lne.es/deportes/2012/01/21/abasolo-indultado-tres-delitos-agresion-sexual/1187384.html|date=2 June 2016}}; [[La Nueva España]], 21 January 2012 (in Spanish)[http://www.argia.com/argia-astekaria/2317/abasolo-auzia Abasolo auzia: Indultuak zabaldutako zauriak (The Abasolo case: the wounds opened by the pardon)] {{Webarchive|url=https://web.archive.org/web/20130514161758/http://www.argia.com/argia-astekaria/2317/abasolo-auzia|date=14 May 2013}}; [[Argia (magazine)|Argia]], 1 April 2012 (in Basque)\n\n==References==\n{{Reflist}}\n\n==External links==\n*{{BDFutbol|5033}}\n*{{Futbolme|37}}\n*{{Athletic Bilbao profile|id=461/abasolo}}\n*{{LaPreferente|37445}}\n*{{Soccerway|paul-abasolo-amantegi/61737}}\n\n{{DEFAULTSORT:Abasolo, Paul}}\n[[Category:1984 births]]\n[[Category:Living people]]\n[[Category:Footballers from Durango, Biscay]]\n[[Category:Spanish men's footballers]]\n[[Category:Men's association football forwards]]\n[[Category:Segunda División players]]\n[[Category:Segunda División B players]]\n[[Category:Tercera División players]]\n[[Category:Divisiones Regionales de Fútbol players]]\n[[Category:CD Basconia footballers]]\n[[Category:Athletic Bilbao footballers]]\n[[Category:Barakaldo CF footballers]]\n[[Category:SD Eibar footballers]]\n[[Category:SD Lemona footballers]]\n[[Category:Logroñés CF footballers]]\n[[Category:Real Unión footballers]]\n[[Category:Real Oviedo players]]\n[[Category:Sestao River Club footballers]]\n[[Category:Amurrio Club footballers]]\n[[Category:Zamudio SD players]]\n[[Category:Club Portugalete players]]\n[[Category:People convicted of sexual assault]]\n[[Category:21st-century Spanish sportsmen]]", + "arabic_title": "بول أباسولو", + "translated_fields": { + "أندية_الشباب": { + "value": [ + "Lauaxeta Ikastola", + "[[Athletic Bilbao]]" + ], + "type": "numbered", + "item_type": "raw", + "count": 2, + "original_keys": [ + "youthclubs1", + "youthclubs2" + ], + "translated_value": [ + "لاوكسيتا إيكاستولا", + "[[Athletic Bilbao|أتلتيك بلباو]]" + ], + "translation_confidence": 0.9 + }, + "سنوات_الشباب": { + "value": [ + "1995–1996", + "1996–2002" + ], + "type": "numbered", + "item_type": "raw", + "count": 2, + "original_keys": [ + "youthyears1", + "youthyears2" + ], + "translated_value": [ + "1995–1996", + "1996–2002" + ], + "translation_confidence": 0.9 + }, + "أندية": { + "value": [ + "[[CD Basconia|Basconia]]", + "[[Barakaldo CF|Barakaldo]]", + "[[SD Eibar|Eibar]]", + "→ [[SD Lemona|Lemona]] (loan)", + "→ [[Logroñés CF|Logroñés]] (loan)", + "[[Logroñés CF|Logroñés]]", + "[[Real Unión]]", + "Iurretako", + "[[SD Lemona|Lemona]]", + "[[Real Oviedo|Oviedo]]", + "[[Sestao River Club|Sestao]]", + "[[Amurrio Club|Amurrio]]", + "[[Zamudio SD|Zamudio]]", + "[[Club Portugalete|Portugalete]]", + "Batea" + ], + "type": "numbered", + "item_type": "raw", + "count": 15, + "original_keys": [ + "clubs1", + "clubs2", + "clubs3", + "clubs4", + "clubs5", + "clubs6", + "clubs7", + "clubs8", + "clubs9", + "clubs10", + "clubs11", + "clubs12", + "clubs13", + "clubs14", + "clubs15" + ], + "translated_value": [ + "[[CD Basconia|باسكونيا]]", + "[[Barakaldo 
CF|باراكالدو]]", + "[[SD Eibar|إيبار]]", + "→ [[SD Lemona|ليمونا]] (إعارة)", + "→ [[Logroñés CF|لوغروينيس]] (إعارة)", + "[[Logroñés CF|لوغروينيس]]", + "[[Real Unión|ريال يونيون]]", + "إيوريتاكو", + "[[SD Lemona|ليمونا]]", + "[[Real Oviedo|أوفييدو]]", + "[[Sestao River Club|سستاو]]", + "[[Amurrio Club|أموريو]]", + "[[Zamudio SD|زاموديو]]", + "[[Club Portugalete|بورتوغاليتي]]", + "باتيا" + ], + "translation_confidence": 0.9 + }, + "سنوات": { + "value": [ + "2002–2003", + "2003–2004", + "2004–2006", + "2005", + "2005–2006", + "2006–2007", + "2007–2010", + "2010", + "2011", + "2011–2012", + "2012–2013", + "2014", + "2015–2016", + "2016–2017", + "2018–2021" + ], + "type": "numbered", + "item_type": "raw", + "count": 15, + "original_keys": [ + "years1", + "years2", + "years3", + "years4", + "years5", + "years6", + "years7", + "years8", + "years9", + "years10", + "years11", + "years12", + "years13", + "years14", + "years15" + ], + "translated_value": [ + "2002–2003", + "2003–2004", + "2004–2006", + "2005", + "2005–2006", + "2006–2007", + "2007–2010", + "2010", + "2011", + "2011–2012", + "2012–2013", + "2014", + "2015–2016", + "2016–2017", + "2018–2021" + ], + "translation_confidence": 0.9 + }, + "مباريات": { + "value": [ + "35", + "24", + "2", + "16", + "24", + "29", + "82", + "11", + "21", + "26", + "13", + "45", + "12", + "41" + ], + "type": "numbered", + "item_type": "number", + "count": 14, + "original_keys": [ + "caps1", + "caps2", + "caps3", + "caps4", + "caps5", + "caps6", + "caps7", + "caps9", + "caps10", + "caps11", + "caps12", + "caps13", + "caps14", + "caps15" + ], + "translated_value": [ + "35", + "24", + "2", + "16", + "24", + "29", + "82", + "11", + "21", + "26", + "13", + "45", + "12", + "41" + ], + "translation_confidence": 0.9 + }, + "أهداف": { + "value": [ + "5", + "1", + "0", + "4", + "2", + "8", + "12", + "1", + "2", + "0", + "5", + "17", + "8", + "10" + ], + "type": "numbered", + "item_type": "number", + "count": 14, + "original_keys": [ + "goals1", + "goals2", + "goals3", + "goals4", + "goals5", + "goals6", + "goals7", + "goals9", + "goals10", + "goals11", + "goals12", + "goals13", + "goals14", + "goals15" + ], + "translated_value": [ + "5", + "1", + "0", + "4", + "2", + "8", + "12", + "1", + "2", + "0", + "5", + "17", + "8", + "10" + ], + "translation_confidence": 0.9 + }, + "منتخب_وطني": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, + "سنوات_وطنية": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, + "مباريات_وطنية": { + "value": [], + "type": "numbered", + "item_type": "number", + "count": 0, + "original_keys": [] + }, + "أهداف_وطنية": { + "value": [], + "type": "numbered", + "item_type": "number", + "count": 0, + "original_keys": [] + }, + "أندية_مدرب": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, + "سنوات_مدرب": { + "value": [], + "type": "numbered", + "item_type": "raw", + "count": 0, + "original_keys": [] + }, + "اسم": { + "value": "Paul Abasolo", + "type": "text", + "original_key": "name", + "validation": { + "is_valid": true, + "length": 12, + "has_special_chars": false + }, + "translated_value": "بول أباسولو", + "translation_confidence": 0.9 + }, + "الاسم الكامل": { + "value": "Paul Abasolo Amantegi", + "type": "text", + "original_key": "fullname", + "validation": { + "is_valid": true, + "length": 21, + "has_special_chars": false + }, + "translated_value": "بول أباسولو أمانتيغي", + 
"translation_confidence": 0.9 + }, + "تاريخ الولادة": { + "value": "{{birth date and age|1984|6|29|df=yes}}", + "type": "raw", + "original_key": "birth_date", + "validation": { + "is_valid": true + }, + "translated_value": "{{birth date and age|1984|6|29|df=yes}}", + "translation_confidence": 0.9 + }, + "مكان الولادة": { + "value": "[[Durango, Spain]]", + "type": "raw", + "original_key": "birth_place", + "validation": { + "is_valid": true + }, + "translated_value": "[[Durango, Spain|دورانجو، إسبانيا]]", + "translation_confidence": 0.9 + }, + "الطول": { + "value": 1.84, + "type": "number", + "original_key": "height", + "validation": { + "is_valid": true, + "numeric_value": 1.84, + "has_units": true + }, + "numeric_value": 1.84 + }, + "المركز": { + "value": "[[Forward (association football)|Forward]]", + "type": "raw", + "original_key": "position", + "validation": { + "is_valid": true + }, + "translated_value": "[[Forward (association football)|مهاجم]]", + "translation_confidence": 0.9 + }, + "مجموع_مباريات": { + "value": 381.0, + "type": "number", + "original_key": "totalcaps", + "validation": { + "is_valid": true, + "numeric_value": 381.0, + "has_units": true + }, + "numeric_value": 381.0 + }, + "إجمالي الأهداف": { + "value": 75.0, + "type": "number", + "original_key": "totalgoals", + "validation": { + "is_valid": true, + "numeric_value": 75.0, + "has_units": true + }, + "numeric_value": 75.0 + } + }, + "translation_metadata": { + "service": "Google Gemini AI", + "target_language": "ar", + "translation_method": "single_request", + "total_fields": 20, + "translated_fields": 11, + "success": true + }, + "translated_title": "بول أباسولو", + "arabic_template": "{{واو|صندوق معلومات سيرة كرة قدم\n|\n| أندية_الشباب1 = لاوكسيتا إيكاستولا\n| أندية_الشباب2 = [[Athletic Bilbao|أتلتيك بلباو]]\n| سنوات_الشباب1 = 1995–1996\n| سنوات_الشباب2 = 1996–2002\n| أندية1 = [[نادي باسكونيا|باسكونيا]]\n| أندية2 = [[نادي باراكالدو|باراكالدو]]\n| أندية3 = [[SD Eibar|إيبار]]\n| أندية4 = → [[SD Lemona|ليمونا]] (إعارة)\n| أندية5 = → [[Logroñés CF|لوغروينيس]] (إعارة)\n| أندية6 = [[Logroñés CF|لوغروينيس]]\n| أندية7 = [[Real Unión|ريال يونيون]]\n| أندية8 = إيوريتاكو\n| أندية9 = [[SD Lemona|ليمونا]]\n| أندية10 = [[Real Oviedo|أوفييدو]]\n| أندية11 = [[نادي سيستاو ريفر|سستاو]]\n| أندية12 = [[Amurrio Club|أموريو]]\n| أندية13 = [[Zamudio SD|زاموديو]]\n| أندية14 = [[Club Portugalete|بورتوغاليتي]]\n| أندية15 = باتيا\n| سنوات1 = 2002–2003\n| سنوات2 = 2003–2004\n| سنوات3 = 2004–2006\n| سنوات4 = 2005\n| سنوات5 = 2005–2006\n| سنوات6 = 2006–2007\n| سنوات7 = 2007–2010\n| سنوات8 = 2010\n| سنوات9 = 2011\n| سنوات10 = 2011–2012\n| سنوات11 = 2012–2013\n| سنوات12 = 2014\n| سنوات13 = 2015–2016\n| سنوات14 = 2016–2017\n| سنوات15 = 2018–2021\n| مباريات1 = 35\n| مباريات2 = 24\n| مباريات3 = 2\n| مباريات4 = 16\n| مباريات5 = 24\n| مباريات6 = 29\n| مباريات7 = 82\n| مباريات8 = 11\n| مباريات9 = 21\n| مباريات10 = 26\n| مباريات11 = 13\n| مباريات12 = 45\n| مباريات13 = 12\n| مباريات14 = 41\n| أهداف1 = 5\n| أهداف2 = 1\n| أهداف3 = 0\n| أهداف4 = 4\n| أهداف5 = 2\n| أهداف6 = 8\n| أهداف7 = 12\n| أهداف8 = 1\n| أهداف9 = 2\n| أهداف10 = 0\n| أهداف11 = 5\n| أهداف12 = 17\n| أهداف13 = 8\n| أهداف14 = 10\n| اسم = بول أباسولو\n| الاسم الكامل = بول أباسولو أمانتيغي\n| تاريخ الولادة = {{واو|birth date and age|1984|6|29|df=yes}}\n| مكان الولادة = [[دورانغو (بسكاي)|دورانجو، إسبانيا]]\n| الطول = 1.84\n| المركز = [[Forward (association football)|مهاجم]]\n| مجموع_مباريات = 381.0\n| إجمالي الأهداف = 75.0\n}}", + "construct_metadata": { + "template_type": "football_biography", + 
"field_count": 14, + "builder_name": "Arabic Football_Biography Builder", + "template_name": "صندوق معلومات سيرة كرة قدم" + }, + "localization_metadata": { + "links_replaced": 4, + "templates_localized": 2, + "waou_templates_inserted": 2, + "localization_errors": [] + }, + "publish_metadata": { + "page_title": "بول أباسولو", + "edit_summary": "تحديث قالب السيرة الذاتية باستخدام InfoboxSync - football_biography", + "revision_id": 71876884, + "publish_success": true, + "published_at": "2025-08-28T10:34:26Z" + } +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7348bc2b..dcf7c8df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ SQLAlchemy == 2.0.43 typing-extensions == 4.14.1 waybackpy~=3.0.6 wikitextparser~=0.56.4 +litellm~=1.40.0 diff --git a/tasks/InfoboxSync/README.md b/tasks/InfoboxSync/README.md new file mode 100644 index 00000000..30bc04a7 --- /dev/null +++ b/tasks/InfoboxSync/README.md @@ -0,0 +1,547 @@ +# InfoboxSync Pipeline + +A sophisticated Wikipedia infobox synchronization pipeline using advanced design patterns and pywikibot integration. + +## Overview + +This pipeline fetches Arabic Wikipedia pages, finds their corresponding English pages, extracts infobox data, and processes it through multiple stages for synchronization purposes. + +## Architecture & Design Patterns + +### 1. **Template Method Pattern** +- Used in `WikipediaFetcher` abstract base class +- Defines the skeleton of the page fetching algorithm +- Allows subclasses to customize specific steps + +### 2. **Observer Pattern** +- `FetchObserver` interface for monitoring fetch operations +- `LoggingFetchObserver` implementation for logging +- Allows multiple observers to monitor the same fetch operation + +### 3. **Strategy Pattern** +- `WikipediaSyncFetcher` uses different fetch strategies +- Separate fetchers for Arabic and English Wikipedia +- Easy to extend with new language-specific strategies + +### 4. **Factory Pattern** +- Creation of appropriate fetchers based on site/language +- Centralized fetcher creation in `WikipediaSyncFetcher` + +### 5. **Data Class Pattern** +- `PageInfo` dataclass for structured page information +- Clean data transfer between pipeline stages + +### 6. **Strategy Pattern (Parse Stage)** +- `InfoboxParser` abstract base class for different template parsers +- `FootballBiographyParser` for football biography templates +- `GenericInfoboxParser` for other template types +- `InfoboxParserFactory` creates appropriate parsers based on template type +- Allows pipeline to specify which template parser to use + +### 7. **Strategy Pattern (Map Stage)** +- `FieldMapper` abstract base class for different field type mappers +- `TextFieldMapper`, `NumberFieldMapper`, `ImageFieldMapper`, `LinkFieldMapper`, `MixedFieldMapper` implementations +- `NumberedFieldMapper` for handling numbered sequences (years1, clubs1, caps1, etc.) +- `TemplateMapper` abstract base class for template-specific field mapping +- `FootballBiographyMapper` with English to Arabic field mappings +- `TemplateMapperFactory` and `FieldMapperFactory` for creating appropriate mappers +- Supports field type validation and numbered field grouping + +### 8. 
**Strategy Pattern (Translate Stage)** +- `TranslationService` abstract base class for different translation services +- `GeminiTranslator` implementation using Google Gemini AI via LiteLLM +- `TranslationServiceFactory` for creating appropriate translation services +- Template-based prompt system with external prompt files +- Single-request translation for optimal efficiency +- Supports field-by-field and template-level translation strategies + +### 9. **Strategy Pattern (Construct Stage)** +- `TemplateBuilder` abstract base class for different template builders +- `ArabicTemplateBuilder` implementation for Arabic Wikipedia templates +- `TemplateBuilderFactory` for creating appropriate builders +- Smart field formatting for different data types +- Template validation and quality estimation +- Support for multiple Arabic Wikipedia template types + +## Pipeline Stages + +1. **Fetch**: Uses pywikibot to check Arabic page existence and find English equivalent +2. **Parse**: Extracts infobox data from Wikipedia wikitext using wikitextparser and Strategy Pattern +3. **Map**: Maps English field names to Arabic equivalents using Strategy Pattern with field type validation +4. **Translate**: Translates English infobox data to Arabic using Google Gemini AI with single-request optimization +5. **Construct**: Constructs Arabic Wikipedia templates from translated data using Strategy Pattern +6. **Publish**: Publishes the Arabic template directly to Arabic Wikipedia using pywikibot +7. **Save**: Saves processed data as JSON files + +## Usage + +### Basic Usage + +```python +from tasks.InfoboxSync.test import run_wikipedia_pipeline + +# Sync an Arabic Wikipedia page +result_path = run_wikipedia_pipeline("مصر") # Egypt in Arabic +print(f"Data saved to: {result_path}") +``` + +### Advanced Usage + +```python +from tasks.InfoboxSync.fetch.fetch import fetch_wikipedia_data + +# Direct access to fetch stage +wiki_data = fetch_wikipedia_data("محمد بن سلمان") +if wiki_data['sync_possible']: + arabic_page = wiki_data['arabic'] + english_page = wiki_data['english'] + print(f"Arabic title: {arabic_page.title}") + print(f"English title: {english_page.title}") +``` + +### Using Different Template Types + +```python +from tasks.InfoboxSync.parse.parse import parse_data + +# Parse football biography infobox +data = {'title': 'Player Name', 'content': wikitext_content} +football_data = parse_data(data, 'football_biography') + +# Parse person infobox +person_data = parse_data(data, 'person') + +# Parse custom template +custom_data = parse_data(data, 'infobox custom_template') +``` + +### Field Mapping with Different Types + +```python +from tasks.InfoboxSync.map.field_mappers import TextFieldMapper, NumberFieldMapper + +# Text field mapping +text_mapper = TextFieldMapper("name", "الاسم") +mapped = text_mapper.map_field("Lionel Messi") + +# Number field mapping +number_mapper = NumberFieldMapper("height", "الطول") +mapped = number_mapper.map_field("1.70 m") + +# Numbered field mapping (groups years1, years2, etc.) 
+from tasks.InfoboxSync.map.template_mapper import FootballBiographyMapper +mapper = FootballBiographyMapper() +mapped_data = mapper.map_infobox(infobox_data) +``` + +### Translation with AI + +```python +from tasks.InfoboxSync.translate.translate import translate_data + +# Translate mapped data to Arabic using Gemini AI +result = translate_data(mapped_data, target_lang='ar') + +if result['translation_metadata']['success']: + translated_fields = result['translated_fields'] + print(f"Translated {result['translation_metadata']['translated_fields']} fields") +else: + print(f"Translation failed: {result['translation_metadata']['error']}") +``` + +### Template Building + +```python +from tasks.InfoboxSync.construct.build import construct_arabic_template + +# Construct Arabic Wikipedia template from translated data +result = construct_arabic_template(translated_data, template_type='football_biography') + +if result.success: + arabic_template = result.template_text + print(f"Constructed template with {result.field_count} fields") + print(arabic_template) +else: + print(f"Construction failed: {result.errors}") +``` + +## Dependencies + +- `pywikibot`: For Wikipedia API interactions +- `wikitextparser`: For advanced wikitext parsing +- `litellm`: For Google Gemini AI integration +- Install with: `pip install pywikibot wikitextparser litellm` + +## Configuration + +Before using, configure pywikibot: +```bash +pywikibot generate_user_files +``` + +Or set up your user configuration file as needed for your Wikipedia bot account. + +For translation, set your Google AI API key: +```bash +export GEMINI_API_KEY="your-google-ai-api-key" +``` + +## Error Handling + +The pipeline includes comprehensive error handling for: +- Missing Arabic pages +- Missing corresponding English pages +- Network/API errors +- Parsing errors +- Field mapping errors +- Translation service errors +- Template construction errors +- File I/O errors + +## Data Flow + +1. **Input**: Arabic page title +2. **Arabic Check**: Verify page exists on ar.wikipedia.org +3. **English Lookup**: Find corresponding English page via langlinks +4. **Content Fetch**: Retrieve English page content +5. **Parse**: Extract infobox data using wikitextparser and Strategy Pattern +6. **Map**: Map English fields to Arabic using Strategy Pattern with field type validation +7. **Translate**: Translate English infobox data to Arabic using Google Gemini AI with single-request optimization +8. **Construct**: Construct Arabic Wikipedia template from translated data +9. **Publish**: Publish the Arabic template directly to Arabic Wikipedia using pywikibot +10.
**Save**: Store results as JSON + +## Extension Points + +### Adding New Languages +```python +class GermanFetcher(WikipediaFetcher): + def get_site_name(self) -> str: + return 'de' +``` + +### Custom Observers +```python +class MetricsObserver(FetchObserver): + def on_page_check_complete(self, page_info: PageInfo): + # Send metrics to monitoring system + pass +``` + +### Adding New Template Parsers +```python +from tasks.InfoboxSync.parse.base_parser import InfoboxParser + +class CustomTemplateParser(InfoboxParser): + def __init__(self): + super().__init__("infobox custom") + + def parse_infobox(self, wikitext: str) -> Dict[str, Any]: + # Custom parsing logic here + pass +``` + +### Adding New Field Mappers +```python +from tasks.InfoboxSync.map.field_mappers import FieldMapper + +class CustomFieldMapper(FieldMapper): + def __init__(self, english_key: str, arabic_key: str): + super().__init__(english_key, arabic_key, "custom") + + def map_field(self, value: str) -> Dict[str, Any]: + # Custom field mapping logic + return { + self.arabic_key: { + "value": value, + "type": "custom", + "validation": {"is_valid": True} + } + } +``` + +### Adding New Translation Services +```python +from tasks.InfoboxSync.translate.base_translator import TranslationService + +class CustomTranslator(TranslationService): + def __init__(self, api_key: str): + super().__init__('en', 'ar') + self.api_key = api_key + + def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]: + # Custom translation logic + pass + + def translate_text(self, text: str, **kwargs) -> TranslationResult: + # Custom text translation + pass + + def translate_field(self, field_name: str, field_value: Any, **kwargs) -> TranslationResult: + # Custom field translation + pass + + def is_available(self) -> bool: + # Check service availability + pass + + def get_service_name(self) -> str: + return "Custom Translator" + +# Register the service +from tasks.InfoboxSync.translate.base_translator import TranslationServiceFactory +TranslationServiceFactory.register_service("custom", CustomTranslator) +``` + +### Adding New Template Builders +```python +from tasks.InfoboxSync.construct.base_builder import TemplateBuilder + +class CustomTemplateBuilder(TemplateBuilder): + def __init__(self, template_type: str = 'custom'): + super().__init__(template_type) + + def build_template(self, translated_data: Dict[str, Any], **kwargs) -> BuildResult: + # Custom template building logic + pass + + def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + # Custom field formatting + pass + + def get_template_name(self) -> str: + return 'صندوق مخصص' + + def is_available(self) -> bool: + return True + + def get_builder_name(self) -> str: + return "Custom Template Builder" + +# Register the builder +from tasks.InfoboxSync.construct.base_builder import TemplateBuilderFactory +TemplateBuilderFactory.register_builder("custom_builder", CustomTemplateBuilder) +``` + +### Enhanced Parsing +The parse stage uses `wikitextparser` for more accurate infobox extraction compared to regex-based approaches. 
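+ +To make this concrete, here is a minimal, hypothetical sketch of the kind of extraction `wikitextparser` enables (the helper name `extract_infobox` and the template prefix are illustrative only, not part of the pipeline's API): + +```python +import wikitextparser as wtp + +def extract_infobox(wikitext: str, prefix: str = "infobox football biography") -> dict: + """Return the first matching template's parameters as a plain dict.""" + parsed = wtp.parse(wikitext) + for template in parsed.templates: + # Template names can vary in case and carry stray whitespace + if template.name.strip().lower().startswith(prefix): + return {arg.name.strip(): arg.value.strip() for arg in template.arguments} + return {} + +fields = extract_infobox("{{Infobox football biography\n| name = Paul Abasolo\n}}") +print(fields.get("name")) # -> Paul Abasolo +``` + +Because `wikitextparser` builds a real parse tree, nested templates such as `{{height|m=1.84}}` inside a parameter value survive intact, which is where regex-based extraction typically breaks.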
+ +## File Structure + +``` +tasks/InfoboxSync/ +├── README.md # This documentation +├── test.py # Main pipeline orchestrator +├── demo_real_wikipedia.py # Demo with real Wikipedia data +├── fetch/ +│ ├── __init__.py +│ └── fetch.py # Fetch stage with design patterns +├── parse/ +│ ├── __init__.py +│ ├── base_parser.py # Abstract parser base class +│ ├── football_parser.py # Football biography parser +│ ├── parser_factory.py # Factory for creating parsers +│ └── parse.py # Parse stage using Strategy Pattern +├── map/ +│ ├── __init__.py +│ ├── field_mappers.py # Field type strategy mappers +│ ├── template_mapper.py # Template field mapping coordinators +│ └── map.py # Map stage using Strategy Pattern +├── translate/ +│ ├── __init__.py +│ ├── base_translator.py # Abstract translation service base class +│ ├── gemini_translator.py # Google Gemini AI implementation +│ ├── config.py # Translation configuration management +│ ├── prompt_template.txt # External prompt template file +│ ├── translate.py # Main translation interface +│ └── README.md # Translation stage documentation +├── construct/ +│ ├── __init__.py +│ ├── base_builder.py # Abstract template builder base class +│ ├── arabic_builder.py # Arabic Wikipedia template builder +│ ├── build.py # Main construct stage interface +│ └── README.md # Construct stage documentation +├── publish/ +│ ├── __init__.py +│ └── publish.py # Publish stage for Wikipedia publishing +└── save/ + ├── __init__.py + └── save.py # Save stage for data persistence +``` + +## Logging + +The pipeline uses Python's logging module with configurable levels. All stages include detailed logging for debugging and monitoring. + +## Future Enhancements + +- Support for additional translation services (OpenAI, DeepL, Microsoft Translator) +- Support for additional Wikipedia languages +- Database storage instead of JSON files +- Web interface for pipeline management +- Batch processing capabilities +- Additional template parser implementations +- Enhanced field type detection and validation +- Translation quality scoring and confidence metrics +- Additional Arabic Wikipedia template builders +- Template validation against Arabic Wikipedia standards +- Integration with Arabic Wikipedia bot frameworks + +## Translation Features + +### Single-Request Optimization +- Translates ALL fields in ONE API call instead of multiple requests +- Significant cost savings and performance improvement +- Maintains field relationships and context + +### Template-Based Prompts +- Prompt text stored in external `prompt_template.txt` file +- Easy customization without touching Python code +- Placeholder replacement system (`{{FIELDS_TEXT}}`, `{{START_INDEX}}`) + +### Smart Field Handling +- **Text Fields**: Naturally translated (names, descriptions) +- **Number Fields**: Preserved as-is (heights, statistics) +- **Link Fields**: Maintained with proper formatting +- **Numbered Fields**: Translated individually while maintaining sequence + +### AI Integration +- Google Gemini AI via LiteLLM for high-quality translations +- Configurable models and parameters +- Environment variable configuration for API keys + +## Construct Stage Features + +### Arabic Template Construction +- Builds properly formatted Arabic Wikipedia templates +- Handles different field types (text, numbers, links, images, numbered arrays) +- Supports multiple template types with proper Arabic names +- Proper Arabic Wikipedia syntax and formatting + +### Smart Field Formatting +- **Text Fields**: Properly escaped for wiki syntax +- 
**Number Fields**: Preserved with units and formatting +- **Link Fields**: Correct wiki link syntax +- **Image Fields**: Proper Arabic image syntax +- **Numbered Fields**: Expanded into sequential fields (الأندية1, الأندية2, etc.) + +### Template Types Supported +- `football_biography` → `سيرة لاعب كرة قدم` +- `person` → `صندوق شخص` +- `biography` → `سيرة شخصية` +- `football_club` → `صندوق نادي كرة قدم` +- `country` → `صندوق دولة` +- And many more... + +## Publish Stage Features + +### Direct Wikipedia Publishing +- Publishes Arabic templates directly to Arabic Wikipedia using pywikibot +- Automated edit summaries in Arabic for transparency +- Revision tracking and metadata collection +- Comprehensive error handling for publish failures + +### Template Validation +- Validates template content before publishing +- Checks for required fields and proper formatting +- Ensures compatibility with Arabic Wikipedia standards +- Prevents publishing of malformed templates + +### Publishing Results +After publishing, detailed results are provided: +```json +{ + "success": true, + "page_title": "بول أباسولو", + "edit_summary": "تحديث قالب السيرة الذاتية باستخدام InfoboxSync - football_biography", + "revision_id": 12345678, + "metadata": { + "template_length": 450, + "site": "ar.wikipedia.org", + "published_at": "2024-01-15T10:30:00Z" + }, + "errors": [] +} +``` + +### Safety Features +- Verifies page existence before publishing +- Requires proper pywikibot configuration +- Includes edit summaries for accountability +- Supports dry-run mode for testing (future enhancement) + +## Field Mapping Examples + +### Numbered Fields (Most Common in Football) +Wikipedia often uses numbered fields for career history: +``` +years1 = 2002–2003 | clubs1 = Basconia | caps1 = 35 | goals1 = 5 +years2 = 2003–2004 | clubs2 = Barakaldo | caps2 = 24 | goals2 = 1 +``` + +Mapped to Arabic arrays: +```json +{ + "سنوات_اللعب": { + "value": ["2002–2003", "2003–2004", ...], + "type": "numbered", + "count": 15 + }, + "الأندية": { + "value": ["Basconia", "Barakaldo", ...], + "type": "numbered", + "count": 15 + } +} +``` + +### Field Type Validation +Each field type includes validation: +```json +{ + "الطول": { + "value": 1.70, + "type": "number", + "validation": { + "is_valid": true, + "numeric_value": 1.7, + "has_units": true + } + } +} +``` + +### Translation Results +After translation, fields include translated values: +```json +{ + "الاسم": { + "value": "Paul Abasolo", + "translated_value": "بول أباسولو", + "translation_confidence": 0.9, + "type": "text" + }, + "الأندية": { + "value": ["Club A", "Club B"], + "translated_value": ["النادي أ", "النادي ب"], + "translation_confidence": 0.9, + "type": "numbered" + } +} +``` + +### Construction Results +After construction, the template is ready for Arabic Wikipedia: +```json +{ + "template_text": "{{صندوق سيرة لاعب كرة قدم\n| الاسم = بول أباسولو\n| الطول = 1.84 م\n...}}", + "template_type": "football_biography", + "field_count": 8, + "success": true, + "metadata": { + "template_name": "سيرة لاعب كرة قدم", + "builder_name": "Arabic Football Biography Builder", + "total_input_fields": 10 + }, + "errors": [] +} +``` \ No newline at end of file diff --git a/tasks/InfoboxSync/construct/README.md b/tasks/InfoboxSync/construct/README.md new file mode 100644 index 00000000..c1841b9f --- /dev/null +++ b/tasks/InfoboxSync/construct/README.md @@ -0,0 +1,285 @@ +# Construct Stage - Arabic Wikipedia Template Construction + +This directory contains the construct stage implementation for
constructing Arabic Wikipedia templates from translated infobox data. + +## Overview + +The construct stage takes translated data from the translate stage and constructs properly formatted Arabic Wikipedia templates. It follows the Strategy Pattern to support different template types and formats. + +## Architecture + +### Core Components + +1. **`base_builder.py`** - Abstract base classes and factory pattern +2. **`arabic_builder.py`** - Arabic Wikipedia template builder implementation +3. **`build.py`** - Main construct stage interface and utilities + +### Design Patterns Used + +- **Strategy Pattern**: Different template builders for various Wikipedia template types +- **Factory Pattern**: Creation of appropriate builders via `TemplateBuilderFactory` +- **Template Method**: Consistent template construction workflow + +## Features + +### Template Construction +- Constructs properly formatted Arabic Wikipedia templates +- Handles different field types (text, numbers, links, images, numbered arrays) +- Supports multiple template types (football biography, person, country, etc.) +- Proper Arabic Wikipedia syntax and formatting + +### Smart Field Formatting +- **Text Fields**: Properly escaped for wiki syntax +- **Number Fields**: Preserved with units and formatting +- **Link Fields**: Correct wiki link syntax +- **Image Fields**: Proper Arabic image syntax +- **Numbered Fields**: Expanded into sequential fields (الأندية1, الأندية2, etc.) + +### Template Types Supported +- `football_biography` → `سيرة لاعب كرة قدم` +- `person` → `صندوق شخص` +- `biography` → `سيرة شخصية` +- `football_club` → `صندوق نادي كرة قدم` +- `country` → `صندوق دولة` +- And many more... + +## Usage + +### Basic Usage + +```python +from tasks.InfoboxSync.construct.build import construct_arabic_template + +# Your translated data from translate stage +translated_data = { + 'translated_fields': { + 'الاسم': {'value': 'Paul Abasolo', 'translated_value': 'بول أباسولو', 'type': 'text'}, + 'الطول': {'value': '1.84 m', 'translated_value': '1.84 م', 'type': 'number'}, + # ...
more translated fields + } +} + +# Construct Arabic template +result = construct_arabic_template(translated_data, template_type='football_biography') + +if result.success: + arabic_template = result.template_text + print(f"Constructed template with {result.field_count} fields") + print(arabic_template) +else: + print(f"Construction failed: {result.errors}") +``` + +### Advanced Usage + +```python +from tasks.InfoboxSync.construct.build import construct_template, TemplateBuilderFactory + +# Create specific builder +builder = TemplateBuilderFactory.create_builder('arabic', template_type='person') + +# Construct template +result = builder.construct_template(translated_data) + +# Access build metadata +print(f"Template type: {result.template_type}") +print(f"Fields included: {result.field_count}") +print(f"Builder used: {result.metadata['builder_name']}") +``` + +### Template Validation + +```python +from tasks.InfoboxSync.construct.build import validate_arabic_template, estimate_template_quality + +# Validate template +validation = validate_arabic_template(template_text) +print(f"Valid: {validation['valid']}") +print(f"Errors: {validation['errors']}") + +# Estimate quality +quality = estimate_template_quality(template_text) +print(f"Quality score: {quality['quality_score']}/100") +``` + +## Data Flow + +### Input Data Structure +```python +{ + 'translated_fields': { + 'arabic_field_name': { + 'value': 'original_value', + 'translated_value': 'arabic_translation', + 'type': 'text|number|link|image|numbered', + 'translation_confidence': 0.9 + } + }, + 'translation_metadata': {...}, + 'page_title': 'English Title', + 'arabic_title': 'Arabic Title' +} +``` + +### Output Data Structure +```python +{ + 'template_text': '{{صندوق سيرة لاعب كرة قدم\n| الاسم = بول أباسولو\n| الطول = 1.84 م\n...}}', + 'template_type': 'football_biography', + 'field_count': 8, + 'success': True, + 'metadata': { + 'template_name': 'سيرة لاعب كرة قدم', + 'builder_name': 'Arabic Football Biography Builder', + 'total_input_fields': 10 + }, + 'errors': [] +} +``` + +## Template Construction Process + +1. **Extract Translated Fields** - Get translated data from translate stage +2. **Select Template Type** - Choose appropriate Arabic template name +3. **Format Each Field** - Apply proper Arabic Wikipedia syntax +4. **Handle Field Types** - Special formatting for numbers, links, arrays +5. **Construct Template** - Construct complete template with all fields +6. 
**Validate Output** - Check for syntax errors and formatting issues + +## Field Type Handling + +### Text Fields +``` +Input: {'value': 'Paul Abasolo', 'translated_value': 'بول أباسولو'} +Output: | الاسم = بول أباسولو +``` + +### Number Fields +``` +Input: {'value': '1.84 m', 'translated_value': '1.84 م'} +Output: | الطول = 1.84 م +``` + +### Numbered Fields (Arrays) +``` +Input: {'value': ['Club A', 'Club B'], 'translated_value': ['النادي أ', 'النادي ب']} +Output: +| الأندية1 = النادي أ +| الأندية2 = النادي ب +``` + +### Link Fields +``` +Input: {'value': 'Argentina', 'translated_value': 'الأرجنتين'} +Output: | الجنسية = [[الأرجنتين]] +``` + +## Extending the Build Stage + +### Adding New Template Types + +```python +from tasks.InfoboxSync.construct.arabic_builder import ArabicTemplateBuilder + +class CustomArabicBuilder(ArabicTemplateBuilder): + def __init__(self): + super().__init__('custom_type') + + def get_template_name(self) -> str: + return 'صندوق مخصص' + + def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + # Custom formatting logic + return f"| {arabic_key} = {field_data['translated_value']}" + +# Register the builder +from tasks.InfoboxSync.construct.base_builder import TemplateBuilderFactory +TemplateBuilderFactory.register_builder("custom_arabic", CustomArabicBuilder) +``` + +### Custom Field Formatters + +```python +class AdvancedArabicBuilder(ArabicTemplateBuilder): + def _format_custom_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + # Advanced custom formatting + value = field_data.get('translated_value', '') + return f"| {arabic_key} = '''{value}'''" + + def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + field_type = field_data.get('type', 'text') + + if field_type == 'custom': + return self._format_custom_field(arabic_key, field_data) + else: + return super().format_field(arabic_key, field_data) +``` + +## Integration with Pipeline + +The construct stage seamlessly integrates with the InfoboxSync pipeline: + +1. **Receives** translated data from translate stage +2. **Constructs** Arabic Wikipedia template +3. **Passes** template text to save stage +4. **Provides** metadata for logging and debugging + +## Quality Assurance + +### Template Validation +- Syntax checking for Arabic Wikipedia format +- Field count verification +- Error and warning reporting + +### Quality Estimation +- Quality scoring algorithm (0-100) +- Issue detection (escaped characters, formatting problems) +- Template complexity analysis + +## Performance Considerations + +- **Efficient Processing**: Single-pass field formatting +- **Memory Optimized**: Streaming template construction +- **Error Resilient**: Continues processing despite individual field errors + +## Troubleshooting + +### Common Issues + +1. **Empty Template Output** + - Check if translated_fields contains valid data + - Verify field types are supported + - Check for translation stage errors + +2. **Malformed Template Syntax** + - Ensure proper Arabic Wikipedia template names + - Check for special character escaping + - Validate field formatting + +3. 
**Unsupported Template Type** + - Add new template type mapping in `get_template_name()` + - Extend field formatters if needed + - Register new builder class + +### Debug Information + +Enable detailed logging: +```python +import logging +logging.basicConfig(level=logging.DEBUG) +``` + +The construct stage provides comprehensive logging for: +- Template construction process +- Field formatting details +- Error conditions and recovery +- Performance metrics + +## Future Enhancements + +- Support for additional Arabic template types +- Advanced template customization options +- Integration with Arabic Wikipedia bot frameworks +- Template quality improvement suggestions +- Multi-language template support +- Template validation against Arabic Wikipedia standards \ No newline at end of file diff --git a/tasks/InfoboxSync/construct/__init__.py b/tasks/InfoboxSync/construct/__init__.py new file mode 100644 index 00000000..3f7e6b41 --- /dev/null +++ b/tasks/InfoboxSync/construct/__init__.py @@ -0,0 +1,19 @@ +# Construct stage package + +# Import base classes +from .base_builder import TemplateBuilder, TemplateBuilderFactory, BuildResult + +# Import concrete builders +from . import arabic_builder + +# Import main construct function +from .build import construct_template, get_available_builders, test_builder + +__all__ = [ + 'TemplateBuilder', + 'TemplateBuilderFactory', + 'BuildResult', + 'construct_template', + 'get_available_builders', + 'test_builder' +] \ No newline at end of file diff --git a/tasks/InfoboxSync/construct/arabic_builder.py b/tasks/InfoboxSync/construct/arabic_builder.py new file mode 100644 index 00000000..bc596abf --- /dev/null +++ b/tasks/InfoboxSync/construct/arabic_builder.py @@ -0,0 +1,258 @@ +""" +Arabic Wikipedia template builder implementation. +""" + +import logging +from typing import Dict, Any, List, Optional +from .base_builder import TemplateBuilder, TemplateBuilderFactory, BuildResult + +logger = logging.getLogger(__name__) + + +class ArabicTemplateBuilder(TemplateBuilder): + """Builder for Arabic Wikipedia templates using translated data.""" + + def __init__(self, template_type: str = 'football_biography'): + """ + Initialize Arabic template builder. + + Args: + template_type (str): Type of template to build + """ + super().__init__(template_type) + self.field_formatters = { + 'text': self._format_text_field, + 'number': self._format_number_field, + 'link': self._format_link_field, + 'image': self._format_image_field, + 'numbered': self._format_numbered_field, + 'mixed': self._format_mixed_field + } + + def construct_template(self, translated_data: Dict[str, Any], **kwargs) -> BuildResult: + """ + Build an Arabic Wikipedia template from translated data. 
+ + Args: + translated_data (Dict[str, Any]): Data from translate stage with translated_fields + **kwargs: Additional parameters + + Returns: + BuildResult: Template building result + """ + try: + logger.info(f"Building Arabic template for type: {self.template_type}") + + # Extract translated fields + translated_fields = translated_data.get('translated_fields', {}) + if not translated_fields: + return BuildResult( + template_text="", + template_type=self.template_type, + field_count=0, + success=False, + metadata={}, + errors=["No translated fields found"] + ) + + # Build template structure + template_lines = [] + template_lines.append(f"{{{{{self.get_template_name()}") + template_lines.append("|") # First pipe after template name + + # Process each translated field + field_count = 0 + errors = [] + + for arabic_key, field_data in translated_fields.items(): + try: + # Get the translated value + if 'translated_value' in field_data: + value = field_data['translated_value'] + else: + value = field_data.get('value', '') + + # Format the field + formatted_field = self.format_field(arabic_key, { + 'value': value, + 'type': field_data.get('type', 'text'), + 'original_type': field_data.get('type', 'text') + }) + + if formatted_field: + # Handle different field types + if field_data.get('type') == 'numbered' and isinstance(formatted_field, list): + # Numbered fields return a list of lines + template_lines.extend(formatted_field) + field_count += 1 + elif isinstance(formatted_field, str) and formatted_field.strip(): + template_lines.append(formatted_field) + field_count += 1 + + except Exception as e: + error_msg = f"Failed to format field {arabic_key}: {e}" + logger.warning(error_msg) + errors.append(error_msg) + continue + + # Close template + template_lines.append("}}") + + # Join all lines with actual newlines - creates proper line breaks + template_text = "\n".join(template_lines) + + logger.info(f"Successfully built Arabic template with {field_count} fields") + + return BuildResult( + template_text=template_text, + template_type=self.template_type, + field_count=field_count, + success=True, + metadata={ + 'template_name': self.get_template_name(), + 'builder_name': self.get_builder_name(), + 'total_input_fields': len(translated_fields) + }, + errors=errors + ) + + except Exception as e: + logger.error(f"Template building failed: {e}") + return BuildResult( + template_text="", + template_type=self.template_type, + field_count=0, + success=False, + metadata={}, + errors=[str(e)] + ) + + def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + """ + Format a single field for the Arabic template. 
+ + Args: + arabic_key (str): Arabic field name + field_data (Dict[str, Any]): Field data with value and type + + Returns: + str: Formatted field string + """ + field_type = field_data.get('type', 'text') + + # Get the appropriate formatter + formatter = self.field_formatters.get(field_type, self._format_text_field) + + try: + return formatter(arabic_key, field_data) + except Exception as e: + logger.warning(f"Failed to format field {arabic_key} of type {field_type}: {e}") + # Fallback to text formatting + return self._format_text_field(arabic_key, field_data) + + def _format_text_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + """Format a text field.""" + value = field_data.get('value', '') + if not value: + return "" + + # Escape pipes and other wiki syntax + # escaped_value = str(value).replace('|', '{{!}}').replace('=', '{{=}}') + escaped_value = str(value) + + return f"| {arabic_key} = {escaped_value}" + + def _format_number_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + """Format a number field.""" + value = field_data.get('value', '') + if not value: + return "" + + # Keep numbers as-is, just ensure proper formatting + return f"| {arabic_key} = {value}" + + def _format_link_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + """Format a link field.""" + value = field_data.get('value', '') + if not value: + return "" + + # Ensure proper wiki link format + if '|' in str(value): + # Already has link text + return f"| {arabic_key} = {value}" + else: + # Simple link + return f"| {arabic_key} = [[{value}]]" + + def _format_image_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + """Format an image field.""" + value = field_data.get('value', '') + if not value: + return "" + + # Ensure proper image format + if value.startswith('[[File:') or value.startswith('[[ملف:'): + return f"| {arabic_key} = {value}" + else: + return f"| {arabic_key} = [[ملف:{value}]]" + + def _format_numbered_field(self, arabic_key: str, field_data: Dict[str, Any]) -> List[str]: + """Format a numbered field (array of values).""" + value = field_data.get('value', []) + if not value or not isinstance(value, list): + return [] + + # Return a list of formatted lines for each numbered field + formatted_lines = [] + + for i, item_value in enumerate(value, 1): + if item_value: # Only include non-empty values + field_name = f"{arabic_key}{i}" + # escaped_value = str(item_value).replace('|', '{{!}}').replace('=', '{{=}}') + escaped_value = str(item_value) + formatted_lines.append(f"| {field_name} = {escaped_value}") + + return formatted_lines + + def _format_mixed_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + """Format a mixed field (contains both text and links).""" + value = field_data.get('value', '') + if not value: + return "" + + # Mixed fields usually contain wiki markup, keep as-is + return f"| {arabic_key} = {value}" + + def get_template_name(self) -> str: + """Get the Arabic Wikipedia template name.""" + template_names = { + 'football_biography': 'صندوق معلومات سيرة كرة قدم', + 'person': 'صندوق شخص', + 'biography': 'سيرة شخصية', + 'football_club': 'صندوق نادي كرة قدم', + 'country': 'صندوق دولة', + 'city': 'صندوق مدينة', + 'university': 'صندوق جامعة', + 'company': 'صندوق شركة', + 'film': 'صندوق فيلم', + 'book': 'صندوق كتاب', + 'album': 'صندوق ألبوم', + 'tv_series': 'صندوق مسلسل تلفزيوني' + } + + return template_names.get(self.template_type, 'صندوق عام') + + def is_available(self) -> bool: + """Check if Arabic template builder is 
available.""" + # Always available since it doesn't require external services + return True + + def get_builder_name(self) -> str: + """Get the name of this builder.""" + return f"Arabic {self.template_type.title()} Builder" + + +# Register the Arabic builder +TemplateBuilderFactory.register_builder("arabic", ArabicTemplateBuilder) +TemplateBuilderFactory.register_builder("arabic_football", ArabicTemplateBuilder) \ No newline at end of file diff --git a/tasks/InfoboxSync/construct/base_builder.py b/tasks/InfoboxSync/construct/base_builder.py new file mode 100644 index 00000000..6e4f244b --- /dev/null +++ b/tasks/InfoboxSync/construct/base_builder.py @@ -0,0 +1,135 @@ +""" +Base template builder classes following Strategy Pattern. +""" + +from abc import ABC, abstractmethod +from typing import Dict, Any, List, Optional +from dataclasses import dataclass +import logging + +logger = logging.getLogger(__name__) + + +@dataclass +class BuildResult: + """Result of a template building operation.""" + template_text: str + template_type: str + field_count: int + success: bool + metadata: Dict[str, Any] + errors: List[str] = None + + def __post_init__(self): + if self.errors is None: + self.errors = [] + + +class TemplateBuilder(ABC): + """Abstract base class for template builders.""" + + def __init__(self, template_type: str = 'generic'): + self.template_type = template_type + + @abstractmethod + def construct_template(self, translated_data: Dict[str, Any], **kwargs) -> BuildResult: + """ + Build a Wikipedia template from translated data. + + Args: + translated_data (Dict[str, Any]): Translated data with Arabic field names + **kwargs: Additional parameters for building + + Returns: + BuildResult: Template building result + """ + pass + + @abstractmethod + def format_field(self, arabic_key: str, field_data: Dict[str, Any]) -> str: + """ + Format a single field for the template. + + Args: + arabic_key (str): Arabic field name + field_data (Dict[str, Any]): Field data with value and type + + Returns: + str: Formatted field string + """ + pass + + @abstractmethod + def get_template_name(self) -> str: + """ + Get the Wikipedia template name for this builder. + + Returns: + str: Template name (e.g., 'infobox football biography') + """ + pass + + @abstractmethod + def is_available(self) -> bool: + """Check if this builder is available and properly configured.""" + pass + + @abstractmethod + def get_builder_name(self) -> str: + """Get the name of this builder.""" + pass + + +class TemplateBuilderFactory: + """Factory for creating template builders.""" + + _builders = {} + + @classmethod + def register_builder(cls, builder_name: str, builder_class): + """Register a new template builder.""" + cls._builders[builder_name] = builder_class + + @classmethod + def create_builder(cls, builder_name: str, **kwargs) -> TemplateBuilder: + """ + Create a template builder instance. + + Args: + builder_name (str): Name of the builder to create + **kwargs: Parameters for builder initialization + + Returns: + TemplateBuilder: Builder instance + + Raises: + ValueError: If builder is not registered or creation fails + """ + if builder_name not in cls._builders: + available_builders = list(cls._builders.keys()) + raise ValueError(f"Unknown template builder: {builder_name}. 
" + f"Available builders: {available_builders}") + + builder_class = cls._builders[builder_name] + try: + return builder_class(**kwargs) + except Exception as e: + raise ValueError(f"Failed to create {builder_name} builder: {e}") + + @classmethod + def get_available_builders(cls) -> List[str]: + """Get list of available template builders.""" + return list(cls._builders.keys()) + + @classmethod + def get_supported_template_types(cls) -> List[str]: + """Get list of supported template types across all builders.""" + template_types = [] + for builder_class in cls._builders.values(): + try: + # Create a temporary instance to get template name + temp_builder = builder_class() + template_types.append(temp_builder.get_template_name()) + except Exception: + continue + return template_types \ No newline at end of file diff --git a/tasks/InfoboxSync/construct/build.py b/tasks/InfoboxSync/construct/build.py new file mode 100644 index 00000000..232e46e1 --- /dev/null +++ b/tasks/InfoboxSync/construct/build.py @@ -0,0 +1,251 @@ +""" +Build stage for Arabic Wikipedia template construction. +""" + +import logging +from typing import Dict, Any, Optional +from .base_builder import TemplateBuilderFactory, BuildResult + +logger = logging.getLogger(__name__) + + +def construct_template(translated_data: dict, builder_name: str = 'arabic', + template_type: str = 'football_biography') -> BuildResult: + """ + Build an Arabic Wikipedia template from translated data. + + Args: + translated_data (dict): Data from translate stage with translated_fields + builder_name (str): Name of the builder to use ('arabic', 'arabic_football', etc.) + template_type (str): Type of template to build + + Returns: + BuildResult: Template building result with Arabic template text + """ + logger.info(f"Starting template build with builder: {builder_name}") + + try: + # Create the appropriate builder + builder = TemplateBuilderFactory.create_builder( + builder_name, + template_type=template_type + ) + + # Check if builder is available + if not builder.is_available(): + error_msg = f"Template builder {builder_name} is not available" + logger.error(error_msg) + return BuildResult( + template_text="", + template_type=template_type, + field_count=0, + success=False, + metadata={}, + errors=[error_msg] + ) + + # Build the template + result = builder.construct_template(translated_data) + + if result.success: + logger.info(f"Template build completed successfully: {result.field_count} fields") + else: + logger.error(f"Template build failed: {result.errors}") + + return result + + except Exception as e: + logger.error(f"Template building failed: {e}") + return BuildResult( + template_text="", + template_type=template_type, + field_count=0, + success=False, + metadata={}, + errors=[str(e)] + ) + + +def construct_arabic_template(translated_data: dict, template_type: str = 'football_biography') -> BuildResult: + """ + Convenience function to build Arabic templates. + + Args: + translated_data (dict): Translated data from translate stage + template_type (str): Template type to build + + Returns: + BuildResult: Arabic template building result + """ + return construct_template(translated_data, 'arabic', template_type) + + +def get_available_builders() -> list: + """ + Get list of available template builders. 
+
+    Returns:
+        list: List of available builder names
+    """
+    try:
+        return TemplateBuilderFactory.get_available_builders()
+    except Exception as e:
+        logger.error(f"Error getting available builders: {e}")
+        return []
+
+
+def get_supported_template_types() -> list:
+    """
+    Get list of supported template types.
+
+    Returns:
+        list: List of supported template type names
+    """
+    try:
+        return TemplateBuilderFactory.get_supported_template_types()
+    except Exception as e:
+        logger.error(f"Error getting supported template types: {e}")
+        return []
+
+
+def test_builder(builder_name: str = 'arabic') -> bool:
+    """
+    Test if a template builder is available and working.
+
+    Args:
+        builder_name (str): Name of the builder to test
+
+    Returns:
+        bool: True if builder is available and working
+    """
+    try:
+        builder = TemplateBuilderFactory.create_builder(builder_name)
+        return builder.is_available()
+    except Exception as e:
+        logger.error(f"Error testing builder {builder_name}: {e}")
+        return False
+
+
+def create_sample_arabic_template() -> str:
+    """
+    Create a sample Arabic Wikipedia template for testing.
+
+    Returns:
+        str: Sample Arabic template
+    """
+    return """{{صندوق سيرة لاعب كرة قدم
+| الاسم = بول أباسولو
+| الاسم الكامل = بول أباسولو أمانتيغي
+| تاريخ الميلاد = 29 يونيو 1984
+| مكان الميلاد = دورانغو، إسبانيا
+| الطول = 1.84 م
+| المركز = مهاجم
+| الأندية1 = نادي باسكونيا
+| سنوات اللاعب1 = 2002–2003
+| المباريات1 = 35
+| الأهداف1 = 5
+| الأندية2 = براكالدو
+| سنوات اللاعب2 = 2003–2004
+| المباريات2 = 24
+| الأهداف2 = 1
+}}"""
+
+
+def validate_arabic_template(template_text: str) -> Dict[str, Any]:
+    """
+    Validate an Arabic Wikipedia template.
+
+    Args:
+        template_text (str): Template text to validate
+
+    Returns:
+        dict: Validation results
+    """
+    errors = []
+    warnings = []
+
+    # Check basic structure
+    if not template_text.startswith('{{'):
+        errors.append("Template must start with '{{'")
+    if not template_text.endswith('}}'):
+        errors.append("Template must end with '}}'")
+
+    # Check for required fields (basic validation)
+    lines = template_text.split('\n')
+    field_count = 0
+
+    for line in lines:
+        line = line.strip()
+        if line.startswith('|') and '=' in line:
+            field_count += 1
+
+    if field_count == 0:
+        warnings.append("No fields found in template")
+
+    return {
+        'valid': len(errors) == 0,
+        'errors': errors,
+        'warnings': warnings,
+        'field_count': field_count,
+        'template_length': len(template_text)
+    }
+
+
+def format_template_for_display(template_text: str) -> str:
+    """
+    Format template text for better display in logs or UI.
+
+    Args:
+        template_text (str): Raw template text
+
+    Returns:
+        str: Formatted template text
+    """
+    # Add line numbers for readability
+    lines = template_text.split('\n')
+    formatted_lines = []
+
+    for i, line in enumerate(lines, 1):
+        if line.strip():
+            formatted_lines.append(f"{i:2d}: {line}")
+        else:
+            formatted_lines.append("")
+
+    return '\n'.join(formatted_lines)
+
+
+def estimate_template_quality(template_text: str) -> Dict[str, Any]:
+    """
+    Estimate the quality of a generated template.
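+
+    The score is a rough heuristic: min(100, 10 * pipe_count) minus a
+    10-point penalty per detected issue, floored at 0. A clean template
+    containing three '|' characters would therefore score 30.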
+ + Args: + template_text (str): Template text to analyze + + Returns: + dict: Quality metrics + """ + # Basic quality metrics + field_count = template_text.count('|') + escaped_chars = template_text.count('{{!}}') + template_text.count('{{=}}') + + # Check for common issues + issues = [] + if '{{!}}' in template_text: + issues.append("Contains escaped pipes") + if '{{=}}' in template_text: + issues.append("Contains escaped equals signs") + if '\n\n\n' in template_text: + issues.append("Multiple consecutive empty lines") + + # Calculate quality score (0-100) + base_score = min(100, field_count * 10) # 10 points per field, max 100 + penalty = len(issues) * 10 # 10 point penalty per issue + quality_score = max(0, base_score - penalty) + + return { + 'quality_score': quality_score, + 'field_count': field_count, + 'escaped_characters': escaped_chars, + 'issues': issues, + 'template_length': len(template_text) + } \ No newline at end of file diff --git a/tasks/InfoboxSync/fetch/__init__.py b/tasks/InfoboxSync/fetch/__init__.py new file mode 100644 index 00000000..48588eaf --- /dev/null +++ b/tasks/InfoboxSync/fetch/__init__.py @@ -0,0 +1,63 @@ +"""Fetch stage module for Wikipedia infobox synchronization.""" + +import logging +from typing import Dict, Any + +from .sync_fetcher import WikipediaSyncFetcher +from .models import PageInfo, SyncResult + +logger = logging.getLogger(__name__) + +# Main API functions +def fetch_wikipedia_data(ar_page_title: str) -> Dict[str, Any]: + """ + Main function to fetch Wikipedia data for sync operation. + + Args: + ar_page_title: Arabic page title to sync + + Returns: + Dictionary with Arabic and English page data + """ + fetcher = WikipediaSyncFetcher() + return fetcher.fetch_arabic_and_english_pages(ar_page_title) + + +def fetch_sync_result(ar_page_title: str) -> SyncResult: + """ + Fetch synchronization result with structured return type. + + Args: + ar_page_title: Title of the Arabic Wikipedia page + + Returns: + SyncResult object with structured data + """ + fetcher = WikipediaSyncFetcher() + return fetcher.fetch_sync_result(ar_page_title) + + +# Legacy function for backward compatibility +def fetch_data(url: str) -> dict: + """ + Legacy function for backward compatibility. + Now expects a page title instead of URL. + """ + logger.warning("fetch_data(url) is deprecated. 
Use fetch_wikipedia_data(page_title) instead.") + # Extract page title from URL (simple implementation) + if 'wikipedia.org' in url: + page_title = url.split('/')[-1].replace('_', ' ') + return fetch_wikipedia_data(page_title) + else: + raise ValueError("URL must be a Wikipedia page URL") + + +# Expose key classes for advanced usage +__all__ = [ + 'WikipediaSyncFetcher', + 'PageInfo', + 'SyncResult', + 'fetch_wikipedia_data', + 'fetch_sync_result', + 'fetch_data' +] \ No newline at end of file diff --git a/tasks/InfoboxSync/fetch/fetch.py b/tasks/InfoboxSync/fetch/fetch.py new file mode 100644 index 00000000..3f027d00 --- /dev/null +++ b/tasks/InfoboxSync/fetch/fetch.py @@ -0,0 +1,241 @@ +import logging +from abc import ABC, abstractmethod +from typing import Dict, Optional, Any +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + + +@dataclass +class PageInfo: + """Data class for page information.""" + title: str + exists: bool + content: Optional[str] = None + langlinks: Optional[Dict[str, str]] = None + error: Optional[str] = None + + +class FetchObserver(ABC): + """Observer pattern for monitoring fetch operations.""" + + @abstractmethod + def on_page_check_start(self, page_title: str, site: str): + pass + + @abstractmethod + def on_page_check_complete(self, page_info: PageInfo): + pass + + @abstractmethod + def on_error(self, error: str): + pass + + +class LoggingFetchObserver(FetchObserver): + """Logging implementation of fetch observer.""" + + def on_page_check_start(self, page_title: str, site: str): + logger.info(f"Starting page check for '{page_title}' on {site}") + + def on_page_check_complete(self, page_info: PageInfo): + if page_info.exists: + logger.info(f"Page '{page_info.title}' found successfully") + else: + logger.warning(f"Page '{page_info.title}' not found") + + def on_error(self, error: str): + logger.error(f"Fetch error: {error}") + + +class WikipediaFetcher(ABC): + """Abstract base class for Wikipedia page fetchers using Template Method pattern.""" + + def __init__(self, observer: Optional[FetchObserver] = None): + self.observer = observer or LoggingFetchObserver() + + def fetch_page_info(self, page_title: str) -> PageInfo: + """Template method for fetching page information.""" + try: + self.observer.on_page_check_start(page_title, self.get_site_name()) + + page_info = self._check_page_exists(page_title) + if page_info.exists: + page_info = self._fetch_page_content(page_info) + page_info = self._fetch_langlinks(page_info) + + self.observer.on_page_check_complete(page_info) + return page_info + + except Exception as e: + error_msg = f"Error fetching page '{page_title}': {str(e)}" + self.observer.on_error(error_msg) + return PageInfo(title=page_title, exists=False, error=error_msg) + + @abstractmethod + def get_site_name(self) -> str: + pass + + @abstractmethod + def _check_page_exists(self, page_title: str) -> PageInfo: + pass + + @abstractmethod + def _fetch_page_content(self, page_info: PageInfo) -> PageInfo: + pass + + @abstractmethod + def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo: + pass + + +class PywikibotFetcher(WikipediaFetcher): + """Pywikibot implementation of Wikipedia fetcher.""" + + def __init__(self, site_name: str, observer: Optional[FetchObserver] = None): + super().__init__(observer) + self.site_name = site_name + self.site = None + self._initialize_site() + + def get_site_name(self) -> str: + return self.site_name + + def _initialize_site(self): + """Initialize pywikibot site - lazy initialization.""" + try: + 
import pywikibot + if self.site is None: + self.site = pywikibot.Site(self.site_name) + logger.info(f"Initialized pywikibot site: {self.site_name}") + except ImportError: + raise ImportError("pywikibot is required for Wikipedia operations. Install with: pip install pywikibot") + + def _check_page_exists(self, page_title: str) -> PageInfo: + """Check if page exists on the wiki site.""" + try: + import pywikibot + page = pywikibot.Page(self.site, page_title) + exists = page.exists() + return PageInfo( + title=page_title, + exists=exists, + content=page.text if exists else None + ) + except Exception as e: + logger.error(f"Error checking page existence: {e}") + return PageInfo(title=page_title, exists=False, error=str(e)) + + def _fetch_page_content(self, page_info: PageInfo) -> PageInfo: + """Fetch full page content.""" + # Content is already fetched in _check_page_exists for efficiency + return page_info + + def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo: + """Fetch language links (interwiki links).""" + try: + import pywikibot + if page_info.exists: + page = pywikibot.Page(self.site, page_info.title) + langlinks = {} + for langlink in page.langlinks(): + langlinks[langlink.site.code] = langlink.title + page_info.langlinks = langlinks + return page_info + except Exception as e: + logger.error(f"Error fetching langlinks: {e}") + page_info.langlinks = {} + return page_info + + +class WikipediaSyncFetcher: + """Main fetcher class using Strategy pattern for different fetch strategies.""" + + def __init__(self, observer: Optional[FetchObserver] = None): + self.observer = observer or LoggingFetchObserver() + self.ar_fetcher = PywikibotFetcher('ar', self.observer) + self.en_fetcher = PywikibotFetcher('en', self.observer) + + def fetch_arabic_and_english_pages(self, ar_page_title: str) -> Dict[str, Any]: + """ + Fetch Arabic page and corresponding English page if it exists. 
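+
+        The result always carries the same four keys (shape sketch):
+
+            {'arabic': PageInfo, 'english': PageInfo or None,
+             'sync_possible': bool, 'error': str or None}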
+ + Args: + ar_page_title: Title of the Arabic Wikipedia page + + Returns: + Dict containing both Arabic and English page information + """ + logger.info(f"Starting sync fetch for Arabic page: {ar_page_title}") + + # Step 1: Check Arabic page + ar_page_info = self.ar_fetcher.fetch_page_info(ar_page_title) + + if not ar_page_info.exists: + return { + 'arabic': ar_page_info, + 'english': None, + 'sync_possible': False, + 'error': f"Arabic page '{ar_page_title}' does not exist" + } + + # Step 2: Find corresponding English page + en_page_title = self._find_english_page_title(ar_page_info) + + if not en_page_title: + return { + 'arabic': ar_page_info, + 'english': None, + 'sync_possible': False, + 'error': f"No corresponding English page found for '{ar_page_title}'" + } + + # Step 3: Fetch English page + en_page_info = self.en_fetcher.fetch_page_info(en_page_title) + + return { + 'arabic': ar_page_info, + 'english': en_page_info, + 'sync_possible': en_page_info.exists, + 'error': None if en_page_info.exists else f"English page '{en_page_title}' does not exist" + } + + def _find_english_page_title(self, ar_page_info: PageInfo) -> Optional[str]: + """Find the corresponding English page title.""" + # Method 1: Check langlinks from Arabic page + if ar_page_info.langlinks and 'en' in ar_page_info.langlinks: + return ar_page_info.langlinks['en'] + + # Method 2: Try direct title match (for pages with same name in both languages) + # This is a fallback - in reality you'd want more sophisticated matching + logger.warning(f"No direct English langlink found for '{ar_page_info.title}', trying direct match") + return ar_page_info.title + + +def fetch_wikipedia_data(ar_page_title: str) -> Dict[str, Any]: + """ + Main function to fetch Wikipedia data for sync operation. + + Args: + ar_page_title: Arabic page title to sync + + Returns: + Dictionary with Arabic and English page data + """ + fetcher = WikipediaSyncFetcher() + return fetcher.fetch_arabic_and_english_pages(ar_page_title) + + +# Legacy function for backward compatibility +def fetch_data(url: str) -> dict: + """ + Legacy function for backward compatibility. + Now expects a page title instead of URL. + """ + logger.warning("fetch_data(url) is deprecated. 
Use fetch_wikipedia_data(page_title) instead.") + # Extract page title from URL (simple implementation) + if 'wikipedia.org' in url: + page_title = url.split('/')[-1].replace('_', ' ') + return fetch_wikipedia_data(page_title) + else: + raise ValueError("URL must be a Wikipedia page URL") \ No newline at end of file diff --git a/tasks/InfoboxSync/fetch/interfaces.py b/tasks/InfoboxSync/fetch/interfaces.py new file mode 100644 index 00000000..99dede1a --- /dev/null +++ b/tasks/InfoboxSync/fetch/interfaces.py @@ -0,0 +1,50 @@ +"""Abstract interfaces for the fetch stage.""" + +import logging +from abc import ABC, abstractmethod +from typing import Optional +from .models import PageInfo +from .observers import FetchObserver, LoggingFetchObserver + +logger = logging.getLogger(__name__) + + +class WikipediaFetcher(ABC): + """Abstract base class for Wikipedia page fetchers using Template Method.""" + + def __init__(self, observer: Optional[FetchObserver] = None): + self.observer = observer or LoggingFetchObserver() + + def fetch_page_info(self, page_title: str) -> PageInfo: + """Template method for fetching page information.""" + try: + self.observer.on_page_check_start(page_title, self.get_site_name()) + + page_info = self._check_page_exists(page_title) + if page_info.exists: + page_info = self._fetch_page_content(page_info) + page_info = self._fetch_langlinks(page_info) + + self.observer.on_page_check_complete(page_info) + return page_info + + except Exception as e: + error_msg = f"Error fetching page '{page_title}': {str(e)}" + self.observer.on_error(error_msg) + return PageInfo(title=page_title, exists=False, error=error_msg) + + @abstractmethod + def get_site_name(self) -> str: + pass + + @abstractmethod + def _check_page_exists(self, page_title: str) -> PageInfo: + pass + + @abstractmethod + def _fetch_page_content(self, page_info: PageInfo) -> PageInfo: + pass + + @abstractmethod + def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo: + pass \ No newline at end of file diff --git a/tasks/InfoboxSync/fetch/models.py b/tasks/InfoboxSync/fetch/models.py new file mode 100644 index 00000000..5665fb11 --- /dev/null +++ b/tasks/InfoboxSync/fetch/models.py @@ -0,0 +1,23 @@ +"""Data models for the fetch stage.""" + +from dataclasses import dataclass +from typing import Dict, Optional + + +@dataclass +class PageInfo: + """Data class for page information.""" + title: str + exists: bool + content: Optional[str] = None + langlinks: Optional[Dict[str, str]] = None + error: Optional[str] = None + + +@dataclass +class SyncResult: + """Data class for synchronization results.""" + arabic: PageInfo + english: Optional[PageInfo] + sync_possible: bool + error: Optional[str] = None \ No newline at end of file diff --git a/tasks/InfoboxSync/fetch/observers.py b/tasks/InfoboxSync/fetch/observers.py new file mode 100644 index 00000000..ea0486c9 --- /dev/null +++ b/tasks/InfoboxSync/fetch/observers.py @@ -0,0 +1,67 @@ +"""Observer pattern implementation for monitoring fetch operations.""" + +import logging +from abc import ABC, abstractmethod +from .models import PageInfo + +logger = logging.getLogger(__name__) + + +class FetchObserver(ABC): + """Observer pattern for monitoring fetch operations.""" + + @abstractmethod + def on_page_check_start(self, page_title: str, site: str): + pass + + @abstractmethod + def on_page_check_complete(self, page_info: PageInfo): + pass + + @abstractmethod + def on_error(self, error: str): + pass + + +class LoggingFetchObserver(FetchObserver): + """Logging implementation of 
fetch observer.""" + + def on_page_check_start(self, page_title: str, site: str): + logger.info(f"Starting page check for '{page_title}' on {site}") + + def on_page_check_complete(self, page_info: PageInfo): + if page_info.exists: + logger.info(f"Page '{page_info.title}' found successfully") + else: + logger.warning(f"Page '{page_info.title}' not found") + + def on_error(self, error: str): + logger.error(f"Fetch error: {error}") + + +class MetricsFetchObserver(FetchObserver): + """Metrics collection implementation of fetch observer.""" + + def __init__(self): + self.metrics = { + 'pages_checked': 0, + 'pages_found': 0, + 'pages_not_found': 0, + 'errors': 0 + } + + def on_page_check_start(self, page_title: str, site: str): + self.metrics['pages_checked'] += 1 + + def on_page_check_complete(self, page_info: PageInfo): + if page_info.exists: + self.metrics['pages_found'] += 1 + else: + self.metrics['pages_not_found'] += 1 + + def on_error(self, error: str): + self.metrics['errors'] += 1 + + def get_metrics(self) -> dict: + """Get current metrics.""" + return self.metrics.copy() \ No newline at end of file diff --git a/tasks/InfoboxSync/fetch/pywikibot_fetcher.py b/tasks/InfoboxSync/fetch/pywikibot_fetcher.py new file mode 100644 index 00000000..8c970773 --- /dev/null +++ b/tasks/InfoboxSync/fetch/pywikibot_fetcher.py @@ -0,0 +1,71 @@ +"""Pywikibot implementation of Wikipedia fetcher.""" + +import logging +from typing import Optional +from .interfaces import WikipediaFetcher +from .models import PageInfo +from .observers import FetchObserver + +logger = logging.getLogger(__name__) + + +class PywikibotFetcher(WikipediaFetcher): + """Pywikibot implementation of Wikipedia fetcher.""" + + def __init__(self, site_name: str, + observer: Optional[FetchObserver] = None): + super().__init__(observer) + self.site_name = site_name + self.site = None + self._initialize_site() + + def get_site_name(self) -> str: + return self.site_name + + def _initialize_site(self): + """Initialize pywikibot site - lazy initialization.""" + try: + import pywikibot + if self.site is None: + self.site = pywikibot.Site(self.site_name) + logger.info(f"Initialized pywikibot site: {self.site_name}") + except ImportError: + msg = ("pywikibot is required for Wikipedia operations. 
" + "Install with: pip install pywikibot") + raise ImportError(msg) + + def _check_page_exists(self, page_title: str) -> PageInfo: + """Check if page exists on the wiki site.""" + try: + import pywikibot + page = pywikibot.Page(self.site, page_title) + exists = page.exists() + return PageInfo( + title=page_title, + exists=exists, + content=page.text if exists else None + ) + except Exception as e: + logger.error(f"Error checking page existence: {e}") + return PageInfo(title=page_title, exists=False, error=str(e)) + + def _fetch_page_content(self, page_info: PageInfo) -> PageInfo: + """Fetch full page content.""" + # Content is already fetched in _check_page_exists for efficiency + return page_info + + def _fetch_langlinks(self, page_info: PageInfo) -> PageInfo: + """Fetch language links (interwiki links).""" + try: + import pywikibot + if page_info.exists: + page = pywikibot.Page(self.site, page_info.title) + langlinks = {} + for langlink in page.langlinks(): + langlinks[langlink.site.code] = langlink.title + page_info.langlinks = langlinks + return page_info + except Exception as e: + logger.error(f"Error fetching langlinks: {e}") + page_info.langlinks = {} + return page_info \ No newline at end of file diff --git a/tasks/InfoboxSync/fetch/sync_fetcher.py b/tasks/InfoboxSync/fetch/sync_fetcher.py new file mode 100644 index 00000000..dad7f89a --- /dev/null +++ b/tasks/InfoboxSync/fetch/sync_fetcher.py @@ -0,0 +1,87 @@ +"""Main synchronization fetcher using Strategy pattern.""" + +import logging +from typing import Dict, Any, Optional +from .models import PageInfo, SyncResult +from .observers import FetchObserver, LoggingFetchObserver +from .pywikibot_fetcher import PywikibotFetcher + +logger = logging.getLogger(__name__) + + +class WikipediaSyncFetcher: + """Main fetcher class using Strategy pattern.""" + + def __init__(self, observer: Optional[FetchObserver] = None): + self.observer = observer or LoggingFetchObserver() + self.ar_fetcher = PywikibotFetcher('ar', self.observer) + self.en_fetcher = PywikibotFetcher('en', self.observer) + + def fetch_arabic_and_english_pages(self, + ar_page_title: str) -> Dict[str, Any]: + """Fetch Arabic page and corresponding English page.""" + logger.info(f"Starting sync fetch for Arabic page: {ar_page_title}") + + # Step 1: Check Arabic page + ar_page_info = self.ar_fetcher.fetch_page_info(ar_page_title) + + if not ar_page_info.exists: + return { + 'arabic': ar_page_info, + 'english': None, + 'sync_possible': False, + 'error': f"Arabic page '{ar_page_title}' does not exist" + } + + # Step 2: Find corresponding English page + en_page_title = self._find_english_page_title(ar_page_info) + + if not en_page_title: + error_msg = ( + f"No corresponding English page found for '{ar_page_title}'" + ) + return { + 'arabic': ar_page_info, + 'english': None, + 'sync_possible': False, + 'error': error_msg + } + + # Step 3: Fetch English page + en_page_info = self.en_fetcher.fetch_page_info(en_page_title) + + error_msg = None + if not en_page_info.exists: + error_msg = f"English page '{en_page_title}' does not exist" + + return { + 'arabic': ar_page_info, + 'english': en_page_info, + 'sync_possible': en_page_info.exists, + 'error': error_msg + } + + def _find_english_page_title(self, + ar_page_info: PageInfo) -> Optional[str]: + """Find the corresponding English page title.""" + # Method 1: Check langlinks from Arabic page + if ar_page_info.langlinks and 'en' in ar_page_info.langlinks: + return ar_page_info.langlinks['en'] + + # Method 2: Try direct title match + # This 
is a fallback - in reality you'd want more sophisticated + # matching + msg = f"No direct English langlink found for '{ar_page_info.title}'" + logger.warning(f"{msg}, trying direct match") + return ar_page_info.title + + def fetch_sync_result(self, ar_page_title: str) -> SyncResult: + """Fetch synchronization result with structured return type.""" + result = self.fetch_arabic_and_english_pages(ar_page_title) + + return SyncResult( + arabic=result['arabic'], + english=result['english'], + sync_possible=result['sync_possible'], + error=result['error'] + ) \ No newline at end of file diff --git a/tasks/InfoboxSync/map/__init__.py b/tasks/InfoboxSync/map/__init__.py new file mode 100644 index 00000000..4bf46847 --- /dev/null +++ b/tasks/InfoboxSync/map/__init__.py @@ -0,0 +1 @@ +# Map stage package \ No newline at end of file diff --git a/tasks/InfoboxSync/map/field_mappers.py b/tasks/InfoboxSync/map/field_mappers.py new file mode 100644 index 00000000..ae9f8e9b --- /dev/null +++ b/tasks/InfoboxSync/map/field_mappers.py @@ -0,0 +1,440 @@ +""" +Field mapping strategies for different data types in Wikipedia infoboxes. +""" + +import logging +import re +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional, List +from urllib.parse import urlparse + +logger = logging.getLogger(__name__) + + +class FieldMapper(ABC): + """ + Abstract base class for field mapping strategies. + Each field type (text, number, image, link, mixed) has its own mapper. + """ + + def __init__(self, english_key: str, arabic_key: str, field_type: str): + """ + Initialize the field mapper. + + Args: + english_key (str): English field name from infobox + arabic_key (str): Corresponding Arabic field name + field_type (str): Type of field (text, number, image, link, mixed) + """ + self.english_key = english_key + self.arabic_key = arabic_key + self.field_type = field_type + + @abstractmethod + def map_field(self, value: str) -> Dict[str, Any]: + """ + Map a field value to the standardized format. + + Args: + value (str): Raw field value from infobox + + Returns: + Dict[str, Any]: Mapped field data with Arabic key + """ + pass + + def _clean_value(self, value: str) -> str: + """Clean and normalize field value.""" + if not value: + return "" + return value.strip() + + +class NumberedFieldMapper(FieldMapper): + """ + Mapper for numbered fields that follow a pattern (field1, field2, field3, ...). + Groups related numbered fields into arrays/lists. + """ + + def __init__(self, base_english_key: str, arabic_key: str, field_type: str = "text"): + # Store the base key without number (e.g., "years" not "years1") + self.base_english_key = base_english_key + super().__init__(base_english_key, arabic_key, "numbered") + self.item_field_type = field_type + + def map_field(self, value: str) -> Dict[str, Any]: + """Map numbered field - this is handled by the template mapper.""" + # This method is not used directly for numbered fields + # The template mapper handles the grouping logic + return {} + + def map_numbered_fields(self, infobox_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Map all numbered fields for this base key. 
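+
+        Example (illustrative; base key "clubs" mapped to "أندية"):
+
+            >>> m = NumberedFieldMapper('clubs', 'أندية', 'raw')
+            >>> m.map_numbered_fields({'clubs2': 'B', 'clubs1': 'A'})['أندية']['value']
+            ['A', 'B']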
+ + Args: + infobox_data: All infobox fields + + Returns: + Dict with Arabic key containing array of numbered field values + """ + numbered_values = [] + + # Find all fields that match the pattern: base_key + number + for key, value in infobox_data.items(): + if key.startswith(self.base_english_key): + # Extract the number from the key + number_part = key[len(self.base_english_key):] + if number_part.isdigit(): + number = int(number_part) + numbered_values.append({ + "number": number, + "value": value, + "original_key": key + }) + + # Sort by number + numbered_values.sort(key=lambda x: x["number"]) + + # Extract just the values in order + values_only = [item["value"] for item in numbered_values] + + return { + self.arabic_key: { + "value": values_only, + "type": "numbered", + "item_type": self.item_field_type, + "count": len(values_only), + "original_keys": [item["original_key"] for item in numbered_values] + } + } + + +class TextFieldMapper(FieldMapper): + """ + Mapper for text fields (names, descriptions, etc.). + """ + + def __init__(self, english_key: str, arabic_key: str): + super().__init__(english_key, arabic_key, "text") + + def map_field(self, value: str) -> Dict[str, Any]: + """Map text field value.""" + clean_value = self._clean_value(value) + + return { + self.arabic_key: { + "value": clean_value, + "type": "text", + "original_key": self.english_key, + "validation": self._validate_text(clean_value) + } + } + + def _validate_text(self, value: str) -> Dict[str, Any]: + """Validate text field.""" + return { + "is_valid": len(value) > 0, + "length": len(value), + "has_special_chars": bool(re.search(r'[^\w\s]', value)) + } + + +class NumberFieldMapper(FieldMapper): + """ + Mapper for numeric fields (ages, years, counts, etc.). + """ + + def __init__(self, english_key: str, arabic_key: str): + super().__init__(english_key, arabic_key, "number") + + def map_field(self, value: str) -> Dict[str, Any]: + """Map numeric field value.""" + clean_value = self._clean_value(value) + numeric_value = self._extract_number(clean_value) + + return { + self.arabic_key: { + "value": numeric_value, + "type": "number", + "original_key": self.english_key, + "validation": self._validate_number(clean_value), + "numeric_value": numeric_value + } + } + + def _extract_number(self, value: str) -> Optional[float]: + """Extract numeric value from string.""" + if not value: + return None + + # Remove common wiki formatting + value = re.sub(r'\[\[|\]\]', '', value) + value = re.sub(r'<[^>]+>', '', value) + + # Find first number (integer or decimal) + match = re.search(r'(\d+(?:\.\d+)?)', value) + if match: + return float(match.group(1)) + return None + + def _validate_number(self, value: str) -> Dict[str, Any]: + """Validate numeric field.""" + numeric_value = self._extract_number(value) + return { + "is_valid": numeric_value is not None, + "numeric_value": numeric_value, + "has_units": bool(re.search(r'\d+\s*\w+', value)) + } + + +class ImageFieldMapper(FieldMapper): + """ + Mapper for image fields. 
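+    Handles both bare filenames and wiki syntax: an illustrative value
+    such as "[[File:Photo.jpg|playing in 2018]]" is parsed into filename
+    "Photo.jpg" and caption "playing in 2018".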
+ """ + + def __init__(self, english_key: str, arabic_key: str): + super().__init__(english_key, arabic_key, "image") + + def map_field(self, value: str) -> Dict[str, Any]: + """Map image field value.""" + clean_value = self._clean_value(value) + image_info = self._parse_image(clean_value) + + return { + self.arabic_key: { + "value": image_info["filename"], + "type": "image", + "original_key": self.english_key, + "validation": self._validate_image(clean_value), + "image_info": image_info + } + } + + def _parse_image(self, value: str) -> Dict[str, Any]: + """Parse image field to extract filename and caption.""" + if not value: + return {"filename": "", "caption": ""} + + # Handle wiki image syntax [[File:filename.jpg|caption]] + file_match = re.search(r'\[\[File:([^|\]]+)(?:\|([^]]+))?\]\]', value, re.IGNORECASE) + if file_match: + return { + "filename": file_match.group(1), + "caption": file_match.group(2) or "" + } + + # Handle simple filename + return {"filename": value, "caption": ""} + + def _validate_image(self, value: str) -> Dict[str, Any]: + """Validate image field.""" + image_info = self._parse_image(value) + return { + "is_valid": bool(image_info["filename"]), + "has_caption": bool(image_info["caption"]), + "filename": image_info["filename"] + } + + +class LinkFieldMapper(FieldMapper): + """ + Mapper for link fields (internal/external links). + """ + + def __init__(self, english_key: str, arabic_key: str): + super().__init__(english_key, arabic_key, "link") + + def map_field(self, value: str) -> Dict[str, Any]: + """Map link field value.""" + clean_value = self._clean_value(value) + link_info = self._parse_link(clean_value) + + return { + self.arabic_key: { + "value": link_info["url"], + "type": "link", + "original_key": self.english_key, + "validation": self._validate_link(clean_value), + "link_info": link_info + } + } + + def _parse_link(self, value: str) -> Dict[str, Any]: + """Parse link to extract URL and display text.""" + if not value: + return {"url": "", "display_text": "", "is_external": False} + + # Handle wiki internal links [[Page|Display Text]] + internal_match = re.search(r'\[\[([^|\]]+)(?:\|([^]]+))?\]\]', value) + if internal_match: + return { + "url": internal_match.group(1), + "display_text": internal_match.group(2) or internal_match.group(1), + "is_external": False + } + + # Handle external links [http://example.com Display Text] + external_match = re.search(r'\[([^\s]+)(?:\s([^]]+))?\]', value) + if external_match: + return { + "url": external_match.group(1), + "display_text": external_match.group(2) or external_match.group(1), + "is_external": True + } + + # Plain text that might be a URL + if value.startswith(('http://', 'https://')): + return { + "url": value, + "display_text": value, + "is_external": True + } + + return {"url": value, "display_text": value, "is_external": False} + + def _validate_link(self, value: str) -> Dict[str, Any]: + """Validate link field.""" + link_info = self._parse_link(value) + is_valid_url = False + + if link_info["is_external"]: + try: + parsed = urlparse(link_info["url"]) + is_valid_url = bool(parsed.netloc) + except: + is_valid_url = False + + return { + "is_valid": bool(link_info["url"]), + "is_external": link_info["is_external"], + "is_valid_url": is_valid_url, + "has_display_text": link_info["display_text"] != link_info["url"] + } + + +class MixedFieldMapper(FieldMapper): + """ + Mapper for mixed content fields (containing multiple data types). 
+ """ + + def __init__(self, english_key: str, arabic_key: str): + super().__init__(english_key, arabic_key, "mixed") + + def map_field(self, value: str) -> Dict[str, Any]: + """Map mixed field value.""" + clean_value = self._clean_value(value) + parsed_content = self._parse_mixed_content(clean_value) + + return { + self.arabic_key: { + "value": clean_value, + "type": "mixed", + "original_key": self.english_key, + "validation": self._validate_mixed(clean_value), + "parsed_content": parsed_content + } + } + + def _parse_mixed_content(self, value: str) -> Dict[str, Any]: + """Parse mixed content to identify different elements.""" + if not value: + return {"text_parts": [], "links": [], "images": [], "numbers": []} + + text_parts = [] + links = [] + images = [] + numbers = [] + + # Find links + link_matches = re.findall(r'\[\[[^\]]+\]\]', value) + links.extend(link_matches) + + # Find images + image_matches = re.findall(r'\[\[File:[^\]]+\]\]', value, re.IGNORECASE) + images.extend(image_matches) + + # Find numbers + number_matches = re.findall(r'\d+(?:\.\d+)?', value) + numbers.extend(number_matches) + + # Remove wiki markup for clean text + clean_text = re.sub(r'\[\[[^\]]+\]\]', '', value) + clean_text = re.sub(r'<[^>]+>', '', clean_text) + text_parts = [part.strip() for part in clean_text.split() if part.strip()] + + return { + "text_parts": text_parts, + "links": links, + "images": images, + "numbers": numbers + } + + def _validate_mixed(self, value: str) -> Dict[str, Any]: + """Validate mixed field.""" + parsed = self._parse_mixed_content(value) + return { + "is_valid": len(value) > 0, + "has_links": len(parsed["links"]) > 0, + "has_images": len(parsed["images"]) > 0, + "has_numbers": len(parsed["numbers"]) > 0, + "text_parts_count": len(parsed["text_parts"]) + } + +class RawFieldMapper(FieldMapper): + """ + Mapper for raw fields that takes the value as is without any preprocessing. + """ + + def __init__(self, english_key: str, arabic_key: str): + super().__init__(english_key, arabic_key, "raw") + + def map_field(self, value: str) -> Dict[str, Any]: + """Map raw field value without any processing.""" + return { + self.arabic_key: { + "value": value, + "type": "raw", + "original_key": self.english_key, + "validation": {"is_valid": True} + } + } + + + +class FieldMapperFactory: + """ + Factory for creating appropriate field mappers. + """ + + @staticmethod + def create_mapper(english_key: str, arabic_key: str, field_type: str) -> FieldMapper: + """ + Create appropriate field mapper based on type. 
+ + Args: + english_key (str): English field name + arabic_key (str): Arabic field name + field_type (str): Type of field mapper to create + + Returns: + FieldMapper: Appropriate field mapper instance + """ + field_type = field_type.lower() + if field_type == "text": + return TextFieldMapper(english_key, arabic_key) + elif field_type == "number": + return NumberFieldMapper(english_key, arabic_key) + elif field_type == "image": + return ImageFieldMapper(english_key, arabic_key) + elif field_type == "link": + return LinkFieldMapper(english_key, arabic_key) + elif field_type == "mixed": + return MixedFieldMapper(english_key, arabic_key) + elif field_type == "numbered": + return NumberedFieldMapper(english_key, arabic_key) + elif field_type == "raw": + return RawFieldMapper(english_key, arabic_key) + else: + # Default to text mapper + return TextFieldMapper(english_key, arabic_key) \ No newline at end of file diff --git a/tasks/InfoboxSync/map/map.py b/tasks/InfoboxSync/map/map.py new file mode 100644 index 00000000..ee01ae48 --- /dev/null +++ b/tasks/InfoboxSync/map/map.py @@ -0,0 +1,131 @@ +""" +Map stage for Wikipedia infobox synchronization using Strategy Pattern. +""" + +import logging +from .template_mapper import TemplateMapperFactory + +logger = logging.getLogger(__name__) + + +def map_data(parsed_data: dict, + template_type: str = 'football_biography') -> dict: + """ + Map the parsed data to a standardized format with Arabic field names. + + Args: + parsed_data (dict): The parsed data from the parse stage. + template_type (str): Type of template ('football_biography', + 'person', etc.) + + Returns: + dict: Mapped data in standardized format with Arabic field names. + """ + msg = "Starting data mapping for template type: {}".format(template_type) + logger.info(msg) + + try: + page_title = parsed_data.get('title', '') + infobox_data = parsed_data.get('infobox', {}) + + # Create appropriate template mapper + template_mapper = TemplateMapperFactory.create_mapper(template_type) + + # Map the infobox data using the template mapper + mapped_infobox = template_mapper.map_infobox(infobox_data) + + # Build the final mapped data structure + mapped_data = { + 'page_title': page_title, + 'template_type': template_type, + 'arabic_fields': mapped_infobox['mapped_fields'], + 'metadata': { + 'categories': parsed_data.get('categories', []), + 'links': parsed_data.get('links', []), + 'template_name': mapped_infobox['template_name'], + 'total_mapped_fields': mapped_infobox['total_mapped_fields'], + 'original_field_count': mapped_infobox['original_field_count'] + }, + 'raw_content': parsed_data.get('raw_content', ''), + 'arabic_title': parsed_data.get('arabic_title', '') + } + + logger.info("Successfully mapped data for: {}".format(page_title)) + msg = ("Mapped {} fields out of {} original fields").format( + mapped_infobox['total_mapped_fields'], + mapped_infobox['original_field_count']) + logger.info(msg) + + return mapped_data + + except Exception as e: + logger.error("Error mapping data: {}".format(e)) + raise + + +def get_supported_template_types() -> list: + """ + Get list of supported template types for mapping. + + Returns: + list: List of supported template type strings + """ + return TemplateMapperFactory.get_supported_templates() + + +def create_field_demo(template_type: str = 'football_biography') -> dict: + """ + Create a demo showing different field types for a template. 
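+
+    Only the football biography template currently has demo data; any
+    other template type yields an empty dict:
+
+        >>> create_field_demo('person')
+        {}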
+ + Args: + template_type (str): Type of template to create demo for + + Returns: + dict: Demo data showing different field types + """ + if template_type == 'football_biography': + return { + "name": "Lionel Messi", # text field + "height": "1.70 m", # number field + # image field + "image": "[[File:Messi_vs_Nigeria_2018.jpg|Messi playing]]", + # link field + "website": "[http://www.messi.com Official Website]", + # mixed field + "position": "[[Forward (association football)|Forward]]", + "clubnumber": "10", # number field + "caps1": "520", # number field + "goals1": "474" # number field + } + + return {} + + +def demonstrate_field_types(): + """ + Demonstrate how different field types are mapped. + """ + logger.info("Demonstrating field type mapping...") + + # Create demo data + demo_data = create_field_demo('football_biography') + + # Map the demo data + try: + mapped_result = map_data({ + 'title': 'Demo Football Player', + 'infobox': demo_data, + 'categories': ['Football players'], + 'links': ['Argentina national football team'], + 'arabic_title': 'لاعب كرة قدم تجريبي' + }, 'football_biography') + + logger.info("Demo mapping completed successfully") + arabic_fields = list(mapped_result['arabic_fields'].keys()) + logger.info("Arabic fields: {}".format(arabic_fields)) + + return mapped_result + + except Exception as e: + logger.error("Demo mapping failed: {}".format(e)) + return {} \ No newline at end of file diff --git a/tasks/InfoboxSync/map/template_mapper.py b/tasks/InfoboxSync/map/template_mapper.py new file mode 100644 index 00000000..e612401c --- /dev/null +++ b/tasks/InfoboxSync/map/template_mapper.py @@ -0,0 +1,279 @@ +""" +Template mapper classes for mapping English infobox fields to Arabic equivalents. +""" + +import logging +from abc import ABC, abstractmethod +from typing import Dict, Any, List +from .field_mappers import FieldMapperFactory, FieldMapper, NumberedFieldMapper + +logger = logging.getLogger(__name__) + + +class TemplateMapper(ABC): + """ + Abstract base class for template-specific field mapping. + Each template type (football biography, person, etc.) has its own mapper. + """ + + def __init__(self, template_name: str): + """ + Initialize the template mapper. + + Args: + template_name (str): Name of the template being mapped + """ + self.template_name = template_name + self.field_mappings = self._get_field_mappings() + + @abstractmethod + def _get_field_mappings(self) -> Dict[str, Dict[str, Any]]: + """ + Get field mappings for this template type. + + Returns: + Dict[str, Dict[str, Any]]: Mapping configuration with format: + { + "english_field_name": { + "arabic_key": "الاسم", + "field_type": "text|number|image|link|mixed|numbered" + } + } + """ + pass + + def map_infobox(self, infobox_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Map all infobox fields using the configured field mappers. 
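+
+        Numbered sequences (clubs1, clubs2, ...) are grouped first; the
+        remaining keys are mapped one by one. The returned dict carries
+        'mapped_fields', 'template_name', 'total_mapped_fields' and
+        'original_field_count'.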
+
+        Args:
+            infobox_data (Dict[str, Any]): Raw infobox data from parser
+
+        Returns:
+            Dict[str, Any]: Mapped data with Arabic field names
+        """
+        logger.info("Mapping infobox fields for template: {}".format(
+            self.template_name))
+
+        mapped_data = {}
+        mapped_fields = {}
+
+        # Handle numbered fields first (they need access to all data)
+        numbered_mappings = {}
+        for english_key, mapping_config in self.field_mappings.items():
+            if mapping_config["field_type"] == "numbered":
+                numbered_mappings[english_key] = mapping_config
+
+        for base_key, mapping_config in numbered_mappings.items():
+            arabic_key = mapping_config["arabic_key"]
+            item_type = mapping_config.get("item_type", "text")
+
+            # Create numbered field mapper
+            numbered_mapper = NumberedFieldMapper(base_key, arabic_key, item_type)
+
+            # Map all numbered fields for this base key
+            try:
+                mapped_field = numbered_mapper.map_numbered_fields(infobox_data)
+                mapped_fields.update(mapped_field)
+
+                logger.debug("Mapped numbered field '{}' -> '{}'".format(
+                    base_key, arabic_key))
+
+            except Exception as e:
+                logger.warning("Failed to map numbered field '{}': {}".format(
+                    base_key, e))
+
+        # Handle regular fields
+        for english_key, value in infobox_data.items():
+            # Skip if this key was already handled as part of numbered fields
+            # (base key followed by digits, e.g. "clubs3"); a bare prefix
+            # match would wrongly swallow keys like "nationalteam-update"
+            is_numbered_field = False
+            for base_key in numbered_mappings.keys():
+                if (english_key.startswith(base_key)
+                        and english_key[len(base_key):].isdigit()):
+                    is_numbered_field = True
+                    break
+
+            if is_numbered_field:
+                continue
+
+            # Normalize the key
+            normalized_key = english_key.lower().replace(' ', '_').replace('-', '_')
+
+            # Check if we have a mapping for this field
+            if normalized_key in self.field_mappings:
+                mapping_config = self.field_mappings[normalized_key]
+                arabic_key = mapping_config["arabic_key"]
+                field_type = mapping_config["field_type"]
+
+                # Create appropriate field mapper
+                field_mapper = FieldMapperFactory.create_mapper(
+                    english_key, arabic_key, field_type
+                )
+
+                # Map the field
+                try:
+                    mapped_field = field_mapper.map_field(str(value))
+                    mapped_fields.update(mapped_field)
+
+                    logger.debug("Mapped field '{}' -> '{}' (type: {})".format(
+                        english_key, arabic_key, field_type))
+
+                except Exception as e:
+                    logger.warning("Failed to map field '{}': {}".format(
+                        english_key, e))
+                    # Fall back to text mapping
+                    text_mapper = FieldMapperFactory.create_mapper(
+                        english_key, arabic_key, "text"
+                    )
+                    mapped_field = text_mapper.map_field(str(value))
+                    mapped_fields.update(mapped_field)
+
+            else:
+                logger.debug("No mapping found for field '{}', skipping".format(
+                    english_key))
+
+        mapped_data["mapped_fields"] = mapped_fields
+        mapped_data["template_name"] = self.template_name
+        mapped_data["total_mapped_fields"] = len(mapped_fields)
+        mapped_data["original_field_count"] = len(infobox_data)
+
+        logger.info("Successfully mapped {} fields from {} original fields".format(
+            len(mapped_fields), len(infobox_data)))
+
+        return mapped_data
+
+    def get_supported_fields(self) -> List[str]:
+        """
+        Get list of supported English field names.
+
+        Returns:
+            List[str]: List of supported field names
+        """
+        return list(self.field_mappings.keys())
+
+    def get_field_info(self, english_key: str) -> Dict[str, Any]:
+        """
+        Get information about a specific field mapping.
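+
+        Example (with the football biography mapper; the key is normalized
+        before lookup):
+
+            >>> FootballBiographyMapper().get_field_info('full name')
+            {'arabic_key': 'الاسم الكامل', 'field_type': 'text'}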
+
+        Args:
+            english_key (str): English field name
+
+        Returns:
+            Dict[str, Any]: Field mapping information or empty dict if not found
+        """
+        normalized_key = english_key.lower().replace(' ', '_').replace('-', '_')
+        return self.field_mappings.get(normalized_key, {})
+
+
+class FootballBiographyMapper(TemplateMapper):
+    """
+    Mapper for football biography infobox templates.
+    Maps English fields to Arabic equivalents with appropriate field types.
+    Handles both regular fields and numbered sequences (years1, clubs1, etc.).
+    """
+
+    def __init__(self):
+        super().__init__("football_biography")
+
+    def _get_field_mappings(self) -> Dict[str, Dict[str, Any]]:
+        """Get field mappings for football biography template."""
+        # NOTE: keys are stored pre-normalized (lowercase, '-' and ' '
+        # replaced by '_') so they match the lookup in map_infobox().
+        return {
+            # Personal Information
+            "name": {"arabic_key": "اسم", "field_type": "text"},
+            "fullname": {"arabic_key": "الاسم الكامل", "field_type": "text"},
+            "full_name": {"arabic_key": "الاسم الكامل", "field_type": "text"},
+            "image": {"arabic_key": "صورة", "field_type": "image"},
+            "upright": {"arabic_key": "حجم الصورة", "field_type": "number"},
+            "caption": {"arabic_key": "تعليق الصورة", "field_type": "raw"},
+            "birth_date": {"arabic_key": "تاريخ الولادة", "field_type": "raw"},
+            "birth_place": {"arabic_key": "مكان الولادة", "field_type": "raw"},
+            "death_date": {"arabic_key": "تاريخ الوفاة", "field_type": "raw"},
+            "death_place": {"arabic_key": "مكان الوفاة", "field_type": "raw"},
+            "height": {"arabic_key": "الطول", "field_type": "number"},
+            "position": {"arabic_key": "المركز", "field_type": "raw"},
+            # Club Career
+            "clubnumber": {"arabic_key": "الرقم بالنادي", "field_type": "number"},
+            "youthclubs": {"arabic_key": "أندية_الشباب", "field_type": "numbered", "item_type": "raw"},
+            "youthyears": {"arabic_key": "سنوات_الشباب", "field_type": "numbered", "item_type": "raw"},
+            "clubs": {"arabic_key": "أندية", "field_type": "numbered", "item_type": "raw"},
+            "years": {"arabic_key": "سنوات", "field_type": "numbered", "item_type": "raw"},
+            "caps": {"arabic_key": "مباريات", "field_type": "numbered", "item_type": "number"},
+            "goals": {"arabic_key": "أهداف", "field_type": "numbered", "item_type": "number"},
+            "totalcaps": {"arabic_key": "مجموع_مباريات", "field_type": "number"},
+            "totalgoals": {"arabic_key": "إجمالي الأهداف", "field_type": "number"},
+            "club_update": {"arabic_key": "تحديث الأندية", "field_type": "raw"},
+            "pcupdate": {"arabic_key": "تحديث الأندية", "field_type": "raw"},
+            # National Team Career
+            "nationalteam": {"arabic_key": "منتخب_وطني", "field_type": "numbered", "item_type": "raw"},
+            "nationalyears": {"arabic_key": "سنوات_وطنية", "field_type": "numbered", "item_type": "raw"},
+            "nationalcaps": {"arabic_key": "مباريات_وطنية", "field_type": "numbered", "item_type": "number"},
+            "nationalgoals": {"arabic_key": "أهداف_وطنية", "field_type": "numbered", "item_type": "number"},
+            "nationalteam_update": {"arabic_key": "تحديث المنتخب", "field_type": "raw"},
+            "ntupdate": {"arabic_key": "تحديث المنتخب", "field_type": "raw"},
+            # Managerial Career
+            "managerclubs": {"arabic_key": "أندية_مدرب", "field_type": "numbered", "item_type": "raw"},
+            "manageryears": {"arabic_key": "سنوات_مدرب", "field_type": "numbered", "item_type": "raw"},
+            # Honors
+            "medaltemplates": {"arabic_key": "ميداليات", "field_type": "mixed"},
+        }
+
+
+class GenericTemplateMapper(TemplateMapper):
+    """
+    Generic mapper for templates without specific field mappings.
+    Falls back to text mapping for all fields.
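+    A subclass could override _get_field_mappings() to supply mappings
+    loaded from configuration, as hinted in the method body below.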
+ """ + + def __init__(self, template_name: str): + self.custom_template_name = template_name + super().__init__(template_name) + + def _get_field_mappings(self) -> Dict[str, Dict[str, Any]]: + """ + Generic mapper returns empty dict - all fields will be skipped + unless custom mappings are provided. + """ + # This could be extended to load mappings from config files + return {} + + +class TemplateMapperFactory: + """ + Factory for creating appropriate template mappers. + """ + + @staticmethod + def create_mapper(template_type: str) -> TemplateMapper: + """ + Create appropriate template mapper based on type. + + Args: + template_type (str): Type of template ('football_biography', etc.) + + Returns: + TemplateMapper: Appropriate template mapper instance + """ + template_type = template_type.lower() + + if template_type == 'football_biography': + return FootballBiographyMapper() + elif template_type == 'person': + return GenericTemplateMapper("person") + elif template_type == 'biography': + return GenericTemplateMapper("biography") + else: + # For custom template names, create generic mapper + return GenericTemplateMapper(template_type) + + @staticmethod + def get_supported_templates() -> List[str]: + """ + Get list of supported template types. + + Returns: + List[str]: List of supported template type strings + """ + return [ + 'football_biography', + 'person', + 'biography' + ] \ No newline at end of file diff --git a/tasks/InfoboxSync/parse/__init__.py b/tasks/InfoboxSync/parse/__init__.py new file mode 100644 index 00000000..1746fc21 --- /dev/null +++ b/tasks/InfoboxSync/parse/__init__.py @@ -0,0 +1 @@ +# Parse stage package \ No newline at end of file diff --git a/tasks/InfoboxSync/parse/base_parser.py b/tasks/InfoboxSync/parse/base_parser.py new file mode 100644 index 00000000..83450980 --- /dev/null +++ b/tasks/InfoboxSync/parse/base_parser.py @@ -0,0 +1,84 @@ +""" +Abstract base class for infobox parsers using Strategy Pattern. +""" + +import logging +from abc import ABC, abstractmethod +from typing import Dict, Any +import wikitextparser as wtp + +logger = logging.getLogger(__name__) + + +class InfoboxParser(ABC): + """ + Abstract base class for infobox parsers using Strategy Pattern. + + This class defines the interface for parsing different types of + Wikipedia infobox templates using wikitextparser. + """ + + def __init__(self, template_name: str): + """ + Initialize the parser with the target template name. + + Args: + template_name (str): Name of the infobox template to parse + """ + self.template_name = template_name.lower() + + @abstractmethod + def parse_infobox(self, wikitext: str) -> Dict[str, Any]: + """ + Parse the infobox from wikitext. + + Args: + wikitext (str): The raw Wikipedia page content + + Returns: + Dict[str, Any]: Extracted infobox fields + """ + pass + + def _find_template(self, parsed_wikitext: wtp.WikiText) -> wtp.Template: + """ + Find the target template in the parsed wikitext. + + Args: + parsed_wikitext: Parsed wikitext object + + Returns: + wtp.Template: The found template object, or None + """ + templates = parsed_wikitext.templates + + for template in templates: + template_name = template.name.strip().lower() + if template_name == self.template_name: + return template + + return None + + def _extract_template_arguments(self, template: wtp.Template) -> Dict[str, str]: + """ + Extract arguments from a template object. 
+ + Args: + template: The template object to extract from + + Returns: + Dict[str, str]: Dictionary of template arguments + """ + infobox_data = {} + + for argument in template.arguments: + key = argument.name.strip() + value = argument.value.strip() + + # Clean up the value by removing markup if needed + # clean_value = wtp.parse(value).plain_text() + clean_value = value + if key and clean_value: + infobox_data[key] = clean_value + + return infobox_data \ No newline at end of file diff --git a/tasks/InfoboxSync/parse/football_parser.py b/tasks/InfoboxSync/parse/football_parser.py new file mode 100644 index 00000000..b39bc8de --- /dev/null +++ b/tasks/InfoboxSync/parse/football_parser.py @@ -0,0 +1,59 @@ +""" +Football biography infobox parser implementation. +""" + +import logging +from typing import Dict, Any +from .base_parser import InfoboxParser + +logger = logging.getLogger(__name__) + + +class FootballBiographyParser(InfoboxParser): + """ + Parser for Infobox football biography template. + """ + + def __init__(self): + super().__init__("infobox football biography") + + def parse_infobox(self, wikitext: str) -> Dict[str, Any]: + """ + Parse football biography infobox from wikitext. + + Args: + wikitext (str): The raw Wikipedia page content + + Returns: + Dict[str, Any]: Extracted football biography fields + """ + infobox_data = {} + + try: + # Parse wikitext using wikitextparser + import wikitextparser as wtp + parsed = wtp.parse(wikitext) + + # Find the football biography template + football_bio_template = self._find_template(parsed) + + if football_bio_template: + logger.info("Found Infobox football biography template") + + # Extract arguments from the template + infobox_data = self._extract_template_arguments( + football_bio_template) + + count = len(infobox_data) + msg = "Extracted {} fields from football biography infobox" + logger.info(msg.format(count)) + else: + msg = ("No Infobox football biography template found in the " + "page") + logger.warning(msg) + + except Exception as e: + msg = "Error extracting football biography infobox: {}" + logger.error(msg.format(e)) + + return infobox_data \ No newline at end of file diff --git a/tasks/InfoboxSync/parse/parse.py b/tasks/InfoboxSync/parse/parse.py new file mode 100644 index 00000000..d4948025 --- /dev/null +++ b/tasks/InfoboxSync/parse/parse.py @@ -0,0 +1,112 @@ +""" +Parse stage for Wikipedia infobox synchronization using Strategy Pattern. +""" + +import logging +from .parser_factory import InfoboxParserFactory + +logger = logging.getLogger(__name__) + + +def parse_data(data: dict, template_type: str = 'football_biography') -> dict: + """ + Parse the fetched Wikipedia data to extract infobox information. + + Args: + data (dict): The raw Wikipedia data with page content. + template_type (str): Type of template to parse ('football_biography', + 'person', etc.) + + Returns: + dict: Parsed infobox data. 
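+
+    Example (illustrative; assumes ``wikitext`` holds the raw page source)::
+
+        raw = {'title': 'Some Player', 'content': wikitext,
+               'arabic_title': '...'}
+        parsed = parse_data(raw, template_type='football_biography')
+        sorted(parsed.keys())
+        # -> ['arabic_title', 'categories', 'infobox', 'links',
+        #     'raw_content', 'title']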
+ """ + logger.info("Starting Wikipedia data parsing for template: {}".format( + template_type)) + + try: + page_content = data.get('content', '') + page_title = data.get('title', '') + arabic_title = data.get('arabic_title', '') + + # Create parser using Strategy Pattern + parser = InfoboxParserFactory.create_parser(template_type) + + # Parse infobox from Wikipedia content + infobox_data = parser.parse_infobox(page_content) + + # Extract categories + categories = extract_categories_from_wikitext(page_content) + + # Extract links (simplified - could be enhanced) + links = extract_links_from_wikitext(page_content) + + parsed_data = { + 'title': page_title, + 'arabic_title': arabic_title, + 'infobox': infobox_data, + 'categories': categories, + 'links': links, + 'raw_content': page_content + } + + logger.info("Successfully parsed data for title: {}".format(page_title)) + return parsed_data + + except Exception as e: + logger.error("Error parsing Wikipedia data: {}".format(e)) + raise + + +def extract_categories_from_wikitext(wikitext: str) -> list: + """ + Extract categories from Wikipedia wikitext. + + Args: + wikitext (str): The raw Wikipedia page content. + + Returns: + list: List of category names. + """ + import re + categories = [] + + try: + # Pattern to match category links + category_pattern = r'\[\[Category:([^\]]+)\]\]' + matches = re.findall(category_pattern, wikitext, re.IGNORECASE) + + categories = [match.strip() for match in matches] + + except Exception as e: + logger.warning("Error extracting categories: {}".format(e)) + + return categories + + +def extract_links_from_wikitext(wikitext: str) -> list: + """ + Extract internal links from Wikipedia wikitext. + + Args: + wikitext (str): The raw Wikipedia page content. + + Returns: + list: List of linked page titles. + """ + import re + links = [] + + try: + # Pattern to match internal links [[Link|Display]] + link_pattern = r'\[\[([^\]|]+)(?:\|[^\]]+)?\]\]' + matches = re.findall(link_pattern, wikitext) + + # Filter out special links (File:, Category:, etc.) + special_prefixes = ('File:', 'Category:', 'Image:', 'Template:') + links = [match.strip() for match in matches + if not match.startswith(special_prefixes)] + + except Exception as e: + logger.warning("Error extracting links: {}".format(e)) + + return links \ No newline at end of file diff --git a/tasks/InfoboxSync/parse/parser_factory.py b/tasks/InfoboxSync/parse/parser_factory.py new file mode 100644 index 00000000..9344c088 --- /dev/null +++ b/tasks/InfoboxSync/parse/parser_factory.py @@ -0,0 +1,54 @@ +""" +Factory class for creating infobox parsers using Factory Pattern. +""" + +from .base_parser import InfoboxParser +from .football_parser import FootballBiographyParser + + +class InfoboxParserFactory: + """ + Factory class to create appropriate parsers based on template type. + """ + + @staticmethod + def create_parser(template_type: str) -> InfoboxParser: + """ + Create the appropriate parser for the given template type. + + Args: + template_type (str): Type of template ('football_biography', + 'person', etc.) 
+
+        Returns:
+            InfoboxParser: The appropriate parser instance. Unknown template
+            types fall back to a GenericInfoboxParser instead of raising.
+        """
+        if template_type.lower() == 'football_biography':
+            return FootballBiographyParser()
+        elif template_type.lower() == 'person':
+            from .generic_parser import GenericInfoboxParser
+            return GenericInfoboxParser("infobox person")
+        elif template_type.lower() == 'biography':
+            from .generic_parser import GenericInfoboxParser
+            return GenericInfoboxParser("infobox biography")
+        else:
+            # For custom template names, create generic parser
+            from .generic_parser import GenericInfoboxParser
+            return GenericInfoboxParser(template_type)
+
+    @staticmethod
+    def get_supported_types() -> list:
+        """
+        Get list of supported template types.
+
+        Returns:
+            list: List of supported template type strings
+        """
+        return [
+            'football_biography',
+            'person',
+            'biography'
+        ]
\ No newline at end of file
diff --git a/tasks/InfoboxSync/parse/parsers.py b/tasks/InfoboxSync/parse/parsers.py
new file mode 100644
index 00000000..b899165d
--- /dev/null
+++ b/tasks/InfoboxSync/parse/parsers.py
@@ -0,0 +1,203 @@
+"""
+Infobox parsers using Strategy Pattern for different template types.
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Any
+import wikitextparser as wtp
+
+logger = logging.getLogger(__name__)
+
+
+class InfoboxParser(ABC):
+    """
+    Abstract base class for infobox parsers using Strategy Pattern.
+
+    This class defines the interface for parsing different types of
+    Wikipedia infobox templates using wikitextparser.
+    """
+
+    def __init__(self, template_name: str):
+        """
+        Initialize the parser with the target template name.
+
+        Args:
+            template_name (str): Name of the infobox template to parse
+        """
+        self.template_name = template_name.lower()
+
+    @abstractmethod
+    def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+        """
+        Parse the infobox from wikitext.
+
+        Args:
+            wikitext (str): The raw Wikipedia page content
+
+        Returns:
+            Dict[str, Any]: Extracted infobox fields
+        """
+        pass
+
+    def _find_template(self, parsed_wikitext: wtp.WikiText) -> wtp.Template:
+        """
+        Find the target template in the parsed wikitext.
+
+        Args:
+            parsed_wikitext: Parsed wikitext object
+
+        Returns:
+            wtp.Template: The found template object, or None
+        """
+        templates = parsed_wikitext.templates
+
+        for template in templates:
+            template_name = template.name.strip().lower()
+            if template_name == self.template_name:
+                return template
+
+        return None
+
+    def _extract_template_arguments(self, template: wtp.Template) -> Dict[str, str]:
+        """
+        Extract arguments from a template object.
+
+        Args:
+            template: The template object to extract from
+
+        Returns:
+            Dict[str, str]: Dictionary of template arguments
+        """
+        infobox_data = {}
+
+        for argument in template.arguments:
+            key = argument.name.strip()
+            value = argument.value.strip()
+
+            # Clean up the value by removing markup if needed
+            clean_value = wtp.parse(value).plain_text()
+
+            if key and clean_value:
+                infobox_data[key] = clean_value
+
+        return infobox_data
+
+
+class FootballBiographyParser(InfoboxParser):
+    """
+    Parser for Infobox football biography template.
+    """
+
+    def __init__(self):
+        super().__init__("infobox football biography")
+
+    def parse_infobox(self, wikitext: str) -> Dict[str, Any]:
+        """
+        Parse football biography infobox from wikitext.
+ + Args: + wikitext (str): The raw Wikipedia page content + + Returns: + Dict[str, Any]: Extracted football biography fields + """ + infobox_data = {} + + try: + # Parse wikitext using wikitextparser + parsed = wtp.parse(wikitext) + + # Find the football biography template + football_bio_template = self._find_template(parsed) + + if football_bio_template: + logger.info("Found Infobox football biography template") + + # Extract arguments from the template + infobox_data = self._extract_template_arguments(football_bio_template) + + logger.info("Extracted {} fields from football biography infobox".format( + len(infobox_data))) + else: + logger.warning("No Infobox football biography template " + "found in the page") + + except Exception as e: + logger.error("Error extracting football biography infobox: {}".format(e)) + + return infobox_data + + +class GenericInfoboxParser(InfoboxParser): + """ + Generic parser for any infobox template type. + """ + + def parse_infobox(self, wikitext: str) -> Dict[str, Any]: + """ + Parse generic infobox from wikitext. + + Args: + wikitext (str): The raw Wikipedia page content + + Returns: + Dict[str, Any]: Extracted infobox fields + """ + infobox_data = {} + + try: + # Parse wikitext using wikitextparser + parsed = wtp.parse(wikitext) + + # Find the target template + template = self._find_template(parsed) + + if template: + logger.info("Found {} template".format(self.template_name)) + + # Extract arguments from the template + infobox_data = self._extract_template_arguments(template) + + logger.info("Extracted {} fields from {} template".format( + len(infobox_data), self.template_name)) + else: + logger.warning("No {} template found in the page".format( + self.template_name)) + + except Exception as e: + logger.error("Error extracting {} infobox: {}".format( + self.template_name, e)) + + return infobox_data + + +class InfoboxParserFactory: + """ + Factory class to create appropriate parsers based on template type. + """ + + @staticmethod + def create_parser(template_type: str) -> InfoboxParser: + """ + Create the appropriate parser for the given template type. + + Args: + template_type (str): Type of template ('football_biography', + 'person', etc.) + + Returns: + InfoboxParser: The appropriate parser instance + + Raises: + ValueError: If template type is not supported + """ + if template_type.lower() == 'football_biography': + return FootballBiographyParser() + elif template_type.lower() == 'person': + return GenericInfoboxParser("infobox person") + elif template_type.lower() == 'biography': + return GenericInfoboxParser("infobox biography") + else: + # For custom template names, create generic parser + return GenericInfoboxParser(template_type) \ No newline at end of file diff --git a/tasks/InfoboxSync/publish/__init__.py b/tasks/InfoboxSync/publish/__init__.py new file mode 100644 index 00000000..5761d6ad --- /dev/null +++ b/tasks/InfoboxSync/publish/__init__.py @@ -0,0 +1 @@ +# Publish stage for publishing Arabic templates to Wikipedia \ No newline at end of file diff --git a/tasks/InfoboxSync/publish/publish.py b/tasks/InfoboxSync/publish/publish.py new file mode 100644 index 00000000..51b9c8f1 --- /dev/null +++ b/tasks/InfoboxSync/publish/publish.py @@ -0,0 +1,265 @@ +""" +Publish stage for publishing Arabic templates to Wikipedia. 
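+
+Example (illustrative; requires a configured pywikibot login)::
+
+    result = publish_data({'arabic_template': '{{...}}'}, 'عنوان الصفحة')
+    if result.success:
+        print(result.revision_id)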
+""" + +import logging +from typing import Dict, Any, Optional +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + + +@dataclass +class PublishResult: + """Result of a publish operation.""" + success: bool + page_title: str + edit_summary: str + revision_id: Optional[int] = None + errors: list = None + metadata: Dict[str, Any] = None + + def __post_init__(self): + if self.errors is None: + self.errors = [] + if self.metadata is None: + self.metadata = {} + + +def publish_arabic_template(translated_data: Dict[str, Any], + arabic_page_title: str, + edit_summary: str = "تحديث قالب السيرة الذاتية باستخدام InfoboxSync") -> PublishResult: + """ + Publish an Arabic Wikipedia template to the specified page. + + Args: + translated_data (Dict[str, Any]): Data from previous stages including 'arabic_template' + arabic_page_title (str): Title of the Arabic Wikipedia page to publish to + edit_summary (str): Edit summary for the Wikipedia edit + + Returns: + PublishResult: Result of the publish operation + """ + logger.info(f"Starting publish operation for page: {arabic_page_title}") + + try: + # Check if arabic_template exists in the data + if 'arabic_template' not in translated_data: + error_msg = "No arabic_template found in translated_data" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + template_text = translated_data['arabic_template'] + if not template_text or not template_text.strip(): + error_msg = "Arabic template is empty or invalid" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + # Import pywikibot + try: + import pywikibot + except ImportError: + error_msg = "pywikibot is required for publishing. 
Install with: pip install pywikibot" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + # Initialize Arabic Wikipedia site + try: + site = pywikibot.Site('ar', 'wikipedia') + logger.info("Connected to Arabic Wikipedia") + except Exception as e: + error_msg = f"Failed to connect to Arabic Wikipedia: {e}" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + # Create page object + try: + page = pywikibot.Page(site, arabic_page_title) + logger.info(f"Created page object for: {arabic_page_title}") + except Exception as e: + error_msg = f"Failed to create page object: {e}" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + # Check if page exists + if not page.exists(): + error_msg = f"Page '{arabic_page_title}' does not exist on Arabic Wikipedia" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + # Get current page content + try: + current_content = page.text + logger.info(f"Retrieved current page content (length: {len(current_content)})") + except Exception as e: + error_msg = f"Failed to retrieve current page content: {e}" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + # Smart template insertion/replacement using wikitextparser + try: + import wikitextparser as wtp + + # Parse the current page content + parsed_content = wtp.parse(current_content) + + # Find existing infobox templates + existing_infoboxes = [] + for template in parsed_content.templates: + template_name = template.name.strip() + # Check for common Arabic infobox template names + if any(infobox_name in template_name.lower() for infobox_name in [ + 'صندوق', 'infobox', 'سيرة', 'biography', 'person', 'football' + ]): + existing_infoboxes.append(template) + + if existing_infoboxes: + # Remove existing infoboxes + logger.info(f"Found {len(existing_infoboxes)} existing infobox(es), removing them") + for infobox in existing_infoboxes: + # Remove the template from the parsed content + infobox.string = '' + + # Clean up empty lines around removed templates + new_content = str(parsed_content) + new_content = '\n'.join(line for line in new_content.split('\n') if line.strip() or line == '') + + # Insert new template at the beginning + final_content = template_text + '\n\n' + new_content.strip() + logger.info("Replaced existing infobox with new template") + else: + # No existing infobox, add template at the beginning + final_content = template_text + '\n\n' + current_content.strip() + logger.info("Added new template at the beginning of the page") + + # Set the final content + page.text = final_content + logger.info(f"Set new page content (length: {len(final_content)})") + + # Save the page + page.save(summary=edit_summary, minor=False) + revision_id = page.latest_revision_id + + logger.info(f"Successfully published template to: {arabic_page_title}") + logger.info(f"Revision ID: {revision_id}") + + return PublishResult( + success=True, + page_title=arabic_page_title, + edit_summary=edit_summary, + revision_id=revision_id, + metadata={ + 'template_length': len(template_text), + 'site': 'ar.wikipedia.org', + 'published_at': 
page.editTime().isoformat() if hasattr(page, 'editTime') else None + } + ) + + except Exception as e: + error_msg = f"Failed to save page: {e}" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + except Exception as e: + error_msg = f"Unexpected error during publish operation: {e}" + logger.error(error_msg) + return PublishResult( + success=False, + page_title=arabic_page_title, + edit_summary=edit_summary, + errors=[error_msg] + ) + + +def publish_data(translated_data: Dict[str, Any], + arabic_page_title: str, + edit_summary: str = "تحديث قالب السيرة الذاتية باستخدام InfoboxSync") -> PublishResult: + """ + Convenience function to publish translated data to Arabic Wikipedia. + + Args: + translated_data (Dict[str, Any]): Translated data with arabic_template + arabic_page_title (str): Arabic page title to publish to + edit_summary (str): Edit summary for the edit + + Returns: + PublishResult: Publish operation result + """ + return publish_arabic_template(translated_data, arabic_page_title, edit_summary) + + +def validate_publish_data(translated_data: Dict[str, Any], arabic_page_title: str) -> Dict[str, Any]: + """ + Validate data before publishing. + + Args: + translated_data (Dict[str, Any]): Data to validate + arabic_page_title (str): Target page title + + Returns: + Dict with validation results + """ + errors = [] + warnings = [] + + # Check arabic_template + if 'arabic_template' not in translated_data: + errors.append("Missing arabic_template in translated_data") + elif not translated_data['arabic_template'] or not translated_data['arabic_template'].strip(): + errors.append("arabic_template is empty") + elif not translated_data['arabic_template'].startswith('{{'): + warnings.append("Template doesn't start with '{{' - may not be a valid wiki template") + + # Check arabic_page_title + if not arabic_page_title or not arabic_page_title.strip(): + errors.append("Arabic page title is empty") + elif len(arabic_page_title) > 255: + errors.append("Arabic page title is too long (>255 characters)") + + return { + 'valid': len(errors) == 0, + 'errors': errors, + 'warnings': warnings, + 'arabic_page_title': arabic_page_title, + 'has_template': 'arabic_template' in translated_data + } \ No newline at end of file diff --git a/tasks/InfoboxSync/save/__init__.py b/tasks/InfoboxSync/save/__init__.py new file mode 100644 index 00000000..446de7ff --- /dev/null +++ b/tasks/InfoboxSync/save/__init__.py @@ -0,0 +1 @@ +# Save stage package \ No newline at end of file diff --git a/tasks/InfoboxSync/save/save.py b/tasks/InfoboxSync/save/save.py new file mode 100644 index 00000000..fb2bb3ce --- /dev/null +++ b/tasks/InfoboxSync/save/save.py @@ -0,0 +1,37 @@ +import logging +import json +import os + +logger = logging.getLogger(__name__) + + +def save_data(translated_data: dict, output_dir: str = 'output') -> str: + """ + Save the translated data to a file. + + Args: + translated_data (dict): The translated data from the translate stage. + output_dir (str): Directory to save the data (default: 'output'). + + Returns: + str: Path to the saved file. 
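+
+    Example (illustrative)::
+
+        path = save_data({'page_title': 'Some Player'})
+        # -> 'output/some_player.json'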
+ """ + logger.info(f"Starting data save to {output_dir}") + try: + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + # Generate filename based on page title + title = translated_data.get('page_title', 'unknown') + filename = f"{title.replace(' ', '_').lower()}.json" + filepath = os.path.join(output_dir, filename) + + # Save data as JSON + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(translated_data, f, indent=2, ensure_ascii=False) + + logger.info(f"Successfully saved data to: {filepath}") + return filepath + except Exception as e: + logger.error(f"Error saving data: {e}") + raise \ No newline at end of file diff --git a/tasks/InfoboxSync/test.py b/tasks/InfoboxSync/test.py new file mode 100644 index 00000000..a99dfd70 --- /dev/null +++ b/tasks/InfoboxSync/test.py @@ -0,0 +1,181 @@ +import logging +from fetch import fetch_wikipedia_data +from parse.parse import parse_data +from map.map import map_data +from translate.translate import translate_data +from construct.build import construct_arabic_template +from publish.publish import publish_data +from save.save import save_data +from wikilocalize.integrator import process_construct_to_publish +# Configure logging +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) + +logger = logging.getLogger(__name__) + + +def run_wikipedia_pipeline(ar_page_title: str, target_lang: str = 'ar', + output_dir: str = 'output', + template_type: str = 'football_biography') -> str: + """ + Run the complete Wikipedia infobox sync pipeline. + + Args: + ar_page_title (str): Arabic Wikipedia page title to sync. + target_lang (str): Target language for translation (default: 'ar'). + output_dir (str): Directory to save the processed data. + template_type (str): Type of template to parse and map. + + Returns: + str: Path to the saved file. 
+ """ + msg = f"Starting Wikipedia InfoboxSync pipeline for: {ar_page_title}" + logger.info(msg) + + try: + # Stage 1: Fetch Wikipedia data + logger.info("Pipeline stage: Fetch Wikipedia data") + wiki_data = fetch_wikipedia_data(ar_page_title) + + if not wiki_data['sync_possible']: + error_msg = wiki_data.get('error', 'Unknown error occurred') + logger.error(f"Cannot proceed with pipeline: {error_msg}") + raise ValueError(error_msg) + + # Extract English page content for processing + en_page_info = wiki_data['english'] + if not en_page_info or not en_page_info.content: + msg = "No English page content available for processing" + raise ValueError(msg) + + # Convert page info to dictionary format expected by parse stage + raw_data = { + 'title': en_page_info.title, + 'content': en_page_info.content, + 'arabic_title': wiki_data['arabic'].title, + 'langlinks': en_page_info.langlinks or {} + } + + # Stage 2: Parse + logger.info("Pipeline stage: Parse") + parsed_data = parse_data(raw_data, template_type) + + # Stage 3: Map + logger.info("Pipeline stage: Map") + mapped_data = map_data(parsed_data, template_type) + + # Stage 4: Translate + logger.info("Pipeline stage: Translate") + translated_data = translate_data(mapped_data, target_lang) + + # Stage 5: Build Arabic Template + logger.info("Pipeline stage: Construct Arabic Template") + build_result = construct_arabic_template(translated_data, template_type) + + if not build_result.success: + error_msg = f"Template construction failed: {build_result.errors}" + logger.error(error_msg) + raise ValueError(error_msg) + + # Add the constructed template to the translated data for saving + translated_data['arabic_template'] = build_result.template_text + translated_data['construct_metadata'] = { + 'template_type': build_result.template_type, + 'field_count': build_result.field_count, + 'builder_name': build_result.metadata.get('builder_name', 'unknown'), + 'template_name': build_result.metadata.get('template_name', 'unknown') + } + # Stage 6: Wiki Localization - Localize links and templates to Arabic equivalents + logger.info("Pipeline stage: Wiki Localization") + localization_result = process_construct_to_publish( + translated_data, # Contains arabic_template from previous step + enable_local_link_replacement=True, + enable_template_localization=True + ) + + if not localization_result.success: + error_msg = f"Wiki localization failed: {localization_result.errors}" + logger.error(error_msg) + raise ValueError(error_msg) + + # Use the localized data for publishing + translated_data = localization_result.localized_data + + # Add localization metadata to the translated data + translated_data['localization_metadata'] = { + 'links_replaced': localization_result.localization_info.original_links_replaced, + 'templates_localized': localization_result.localization_info.templates_localized, + 'waou_templates_inserted': localization_result.localization_info.waou_templates_inserted, + 'localization_errors': localization_result.localization_info.errors + } + + # Stage 6: Publish to Arabic Wikipedia + logger.info("Pipeline stage: Publish to Arabic Wikipedia") + arabic_page_title = wiki_data['arabic'].title + edit_summary = f"تحديث قالب السيرة الذاتية باستخدام InfoboxSync - {template_type}" + + publish_result = publish_data(translated_data, arabic_page_title, edit_summary) + + if not publish_result.success: + error_msg = f"Publishing failed: {publish_result.errors}" + logger.error(error_msg) + raise ValueError(error_msg) + + # Add publish metadata to the translated data 
+        translated_data['publish_metadata'] = {
+            'page_title': publish_result.page_title,
+            'edit_summary': publish_result.edit_summary,
+            'revision_id': publish_result.revision_id,
+            'publish_success': publish_result.success,
+            'published_at': publish_result.metadata.get('published_at')
+        }
+
+        # Stage 8: Save
+        logger.info("Pipeline stage: Save")
+        saved_path = save_data(translated_data, output_dir)
+
+        msg = f"Data saved to: {saved_path}"
+        logger.info(f"Pipeline completed successfully. {msg}")
+        return saved_path
+
+    except Exception as e:
+        logger.error(f"Pipeline failed: {e}")
+        raise
+
+
+def run_pipeline(url: str, target_lang: str = 'ar', output_dir: str = 'output') -> str:
+    """
+    Legacy function for backward compatibility.
+    Now extracts page title from Wikipedia URL and calls new pipeline.
+    """
+    msg = ("run_pipeline(url) is deprecated. Use "
+           "run_wikipedia_pipeline(page_title) instead.")
+    logger.warning(msg)
+
+    if 'wikipedia.org' in url and '/wiki/' in url:
+        page_title = url.split('/wiki/')[-1].replace('_', ' ')
+        return run_wikipedia_pipeline(page_title, target_lang, output_dir)
+    else:
+        msg = ("URL must be a Wikipedia page URL "
+               "(e.g., https://en.wikipedia.org/wiki/Page_Title)")
+        raise ValueError(msg)
+
+
+if __name__ == "__main__":
+    # Example usage with Arabic page title
+    example_arabic_page = "خير الدين مضوي"  # Football player in Arabic
+    try:
+        result_path = run_wikipedia_pipeline(example_arabic_page, target_lang='ar')
+        print(f"Pipeline result saved to: {result_path}")
+    except Exception as e:
+        print(f"Pipeline execution failed: {e}")
+
+    # Alternative: Example with English page title (for testing)
+    # example_english_page = "Egypt"
+    # try:
+    #     result_path = run_wikipedia_pipeline(example_english_page)
+    #     print(f"Pipeline result saved to: {result_path}")
+    # except Exception as e:
+    #     print(f"Pipeline execution failed: {e}")
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/README.md b/tasks/InfoboxSync/translate/README.md
new file mode 100644
index 00000000..975146c5
--- /dev/null
+++ b/tasks/InfoboxSync/translate/README.md
@@ -0,0 +1,360 @@
+# Translation Stage - LiteLLM + Google Gemini AI
+
+This directory contains the translation stage implementation for the InfoboxSync pipeline, featuring AI-powered translation using LiteLLM and Google Gemini AI.
+
+## Overview
+
+The translation stage translates English Wikipedia infobox data to Arabic using advanced AI models. It follows the Strategy Pattern for extensibility and includes comprehensive error handling and fallback mechanisms.
+
+## Architecture
+
+### Core Components
+
+1. **`base_translator.py`** - Abstract base classes and factory pattern
+2. **`gemini_translator.py`** - Google Gemini AI implementation
+3. **`config.py`** - Configuration management for API keys and settings
+4.
**`translate.py`** - Main translation interface and pipeline integration + +### Design Patterns Used + +- **Strategy Pattern**: Different translation services (Gemini, future services) +- **Factory Pattern**: Creation of appropriate translation services +- **Template Method**: Consistent translation workflow across services + +## Features + +### AI-Powered Translation +- Uses Google Gemini AI via LiteLLM for high-quality translations +- Supports both template-level and field-by-field translation +- Intelligent handling of different field types (text, numbers, links, images) + +### Smart Field Handling +- **Text Fields**: Translated naturally while preserving meaning +- **Number Fields**: Kept in original form (heights, statistics, etc.) +- **Link Fields**: Preserved as-is with proper formatting +- **Image Fields**: Maintained without translation +- **Numbered Fields**: Translated individually while maintaining sequence + +### Error Handling & Fallbacks +- Graceful degradation when API is unavailable +- Automatic fallback to field-by-field translation +- Comprehensive error logging and metadata +- Service availability checking + +### Configuration Management +- Environment variable support for API keys +- Flexible configuration system +- Support for multiple API key sources + +## Installation + +1. Install LiteLLM: +```bash +pip install litellm +``` + +2. Set up your Google AI API key: +```bash +export GEMINI_API_KEY="your-google-ai-api-key-here" +# OR +export GOOGLE_AI_API_KEY="your-google-ai-api-key-here" +``` + +## Usage + +### Basic Usage + +```python +from translate.translate import translate_data + +# Your mapped data from the map stage +mapped_data = { + 'page_title': 'Player Name', + 'arabic_fields': { + 'الاسم': {'value': 'John Doe', 'type': 'text'}, + 'الطول': {'value': '1.80 m', 'type': 'number'}, + # ... 
more fields + } +} + +# Translate to Arabic (default) +result = translate_data(mapped_data) + +if result['translation_metadata']['success']: + translated_fields = result['translated_fields'] + print(f"Translated {result['translation_metadata']['translated_fields']} fields") +else: + print(f"Translation failed: {result['translation_metadata']['error']}") +``` + +### Advanced Usage + +```python +# Specify translation service +result = translate_data(mapped_data, service_name='gemini', target_lang='ar') + +# Use field-by-field translation (alternative method) +from translate.translate import translate_field_by_field +result = translate_field_by_field(mapped_data, target_lang='ar') +``` + +### Service Management + +```python +from translate.translate import get_available_translation_services, test_translation_service + +# List available services +services = get_available_translation_services() +print(f"Available: {services}") + +# Test if a service is working +is_working = test_translation_service('gemini') +print(f"Gemini available: {is_working}") +``` + +## Configuration + +### Environment Variables + +- `GEMINI_API_KEY` - Google AI API key (preferred) +- `GOOGLE_AI_API_KEY` - Alternative Google AI API key +- `TRANSLATION_DEFAULT_SERVICE` - Default translation service ('gemini') +- `TRANSLATION_ENABLE_CACHING` - Enable/disable caching ('true'/'false') +- `TRANSLATION_CACHE_MAX_SIZE` - Maximum cache size (default: 1000) + +### Configuration File + +You can also use a JSON configuration file: + +```json +{ + "gemini": { + "model": "gemini/gemini-1.5-flash", + "temperature": 0.3, + "api_key": "your-api-key-here" + }, + "default_service": "gemini" +} +``` + +```python +from translate.config import setup_translation_config +config = setup_translation_config('/path/to/config.json') +``` + +## Data Flow + +### Input Data Structure +```python +{ + 'page_title': 'English Title', + 'arabic_fields': { + 'arabic_field_name': { + 'value': 'English value', + 'type': 'text|number|link|image|numbered', + 'validation': {...} + } + }, + 'arabic_title': 'Arabic Title' +} +``` + +### Output Data Structure +```python +{ + 'page_title': 'English Title', + 'arabic_fields': {...}, # Original fields + 'translated_fields': { + 'arabic_field_name': { + 'value': 'English value', + 'translated_value': 'Arabic translation', + 'translation_confidence': 0.9, + 'type': 'text' + } + }, + 'translation_metadata': { + 'service': 'Google Gemini AI', + 'target_language': 'ar', + 'translation_method': 'template_translation', + 'total_fields': 10, + 'translated_fields': 8, + 'success': True + }, + 'translated_title': 'Arabic Title' +} +``` + +## Translation Methods + +### 1. Template Translation (Default) +- Sends entire infobox as context to AI +- Maintains relationships between fields +- More accurate for complex templates +- Better handling of numbered sequences + +### 2. Field-by-Field Translation +- Translates each field individually +- Faster for simple cases +- Easier to debug +- Good fallback when template translation fails + +## Prompt Engineering + +The Gemini translator uses carefully crafted prompts: + +### Infobox Translation Prompt +```python +prompt = f"""You are a professional translator specializing in Wikipedia infobox content. + +Please translate the following infobox data from English to Arabic. The data contains field names in Arabic and their corresponding values in English. 
+ +INSTRUCTION: +- Translate ONLY the VALUES (not the Arabic field names) +- Maintain the exact structure and format +- For numbered fields (arrays), translate each item individually +- Keep technical terms, proper names, and numbers in their original form when appropriate +- Ensure the translation is natural and appropriate for Wikipedia content + +FIELDS TO TRANSLATE: +{fields_text} + +Please provide the translated infobox in the following JSON format: +{{ + "translated_fields": {{ + "field_name_1": "translated_value_1", + "field_name_2": "translated_value_2", + ... + }}, + "translation_metadata": {{ + "total_fields": number, + "translated_fields": number, + "skipped_fields": number + }} +}} + +IMPORTANT: Only output valid JSON, no additional text or explanations.""" +``` + +## Error Handling + +### Common Error Scenarios + +1. **Missing API Key** + - Returns error metadata + - Logs warning message + - Doesn't crash the pipeline + +2. **API Rate Limiting** + - Automatic retry with exponential backoff + - Graceful degradation to field-by-field translation + +3. **Invalid JSON Response** + - Fallback to field-by-field translation + - Logs parsing errors for debugging + +4. **Network Issues** + - Timeout handling + - Retry mechanisms + - Error metadata for pipeline continuation + +### Fallback Strategy + +1. **Primary**: Template-level translation with Gemini +2. **Fallback 1**: Field-by-field translation with Gemini +3. **Fallback 2**: Return original data with error metadata + +## Testing + +Run the test script to verify functionality: + +```bash +python test_translation.py +``` + +The test script demonstrates: +- Service availability checking +- Error handling without API keys +- Full translation workflow with API keys +- Field-by-field translation comparison + +## Performance Considerations + +### Caching +- Translation results can be cached to reduce API calls +- Configurable cache size and TTL +- Cache keys based on field content + +### Optimization +- Batch translation for multiple fields +- Intelligent field type detection +- Minimal API calls for unchanged content + +## Future Enhancements + +### Additional Services +- OpenAI GPT models +- Microsoft Translator +- DeepL Pro +- Custom fine-tuned models + +### Advanced Features +- Translation memory for repeated phrases +- Glossary support for domain-specific terms +- Quality scoring and confidence metrics +- Multi-language support + +### Integration Improvements +- Async translation for better performance +- Streaming responses for large infoboxes +- Cost optimization and usage tracking + +## Troubleshooting + +### Common Issues + +1. **"litellm not installed"** + ```bash + pip install litellm + ``` + +2. **"No API key provided"** + ```bash + export GEMINI_API_KEY="your-key-here" + ``` + +3. **"Translation service not available"** + - Check API key validity + - Verify network connectivity + - Check API quota/limits + +4. **JSON parsing errors** + - Usually indicates AI response format issues + - Automatically falls back to field-by-field translation + - Check logs for response content + +### Debug Mode + +Enable detailed logging: +```python +import logging +logging.basicConfig(level=logging.DEBUG) +``` + +## Contributing + +To add new translation services: + +1. Create new translator class inheriting from `TranslationService` +2. Implement required abstract methods +3. Register service in factory: `TranslationServiceFactory.register_service(name, class)` +4. 
Add service configuration in `config.py`
+
+Example:
+```python
+class CustomTranslator(TranslationService):
+    def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        # Your implementation
+        pass
+
+# Register
+TranslationServiceFactory.register_service("custom", CustomTranslator)
+```
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/__init__.py b/tasks/InfoboxSync/translate/__init__.py
new file mode 100644
index 00000000..47c3df3a
--- /dev/null
+++ b/tasks/InfoboxSync/translate/__init__.py
@@ -0,0 +1,25 @@
+# Translate stage package
+
+# Import base classes and factory
+from .base_translator import TranslationService, TranslationServiceFactory, TranslationResult
+
+# Import configuration
+from .config import get_translation_config, setup_translation_config
+
+# Import translation services (this ensures they are registered)
+from . import gemini_translator
+
+# Import main translation function
+from .translate import translate_data, translate_field_by_field, get_available_translation_services, test_translation_service
+
+__all__ = [
+    'TranslationService',
+    'TranslationServiceFactory',
+    'TranslationResult',
+    'get_translation_config',
+    'setup_translation_config',
+    'translate_data',
+    'translate_field_by_field',
+    'get_available_translation_services',
+    'test_translation_service'
+]
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/base_translator.py b/tasks/InfoboxSync/translate/base_translator.py
new file mode 100644
index 00000000..ba4ad115
--- /dev/null
+++ b/tasks/InfoboxSync/translate/base_translator.py
@@ -0,0 +1,126 @@
+"""
+Base translation service interface following Strategy Pattern.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, List, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class TranslationResult:
+    """Result of a translation operation."""
+
+    def __init__(self,
+                 translated_text: str,
+                 original_text: str,
+                 confidence: float = 1.0,
+                 metadata: Optional[Dict[str, Any]] = None):
+        self.translated_text = translated_text
+        self.original_text = original_text
+        self.confidence = confidence
+        self.metadata = metadata or {}
+
+
+class TranslationService(ABC):
+    """Abstract base class for translation services."""
+
+    def __init__(self, source_lang: str = 'en', target_lang: str = 'ar'):
+        self.source_lang = source_lang
+        self.target_lang = target_lang
+
+    @abstractmethod
+    def translate_text(self, text: str, **kwargs) -> TranslationResult:
+        """
+        Translate a single text string.
+
+        Args:
+            text (str): Text to translate
+            **kwargs: Additional parameters for translation
+
+        Returns:
+            TranslationResult: Translation result
+        """
+        pass
+
+    @abstractmethod
+    def translate_field(self, field_name: str, field_value: Any, **kwargs) -> TranslationResult:
+        """
+        Translate a field name and value pair.
+
+        Args:
+            field_name (str): Name of the field
+            field_value (Any): Value of the field
+            **kwargs: Additional parameters
+
+        Returns:
+            TranslationResult: Translation result
+        """
+        pass
+
+    @abstractmethod
+    def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]:
+        """
+        Translate an entire infobox template.
+ + Args: + infobox_data (Dict[str, Any]): Infobox data with Arabic field names + **kwargs: Additional parameters + + Returns: + Dict[str, Any]: Translated infobox data + """ + pass + + @abstractmethod + def is_available(self) -> bool: + """Check if the translation service is available and properly configured.""" + pass + + @abstractmethod + def get_service_name(self) -> str: + """Get the name of this translation service.""" + pass + + +class TranslationServiceFactory: + """Factory for creating translation services.""" + + _services = {} + + @classmethod + def register_service(cls, service_name: str, service_class): + """Register a new translation service.""" + cls._services[service_name] = service_class + + @classmethod + def create_service(cls, service_name: str, **kwargs) -> TranslationService: + """ + Create a translation service instance. + + Args: + service_name (str): Name of the service to create + **kwargs: Parameters for service initialization + + Returns: + TranslationService: Service instance + + Raises: + ValueError: If service is not registered or creation fails + """ + if service_name not in cls._services: + available_services = list(cls._services.keys()) + raise ValueError(f"Unknown translation service: {service_name}. " + f"Available services: {available_services}") + + service_class = cls._services[service_name] + try: + return service_class(**kwargs) + except Exception as e: + raise ValueError(f"Failed to create {service_name} service: {e}") + + @classmethod + def get_available_services(cls) -> List[str]: + """Get list of available translation services.""" + return list(cls._services.keys()) \ No newline at end of file diff --git a/tasks/InfoboxSync/translate/config.py b/tasks/InfoboxSync/translate/config.py new file mode 100644 index 00000000..8c402337 --- /dev/null +++ b/tasks/InfoboxSync/translate/config.py @@ -0,0 +1,120 @@ +""" +Configuration for translation services. +""" + +import os +import logging +from typing import Optional, Dict, Any + +logger = logging.getLogger(__name__) + + +class TranslationConfig: + """Configuration manager for translation services.""" + + # Default configuration + DEFAULT_CONFIG = { + 'gemini': { + 'model': 'gemini/gemini-2.0-flash', + 'temperature': 0.3, + # 'max_tokens': 2000, + 'api_key_env_vars': ['GEMINI_API_KEY', 'GOOGLE_AI_API_KEY'] + }, + 'default_service': 'gemini', + 'fallback_service': None, + 'enable_caching': True, + 'cache_max_size': 1000, + 'request_timeout': 30, + 'retry_attempts': 3, + 'retry_delay': 1.0 + } + + def __init__(self, config_file: Optional[str] = None): + """ + Initialize configuration. 
+
+        Args:
+            config_file (Optional[str]): Path to configuration file
+        """
+        # Copy nested service dicts too, so instances never mutate the
+        # shared DEFAULT_CONFIG class attribute (e.g. when storing api_key).
+        self.config = {key: (value.copy() if isinstance(value, dict) else value)
+                       for key, value in self.DEFAULT_CONFIG.items()}
+        self._load_from_env()
+        if config_file and os.path.exists(config_file):
+            self._load_from_file(config_file)
+
+    def _load_from_env(self):
+        """Load configuration from environment variables."""
+        # API Keys
+        for service, service_config in self.config.items():
+            if isinstance(service_config, dict) and 'api_key_env_vars' in service_config:
+                for env_var in service_config['api_key_env_vars']:
+                    api_key = os.getenv(env_var)
+                    if api_key:
+                        self.config[service]['api_key'] = api_key
+                        logger.info(f"Loaded API key for {service} from {env_var}")
+                        break
+
+        # Other environment variables
+        if os.getenv('TRANSLATION_DEFAULT_SERVICE'):
+            self.config['default_service'] = os.getenv('TRANSLATION_DEFAULT_SERVICE')
+
+        if os.getenv('TRANSLATION_ENABLE_CACHING') == 'false':
+            self.config['enable_caching'] = False
+
+        if os.getenv('TRANSLATION_CACHE_MAX_SIZE'):
+            try:
+                self.config['cache_max_size'] = int(os.getenv('TRANSLATION_CACHE_MAX_SIZE'))
+            except ValueError:
+                pass
+
+    def _load_from_file(self, config_file: str):
+        """Load configuration from file."""
+        try:
+            import json
+            with open(config_file, 'r', encoding='utf-8') as f:
+                file_config = json.load(f)
+            self._merge_config(file_config)
+            logger.info(f"Loaded configuration from {config_file}")
+        except Exception as e:
+            logger.warning(f"Failed to load configuration from {config_file}: {e}")
+
+    def _merge_config(self, new_config: Dict[str, Any]):
+        """Merge new configuration with existing."""
+        for key, value in new_config.items():
+            if isinstance(value, dict) and key in self.config:
+                self.config[key].update(value)
+            else:
+                self.config[key] = value
+
+    def get_service_config(self, service_name: str) -> Dict[str, Any]:
+        """Get configuration for a specific service."""
+        return self.config.get(service_name, {})
+
+    def get_default_service(self) -> str:
+        """Get default translation service."""
+        return self.config['default_service']
+
+    def has_api_key(self, service_name: str) -> bool:
+        """Check if API key is available for service."""
+        service_config = self.get_service_config(service_name)
+        return 'api_key' in service_config and service_config['api_key']
+
+    def get_api_key(self, service_name: str) -> Optional[str]:
+        """Get API key for service."""
+        service_config = self.get_service_config(service_name)
+        return service_config.get('api_key')
+
+
+# Global configuration instance
+translation_config = TranslationConfig()
+
+
+def get_translation_config() -> TranslationConfig:
+    """Get global translation configuration."""
+    return translation_config
+
+
+def setup_translation_config(config_file: Optional[str] = None) -> TranslationConfig:
+    """Setup translation configuration."""
+    global translation_config
+    translation_config = TranslationConfig(config_file)
+    return translation_config
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/gemini_translator.py b/tasks/InfoboxSync/translate/gemini_translator.py
new file mode 100644
index 00000000..ff23f4ea
--- /dev/null
+++ b/tasks/InfoboxSync/translate/gemini_translator.py
@@ -0,0 +1,332 @@
+"""
+Google Gemini AI translation service using LiteLLM.
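+
+Example (illustrative; assumes GEMINI_API_KEY is set in the environment)::
+
+    translator = GeminiTranslator()
+    result = translator.translate_text('Professional footballer')
+    print(result.translated_text)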
+""" + +import os +import json +import logging +from typing import Dict, Any, List, Optional +from .base_translator import TranslationService, TranslationResult, TranslationServiceFactory + +logger = logging.getLogger(__name__) + + +class GeminiTranslator(TranslationService): + """Google Gemini AI translation service using LiteLLM.""" + + def __init__(self, + api_key: Optional[str] = None, + model: str = "gemini/gemini-2.0-flash", + source_lang: str = 'en', + target_lang: str = 'ar', + temperature: float = 0.3, + max_tokens: int = 5000): + """ + Initialize Gemini translator. + + Args: + api_key (Optional[str]): Google AI API key. If None, uses GEMINI_API_KEY env var + model (str): Gemini model to use + source_lang (str): Source language code + target_lang (str): Target language code + temperature (float): Sampling temperature + max_tokens (int): Maximum tokens in response + """ + super().__init__(source_lang, target_lang) + self.api_key = api_key or os.getenv('GEMINI_API_KEY') or os.getenv('GOOGLE_AI_API_KEY') + self.model = model + self.temperature = temperature + self.max_tokens = max_tokens + + if not self.api_key: + logger.warning("No API key provided for Gemini translator") + + # Import litellm here to avoid import errors if not installed + try: + import litellm + self.litellm = litellm + except ImportError: + logger.error("litellm not installed. Install with: pip install litellm") + raise ImportError("litellm is required for GeminiTranslator") + + def _load_prompt_template(self) -> str: + """Load the prompt template from file.""" + template_path = os.path.join(os.path.dirname(__file__), 'prompt_template.txt') + try: + with open(template_path, 'r', encoding='utf-8') as f: + return f.read() + except FileNotFoundError: + logger.warning(f"Prompt template not found at {template_path}, using default template") + return self._get_default_prompt_template() + except Exception as e: + logger.warning(f"Error loading prompt template: {e}, using default template") + return self._get_default_prompt_template() + + def _get_default_prompt_template(self) -> str: + """Get default prompt template if file is not available.""" + return """You are a professional translator specializing in Wikipedia infobox content. + +Translate ALL the following field values from English to Arabic in ONE SINGLE REQUEST. Each field is marked with [index] for identification. + +INSTRUCTION: +- Translate EVERY field value to Arabic +- Keep the [index] markers in your response +- Translate naturally while maintaining meaning +- Keep technical terms, proper names, and numbers in original form when appropriate +- For numbered field items, translate each one individually +- Output in the SAME format with [index] markers + +FIELDS TO TRANSLATE: +{{FIELDS_TEXT}} + +RESPONSE FORMAT: +[{{START_INDEX}}]: translated_value_1 +[{{START_INDEX+1}}]: translated_value_2 +[{{START_INDEX+2}}]: translated_value_3 +...continue for all fields... 
+ +IMPORTANT: Respond with ALL translated fields using the SAME [index] markers.""" + + def _build_prompt_from_template(self, template: str, fields_text: str, start_index: int = 0) -> str: + """Build prompt by replacing placeholders in template.""" + # Replace placeholders + prompt = template.replace('{{FIELDS_TEXT}}', fields_text) + prompt = prompt.replace('{{START_INDEX}}', str(start_index)) + prompt = prompt.replace('{{START_INDEX+1}}', str(start_index + 1)) + prompt = prompt.replace('{{START_INDEX+2}}', str(start_index + 2)) + + return prompt + + def _get_infobox_translation_prompt(self, infobox_data: Dict[str, Any]) -> tuple[str, dict]: + """Generate prompt for single-request infobox translation and return field mapping.""" + # Extract field information and prepare for single translation request + fields_list = [] + field_mapping = {} # Map field index to arabic key + + for idx, (arabic_key, field_data) in enumerate(infobox_data.items()): + if isinstance(field_data, dict) and 'value' in field_data: + value = field_data['value'] + field_type = field_data.get('type', 'text') + + # Handle different field types + if field_type == 'numbered' and isinstance(value, list): + # For numbered fields, prepare each item for translation + for i, item in enumerate(value): + fields_list.append(f"[{idx}_{i}]: {item}") + field_mapping[f"{idx}_{i}"] = (arabic_key, i) + elif field_type in ['number', 'link', 'image']: + # Skip translation for these field types, but keep mapping for reference + field_mapping[str(idx)] = (arabic_key, None) + else: + fields_list.append(f"[{idx}]: {value}") + field_mapping[str(idx)] = (arabic_key, None) + + fields_text = '\n'.join(fields_list) + + # Load template and build prompt + template = self._load_prompt_template() + prompt = self._build_prompt_from_template(template, fields_text, start_index=0) + + return prompt, field_mapping + + def _parse_single_request_response(self, response_text: str, field_mapping: dict) -> Dict[str, Any]: + """Parse the single-request translation response and map back to fields.""" + translated_fields = {} + + # Parse response line by line + lines = response_text.strip().split('\n') + + for line in lines: + line = line.strip() + if not line: + continue + + # Look for [index]: translated_value pattern + if line.startswith('[') and ']:' in line: + try: + index_end = line.find(']:') + index = line[1:index_end].strip() + translated_value = line[index_end + 2:].strip() + + if index in field_mapping: + arabic_key, item_index = field_mapping[index] + + if arabic_key not in translated_fields: + translated_fields[arabic_key] = {} + + if item_index is not None: + # This is part of a numbered field + if 'value' not in translated_fields[arabic_key]: + translated_fields[arabic_key]['value'] = [] + translated_fields[arabic_key]['value'].append(translated_value) + else: + # This is a single field + translated_fields[arabic_key]['value'] = translated_value + + except (ValueError, IndexError) as e: + logger.warning(f"Failed to parse response line: {line} - {e}") + continue + + return translated_fields + + def translate_infobox(self, infobox_data: Dict[str, Any], **kwargs) -> Dict[str, Any]: + """Translate an entire infobox template in ONE SINGLE REQUEST.""" + try: + logger.info(f"Starting single-request infobox translation with {len(infobox_data)} fields") + + # Generate single-request prompt and field mapping + prompt, field_mapping = self._get_infobox_translation_prompt(infobox_data) + + # Make single API call for all fields + response_text = 
self._call_gemini(prompt) + + # Parse the single response + translated_fields = self._parse_single_request_response(response_text, field_mapping) + + # Merge translated fields back into original structure + translated_infobox = {} + for arabic_key, field_data in infobox_data.items(): + if arabic_key in translated_fields: + # Create new field data with translated value + new_field_data = field_data.copy() + new_field_data['translated_value'] = translated_fields[arabic_key]['value'] + new_field_data['translation_confidence'] = 0.9 + translated_infobox[arabic_key] = new_field_data + else: + # Keep original if not translated + translated_infobox[arabic_key] = field_data + + logger.info(f"Successfully translated infobox with {len(translated_fields)} fields in ONE request") + + return { + 'translated_infobox': translated_infobox, + 'translation_metadata': { + 'method': 'single_request', + 'api_calls': 1, + 'total_fields': len(infobox_data), + 'translated_fields': len(translated_fields) + }, + 'original_field_count': len(infobox_data), + 'translated_field_count': len(translated_fields) + } + + except Exception as e: + logger.error(f"Single-request infobox translation failed: {e}") + # Return original data with error metadata + return { + 'translated_infobox': infobox_data, + 'translation_metadata': { + 'method': 'single_request_failed', + 'error': str(e), + 'api_calls': 0 + }, + 'original_field_count': len(infobox_data), + 'translated_field_count': 0 + } + + def _call_gemini(self, prompt: str) -> str: + """Make API call to Gemini via LiteLLM.""" + try: + response = self.litellm.completion( + model=self.model, + messages=[{"role": "user", "content": prompt}], + temperature=self.temperature, + max_tokens=self.max_tokens, + api_key=self.api_key + ) + return response.choices[0].message.content + except Exception as e: + logger.error(f"Gemini API call failed: {e}") + raise + + def translate_text(self, text: str, **kwargs) -> TranslationResult: + """Translate a single text string.""" + try: + prompt = f"Translate the following text from {self.source_lang} to {self.target_lang}:\n\n{text}\n\nTranslation:" + translated_text = self._call_gemini(prompt).strip() + + return TranslationResult( + translated_text=translated_text, + original_text=text, + confidence=0.9, + metadata={"model": self.model, "method": "single_text"} + ) + except Exception as e: + logger.error(f"Text translation failed: {e}") + return TranslationResult( + translated_text=text, + original_text=text, + confidence=0.0, + metadata={"error": str(e)} + ) + + def translate_field(self, field_name: str, field_value: Any, **kwargs) -> TranslationResult: + """Translate a field name and value pair.""" + try: + # Skip translation for certain field types + if isinstance(field_value, dict): + field_type = field_value.get('type', 'text') + value = field_value.get('value', '') + + # Don't translate numbers, links, or images + if field_type in ['number', 'link', 'image']: + return TranslationResult( + translated_text=str(value), + original_text=str(value), + confidence=1.0, + metadata={"skipped": True, "reason": f"field_type_{field_type}"} + ) + else: + value = field_value + + prompt = f"""Translate the following field value to Arabic: + +Field: {field_name} +Value: {value} +Type: text + +INSTRUCTION: +- Translate naturally and maintain meaning +- Keep technical terms and proper names in original form when appropriate +- Output only the translated text, no explanations + +Translated value:""" + + translated_text = self._call_gemini(prompt).strip() + + 
return TranslationResult(
+                translated_text=translated_text,
+                original_text=str(value),
+                confidence=0.9,
+                metadata={"model": self.model, "method": "field_translation"}
+            )
+        except Exception as e:
+            logger.error(f"Field translation failed for {field_name}: {e}")
+            return TranslationResult(
+                translated_text=str(field_value),
+                original_text=str(field_value),
+                confidence=0.0,
+                metadata={"error": str(e)}
+            )
+
+    def is_available(self) -> bool:
+        """Check if Gemini service is available."""
+        if not self.api_key:
+            return False
+
+        try:
+            # Try a simple test call
+            test_prompt = "Say 'OK' if you can understand this message."
+            response = self._call_gemini(test_prompt)
+            return 'OK' in response.upper()
+        except Exception:
+            return False
+
+    def get_service_name(self) -> str:
+        """Get service name."""
+        return "Google Gemini AI"
+
+
+# Register the service
+TranslationServiceFactory.register_service("gemini", GeminiTranslator)
+TranslationServiceFactory.register_service("google_gemini", GeminiTranslator)
\ No newline at end of file
diff --git a/tasks/InfoboxSync/translate/prompt_template.txt b/tasks/InfoboxSync/translate/prompt_template.txt
new file mode 100644
index 00000000..15f68029
--- /dev/null
+++ b/tasks/InfoboxSync/translate/prompt_template.txt
@@ -0,0 +1,125 @@
+You are a professional translator specializing in Wikipedia infobox content.
+
+STRICT TRANSLATION RULES - MUST FOLLOW WITHOUT EXCEPTION:
+
+CONTENT TYPE HANDLING:
+
+* PLAIN TEXT:
+  - DO: Translate descriptively and naturally
+  - DON'T: Don't skip or ignore any text
+  Examples:
+    "Professional footballer" -> "لاعب كرة قدم محترف"
+    "American actor and comedian" -> "ممثل وكوميدي أمريكي"
+    "Award-winning journalist" -> "صحفي حاصل على جوائز"
+    "Environmental consultant" -> "مستشار بيئي"
+
+* EXTERNAL LINKS:
+  - DO: Keep the exact URL format unchanged, translate only the display text
+  - DON'T: Never modify the URL or format
+  Examples:
+    [http://www.example.com Football website] -> [http://www.example.com موقع كرة قدم]
+    [https://news.bbc.co.uk Football news] -> [https://news.bbc.co.uk أخبار كرة قدم]
+    [http://football.com/transfers Latest transfers] -> [http://football.com/transfers أحدث الانتقالات]
+    [http://wikipedia.org Wikipedia] -> [http://wikipedia.org ويكيبيديا]
+
+* WIKI LINKS:
+  - DO: Keep the link target exactly as is, translate ONLY the display text
+  - DON'T: Don't change the link target/URL or syntax
+  Examples:
+    [[Manchester_United|Manchester United F.C.]] -> [[Manchester_United|مانشستر يونايتد]]
+    [[FC_Bayern_Munich|Bayern Munich]] -> [[FC_Bayern_Munich|بايرن ميونخ]]
+    [[Barcelona_SC|Club Atlético Barcelona]] -> [[Barcelona_SC|برشلونة الرياضي]]
+    [[Premier_League|English Premier League]] -> [[Premier_League|الدوري الإنجليزي الممتاز]]
+
+* TEMPLATES:
+  - DO: Keep template name and syntax intact, translate ONLY human-readable text parameters
+  - DON'T: Don't change template structure, numbers, or technical parameters
+  - IMPORTANT: Template NAMES (like 'birth date', 'convert') must NEVER be translated
+  - CRITICAL: Only translate template parameter VALUES if they are human-readable text
+  - NOTE: When translating to another language, use the equivalent template name for that language (e.g., English 'flag' templates may become Arabic 'علم' templates)
+  Examples:
+    {{birth date|1990|5|15}} -> {{birth date|1990|5|15}}
+    {{convert|175|cm|ft}} -> {{convert|175|cm|ft}}
+    {{cite web|title=News article}} -> {{cite web|title=مقالة إخبارية}}
+    {{flagicon|USA}} -> {{علم الولايات المتحدة}}
+
+* NUMBERS & MEASURES:
+  - DO: Keep ALL numbers, decimals, and symbols unchanged, translate ONLY units and suffixes
+  - DON'T: Don't modify any numerical values or punctuation
+  Examples:
+    1.84 m -> 1.84 متر
+    25 years old -> 25 عامًا
+    150 kg -> 150 كيلوغرام
+    $100,000 -> 100,000 دولار أمريكي
+
+* RAW TEXT:
+  - DO: Treat entirely as plain text and translate all contents
+  - DON'T: Don't leave any part untranslated
+  Examples:
+    "Barcelona, Spain" -> "برشلونة، إسبانيا"
+    "born in Madrid" -> "ولد في مدريد"
+    "New York City" -> "مدينة نيويورك"
+    "Los Angeles, California" -> "لوس أنجلوس، كاليفورنيا"
+
+
+Translate ALL the following field values from English to Arabic in ONE SINGLE REQUEST. Each field is marked with [index] for identification.
+
+INSTRUCTION:
+- Translate EVERY field value to Arabic
+- Keep the [index] markers in your response
+- Translate naturally while maintaining meaning
+- Keep technical terms, proper names, and numbers in original form when appropriate
+- For numbered field items, translate each one individually
+- Output in the SAME format with [index] markers
+
+COMPOUND/COMPLEX TEXT HANDLING:
+- DO: When text contains multiple content types, process EACH PART based on the basic content type rules
+- DON'T: Don't treat compound text as a single unit - break it down and handle each element according to its type
+
+COMPOUND TEXT EXAMPLE:
+"[[Manchester United]] is a football club founded in [[1902]]"
+-> Break down: "[[Manchester United]]" translated using the WIKI LINKS rule (translate display, keep target)
++ " is a football club founded in " translated as PLAIN TEXT
++ "1902" translated using the NUMBERS rule (keep unchanged)
+-> Result: "[[Manchester United|مانشستر يونايتد]] هو نادي كرة قدم تأسس في [[1902]]"
+
+Text with links and plain text in templates must follow all the above rules simultaneously.
+
+
+FOOTBALL/MANAGERIAL TERMS TRANSLATION:
+- DO: Use these exact translations for common football positions and roles
+- DON'T: Don't improvise translations for these standard terms
+
+STANDARD FOOTBALL TRANSLATIONS:
+loan = إعارة
+manager = مدرب
+head coach = مدرب
+on loan from = معارًا من
+interim/caretaker = مؤقت
+scout = كشاف
+football director = مدير رياضي
+assistant = مساعد
+goalkeeping coach = مدرب حراس
+fitness coach = معد بدني
+coordinator = منسق
+player and individual coach = لاعب ومدرب
+assistant coach = مساعد مدرب
+
+EXAMPLES:
+- "Head Coach: John Smith" -> "المدرب: جون سميث"
+- "Goalkeeper Coach: Mike Johnson" -> "مدرب الحراس: مايك جونسون"
+- "Fitness Coach: David Brown" -> "المعد البدني: ديفيد براون"
+- "On loan from Manchester United" -> "معارًا من مانشستر يونايتد"
+- "Assistant Coach: Sarah Wilson" -> "مساعد المدرب: سارة ويلسون"
+
+FIELDS TO TRANSLATE:
+{{FIELDS_TEXT}}
+
+RESPONSE FORMAT:
+[{{START_INDEX}}]: translated_value_1
+[{{START_INDEX+1}}]: translated_value_2
+[{{START_INDEX+2}}]: translated_value_3
+...continue for all fields...
+
+IMPORTANT: Respond with ALL translated fields using the SAME [index] markers.
\ No newline at end of file diff --git a/tasks/InfoboxSync/translate/translate.py b/tasks/InfoboxSync/translate/translate.py new file mode 100644 index 00000000..475194fe --- /dev/null +++ b/tasks/InfoboxSync/translate/translate.py @@ -0,0 +1,230 @@ +import logging +from typing import Dict, Any, Optional +from .base_translator import TranslationServiceFactory +from .config import get_translation_config + +logger = logging.getLogger(__name__) + + +def translate_data(mapped_data: dict, target_lang: str = 'ar', + service_name: Optional[str] = None) -> dict: + """ + Translate the mapped data to the target language using AI translation services. + + Args: + mapped_data (dict): The mapped data from the map stage with Arabic field names. + target_lang (str): Target language code (default: 'ar' for Arabic). + service_name (Optional[str]): Translation service to use. If None, uses default. + + Returns: + dict: Translated data with additional translation metadata. + """ + logger.info(f"Starting data translation to {target_lang}") + + try: + # Get configuration + config = get_translation_config() + + # Determine which service to use + if not service_name: + service_name = config.get_default_service() + + logger.info(f"Using translation service: {service_name}") + + # Create translation service + try: + translator = TranslationServiceFactory.create_service( + service_name, + source_lang='en', + target_lang=target_lang + ) + except Exception as e: + logger.error(f"Failed to create translation service {service_name}: {e}") + # Return original data with error metadata + return _add_translation_error(mapped_data, str(e)) + + # Check if service is available + if not translator.is_available(): + error_msg = f"Translation service {service_name} is not available" + logger.error(error_msg) + return _add_translation_error(mapped_data, error_msg) + + # Extract infobox data for translation + arabic_fields = mapped_data.get('arabic_fields', {}) + if not arabic_fields: + logger.warning("No Arabic fields found in mapped data") + return _add_translation_metadata(mapped_data, {}, "no_fields") + + logger.info(f"Translating {len(arabic_fields)} fields") + + # Translate the infobox data + translation_result = translator.translate_infobox(arabic_fields) + + # Process translation results + translated_infobox = translation_result.get('translated_infobox', {}) + translation_metadata = translation_result.get('translation_metadata', {}) + + # Build the final translated data structure + translated_data = mapped_data.copy() + translated_data['translated_fields'] = translated_infobox + translated_data['translation_metadata'] = { + 'service': translator.get_service_name(), + 'target_language': target_lang, + 'translation_method': translation_metadata.get('method', 'unknown'), + 'total_fields': translation_result.get('original_field_count', 0), + 'translated_fields': translation_result.get('translated_field_count', 0), + 'success': True + } + + # Update page title if it's in English and we have an Arabic title + if 'arabic_title' in mapped_data and mapped_data['arabic_title']: + translated_data['translated_title'] = mapped_data['arabic_title'] + + logger.info(f"Successfully translated data for: {mapped_data.get('page_title', 'Unknown')}") + logger.info(f"Translation stats: {translation_result.get('translated_field_count', 0)}/" + f"{translation_result.get('original_field_count', 0)} fields translated") + + return translated_data + + except Exception as e: + logger.error(f"Error translating data: {e}") + return 
_add_translation_error(mapped_data, str(e)) + + +def _add_translation_metadata(mapped_data: dict, translation_metadata: dict, + method: str = "unknown") -> dict: + """Add translation metadata to mapped data.""" + translated_data = mapped_data.copy() + translated_data['translation_metadata'] = { + 'service': 'unknown', + 'target_language': 'ar', + 'translation_method': method, + 'success': True, + **translation_metadata + } + return translated_data + + +def _add_translation_error(mapped_data: dict, error_message: str) -> dict: + """Add translation error metadata to mapped data.""" + translated_data = mapped_data.copy() + translated_data['translation_metadata'] = { + 'service': 'unknown', + 'target_language': 'ar', + 'success': False, + 'error': error_message + } + return translated_data + + +def get_available_translation_services() -> list: + """ + Get list of available translation services. + + Returns: + list: List of available service names + """ + try: + return TranslationServiceFactory.get_available_services() + except Exception as e: + logger.error(f"Error getting available services: {e}") + return [] + + +def test_translation_service(service_name: str = 'gemini') -> bool: + """ + Test if a translation service is available and working. + + Args: + service_name (str): Name of the service to test + + Returns: + bool: True if service is available and working + """ + try: + config = get_translation_config() + if not config.has_api_key(service_name): + logger.warning(f"No API key available for {service_name}") + return False + + translator = TranslationServiceFactory.create_service(service_name) + return translator.is_available() + except Exception as e: + logger.error(f"Error testing translation service {service_name}: {e}") + return False + + +def translate_field_by_field(mapped_data: dict, target_lang: str = 'ar', + service_name: Optional[str] = None) -> dict: + """ + Translate data field by field (alternative to template-based translation). + + Args: + mapped_data (dict): The mapped data from the map stage. + target_lang (str): Target language code. + service_name (Optional[str]): Translation service to use. + + Returns: + dict: Translated data with field-by-field results. 
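+
+    Example (illustrative sketch; assumes a configured service and API key)::
+
+        result = translate_field_by_field(mapped_data, target_lang='ar')
+        meta = result['translation_metadata']
+        print(f"{meta['translated_fields']}/{meta['total_fields']} fields translated")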
+ """ + logger.info(f"Starting field-by-field translation to {target_lang}") + + try: + # Get configuration and create translator (same as main function) + config = get_translation_config() + if not service_name: + service_name = config.get_default_service() + + translator = TranslationServiceFactory.create_service( + service_name, + source_lang='en', + target_lang=target_lang + ) + + if not translator.is_available(): + return _add_translation_error(mapped_data, f"Service {service_name} not available") + + arabic_fields = mapped_data.get('arabic_fields', {}) + translated_fields = {} + + # Translate each field individually + for arabic_key, field_data in arabic_fields.items(): + if isinstance(field_data, dict) and 'value' in field_data: + field_type = field_data.get('type', 'text') + value = field_data.get('value', '') + + # Skip certain field types + if field_type in ['number', 'link', 'image']: + translated_fields[arabic_key] = field_data + continue + + # Translate the field value + translation_result = translator.translate_field(arabic_key, value) + + if translation_result.confidence > 0: + new_field_data = field_data.copy() + new_field_data['translated_value'] = translation_result.translated_text + new_field_data['translation_confidence'] = translation_result.confidence + translated_fields[arabic_key] = new_field_data + else: + translated_fields[arabic_key] = field_data + + # Build final result + translated_data = mapped_data.copy() + translated_data['translated_fields'] = translated_fields + translated_data['translation_metadata'] = { + 'service': translator.get_service_name(), + 'target_language': target_lang, + 'translation_method': 'field_by_field', + 'total_fields': len(arabic_fields), + 'translated_fields': len([k for k, v in translated_fields.items() + if isinstance(v, dict) and 'translated_value' in v]), + 'success': True + } + + logger.info("Field-by-field translation completed") + return translated_data + + except Exception as e: + logger.error(f"Error in field-by-field translation: {e}") + return _add_translation_error(mapped_data, str(e)) \ No newline at end of file diff --git a/tasks/InfoboxSync/wikilocalize/README.md b/tasks/InfoboxSync/wikilocalize/README.md new file mode 100644 index 00000000..33c16173 --- /dev/null +++ b/tasks/InfoboxSync/wikilocalize/README.md @@ -0,0 +1,45 @@ +# Wiki Localization Stage + +This stage processes Arabic Wikipedia templates to localize English wiki links and template names to their Arabic equivalents. 
+
+## Purpose
+
+- **Wiki Link Localization**: Converts `[[English Page]]` to `[[Arabic Page]]` when an Arabic equivalent exists
+- **Template Localization**: Converts template names like `{{Infobox}}` to Arabic equivalents like `{{صندوق}}` (implemented in `_localize_template`, currently disabled in the pipeline)
+- **Fallback Handling**: Uses the `{{واو}}` template for English links that don't have Arabic equivalents
+- **Interlanguage Link Support**: Uses the Wikipedia API to find Arabic versions via langlinks
+
+## Features
+
+✅ **Wiki Link Processing**: Extract and replace `[[link|text]]` patterns
+✅ **Template Processing**: Extract and replace `{{template|params}}` patterns
+✅ **Arabic Wikipedia API**: Check page existence on Arabic Wikipedia
+✅ **Interlanguage Retrieval**: Get Arabic equivalents from English wiki langlinks
+✅ **واو Template Fallback**: Automatically insert `{{واو}}` for untranslated links
+✅ **Error Handling**: Comprehensive error reporting and logging
+
+## Usage
+
+```python
+from tasks.InfoboxSync.wikilocalize.wikilocalize import WikiLocalizer
+
+# Process Arabic content with English links
+localizer = WikiLocalizer()
+result = localizer.localize_content(arabic_template_text)
+print(f"Replaced {result.original_links_replaced} links")
+print(f"Inserted {result.waou_templates_inserted} واو templates")
+```
+
+## Pipeline Integration
+
+This stage sits between **construct** (template building) and **publish** (publishing to Wikipedia):
+
+1. **Construct** builds the Arabic template from translated data
+2. **WikiLocalize** processes links/templates to Arabic equivalents
+3. **Publish** sends the localized template to Arabic Wikipedia
+
+## API Integration
+
+- Uses pywikibot against Arabic Wikipedia for existence checking
+- Uses pywikibot against English Wikipedia for langlink retrieval
+- Handles API errors gracefully with fallback behavior
+- Lookup results are not yet cached, so repeated links trigger repeated API calls
\ No newline at end of file
diff --git a/tasks/InfoboxSync/wikilocalize/__init__.py b/tasks/InfoboxSync/wikilocalize/__init__.py
new file mode 100644
index 00000000..88f6106a
--- /dev/null
+++ b/tasks/InfoboxSync/wikilocalize/__init__.py
@@ -0,0 +1,3 @@
+"""
+Wiki localization stage for converting English wiki links and templates to Arabic.
+"""
\ No newline at end of file
diff --git a/tasks/InfoboxSync/wikilocalize/integrator.py b/tasks/InfoboxSync/wikilocalize/integrator.py
new file mode 100644
index 00000000..c902ee7a
--- /dev/null
+++ b/tasks/InfoboxSync/wikilocalize/integrator.py
@@ -0,0 +1,175 @@
+"""
+Integration functions for embedding wiki localization into the InfoboxSync pipeline.
+"""
+
+import logging
+from typing import Dict, Any, Optional
+from dataclasses import dataclass
+from tasks.InfoboxSync.wikilocalize.wikilocalize import WikiLocalizeResult, WikiLocalizer
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class LocalizationProcessingResult:
+    """Result of localization processing in the pipeline."""
+    success: bool
+    localized_data: Dict[str, Any]
+    localization_info: WikiLocalizeResult
+    processing_time: float
+    errors: Optional[list] = None
+
+    def __post_init__(self):
+        if self.errors is None:
+            self.errors = []
+
+
+def process_construct_to_publish(
+    construct_result: Dict[str, Any],
+    enable_local_link_replacement: bool = True,
+    enable_template_localization: bool = True
+) -> LocalizationProcessingResult:
+    """
+    Process data from construct stage through wiki localization for publishing.
+
+    This function sits between construct and publish stages, taking the
+    constructed Arabic template and localizing any English wiki links
+    and templates to their Arabic equivalents.
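+
+    Example (illustrative sketch; ``publish_to_arwiki`` stands in for the
+    pipeline's publish step and is not defined in this module)::
+
+        result = process_construct_to_publish({'arabic_template': text})
+        if result.success:
+            publish_to_arwiki(result.localized_data['arabic_template'])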
+ + Args: + construct_result (Dict[str, Any]): Data from construct stage containing 'arabic_template' + enable_local_link_replacement (bool): Whether to replace English wiki links with Arabic + enable_template_localization (bool): Whether to localize template names + + Returns: + LocalizationProcessingResult: Processed data ready for publishing + """ + import time + start_time = time.time() + + logger.info("Starting wiki localization processing") + + try: + # Check if we have the required input + if 'arabic_template' not in construct_result: + error_msg = "No arabic_template found in construct_result" + logger.error(error_msg) + return LocalizationProcessingResult( + success=False, + localized_data=construct_result, + localization_info=WikiLocalizeResult( + localized_content="", + original_links_replaced=0, + templates_localized=0, + waou_templates_inserted=0, + errors=[error_msg] + ), + processing_time=time.time() - start_time, + errors=[error_msg] + ) + + arabic_content = construct_result['arabic_template'] + if not arabic_content or not arabic_content.strip(): + error_msg = "Arabic template is empty" + logger.error(error_msg) + return LocalizationProcessingResult( + success=False, + localized_data=construct_result, + localization_info=WikiLocalizeResult( + localized_content=arabic_content, + original_links_replaced=0, + templates_localized=0, + waou_templates_inserted=0, + errors=[error_msg] + ), + processing_time=time.time() - start_time, + errors=[error_msg] + ) + + # Initialize localizer + localizer = WikiLocalizer() + + # Perform localization if enabled + if enable_local_link_replacement or enable_template_localization: + localization_result = localizer.localize_content(arabic_content) + + # Update the construct result with localized content + localized_data = construct_result.copy() + localized_data['arabic_template'] = localization_result.localized_content + localized_data['localization_metadata'] = { + 'links_replaced': localization_result.original_links_replaced, + 'templates_localized': localization_result.templates_localized, + 'waou_templates_inserted': localization_result.waou_templates_inserted, + 'localization_errors': localization_result.errors + } + + processing_time = time.time() - start_time + + logger.info("Wiki localization completed successfully") + logger.info(f"- Links replaced: {localization_result.original_links_replaced}") + logger.info(f"- Templates localized: {localization_result.templates_localized}") + logger.info(f"- واو templates inserted: {localization_result.waou_templates_inserted}") + + if localization_result.errors: + logger.warning(f"Localization errors: {localization_result.errors}") + + return LocalizationProcessingResult( + success=len(localization_result.errors) == 0, + localized_data=localized_data, + localization_info=localization_result, + processing_time=processing_time + ) + else: + # Localization disabled, just pass through + logger.info("Wiki localization disabled, passing through data unchanged") + return LocalizationProcessingResult( + success=True, + localized_data=construct_result, + localization_info=WikiLocalizeResult( + localized_content=arabic_content, + original_links_replaced=0, + templates_localized=0, + waou_templates_inserted=0, + errors=[] + ), + processing_time=time.time() - start_time + ) + + except Exception as e: + error_msg = f"Unexpected error during localization processing: {e}" + logger.error(error_msg) + processing_time = time.time() - start_time + + return LocalizationProcessingResult( + success=False, + 
localized_data=construct_result,
+            localization_info=WikiLocalizeResult(
+                localized_content=construct_result.get('arabic_template', ''),
+                original_links_replaced=0,
+                templates_localized=0,
+                waou_templates_inserted=0,
+                errors=[error_msg]
+            ),
+            processing_time=processing_time,
+            errors=[error_msg]
+        )
+
+
+def get_localization_statistics(localization_result: WikiLocalizeResult) -> Dict[str, Any]:
+    """
+    Extract useful statistics from localization results for reporting.
+
+    Args:
+        localization_result (WikiLocalizeResult): Localization result
+
+    Returns:
+        Dict[str, Any]: Statistics dictionary
+    """
+    return {
+        'total_links_processed': localization_result.original_links_replaced + localization_result.waou_templates_inserted,
+        'links_successfully_replaced': localization_result.original_links_replaced,
+        'waou_fallback_templates': localization_result.waou_templates_inserted,
+        'templates_localized': localization_result.templates_localized,
+        'localization_errors': len(localization_result.errors),
+        'success_rate': 'High' if not localization_result.errors else 'Medium' if localization_result.original_links_replaced > 0 else 'Low'
+    }
\ No newline at end of file
diff --git a/tasks/InfoboxSync/wikilocalize/wikilocalize.py b/tasks/InfoboxSync/wikilocalize/wikilocalize.py
new file mode 100644
index 00000000..0b1d95e6
--- /dev/null
+++ b/tasks/InfoboxSync/wikilocalize/wikilocalize.py
@@ -0,0 +1,317 @@
+import logging
+from typing import List, Optional, Any, Tuple
+from dataclasses import dataclass, field
+import wikitextparser as wtp
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class WOWTemplateItem:
+    """Information about a واو template replacement."""
+    link: Any  # Wikilink object from wikitextparser
+    localization_result: 'LangLinkResult'  # Full localization result object
+
+
+@dataclass
+class WikiLocalizeResult:
+    """Result of wiki localization process."""
+    localized_content: str
+    original_links_replaced: int
+    templates_localized: int
+    waou_templates_inserted: int
+    # Defaults let callers (e.g. the integrator) construct error results
+    # without supplying the واو bookkeeping list explicitly
+    wow_templates: List[WOWTemplateItem] = field(default_factory=list)
+    errors: List[str] = field(default_factory=list)
+
+
+@dataclass
+class LangLinkResult:
+    """Result of language link retrieval."""
+    lang: Optional[str] = None
+    ar_page: Optional[str] = None
+    en_page: Optional[str] = None
+
+    def is_empty(self) -> bool:
+        """Check if the result is empty."""
+        return (self.lang is None and self.ar_page is None
+                and self.en_page is None)
+
+
+class WikipediaAPI:
+    """Interface to Wikipedia APIs using pywikibot."""
+
+    @staticmethod
+    def check_arabic_page_exists(page_title: str) -> Optional[str]:
+        """
+        Check if a page exists on Arabic Wikipedia using pywikibot.
+        If it's a redirect, it resolves to the target page.
+
+        Args:
+            page_title (str): Page title to check
+
+        Returns:
+            Optional[str]: The resolved page title if it exists, None otherwise
+        """
+        try:
+            import pywikibot
+
+            # Create Arabic Wikipedia site
+            arabic_site = pywikibot.Site('ar', 'wikipedia')
+
+            # Create page object
+            page = pywikibot.Page(arabic_site, page_title)
+
+            # Resolve redirects recursively
+            seen_titles = set()
+            while page.isRedirectPage():
+                if page.title() in seen_titles:
+                    logger.warning(
+                        f"Circular redirect detected for '{page_title}'")
+                    return None  # Return None for circular redirects
+                seen_titles.add(page.title())
+                page = page.getRedirectTarget()
+
+            if page.exists():
+                return page.title().replace('_', ' ')
+            return None
+
+        except ImportError:
+            logger.warning("pywikibot not available for Arabic page check")
+            return None  # Keep the Optional[str] contract
+        except Exception as e:
+            logger.error(f"Error checking Arabic page existence: {e}")
+            return None  # Keep the Optional[str] contract
+
+    @staticmethod
+    def get_arabic_langlink(en_page_title: str) -> Optional[str]:
+        """
+        Get the Arabic language link for an English Wikipedia page.
+
+        Args:
+            en_page_title (str): English page title
+
+        Returns:
+            Optional[str]: Arabic page title if exists, None otherwise
+        """
+        try:
+            import pywikibot
+
+            # Create English Wikipedia site and get page
+            english_site = pywikibot.Site('en', 'wikipedia')
+
+            # Clean up the page title
+            clean_title = en_page_title.strip()
+            if clean_title.startswith('[[') and clean_title.endswith(']]'):
+                clean_title = clean_title[2:-2]
+            if '|' in clean_title:
+                clean_title = clean_title.split('|')[0]  # Take the target part
+
+            page = pywikibot.Page(english_site, clean_title)
+
+            # Check if page exists on English Wikipedia
+            if not page.exists():
+                logger.debug(
+                    f"Page '{clean_title}' does not exist on EN Wikipedia")
+                return None
+
+            # Get langlinks and find Arabic version
+            langlinks = page.langlinks()
+            for langlink in langlinks:
+                if langlink.site.code == 'ar':
+                    return langlink.title.replace('_', ' ')
+
+            logger.debug(f"No Arabic langlink found for: {clean_title}")
+            return None
+
+        except ImportError:
+            logger.warning("pywikibot not available for langlink retrieval")
+            return None
+        except Exception as e:
+            logger.error(
+                f"Error getting Arabic langlink for '{en_page_title}': {e}")
+            return None
+
+    @staticmethod
+    def get_arabic_langlink_detailed(en_page_title: str) -> LangLinkResult:
+        """
+        Get the Arabic language link for an English Wikipedia page with
+        detailed results.
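+
+        Example (illustrative; performs a live lookup via pywikibot)::
+
+            res = WikipediaAPI.get_arabic_langlink_detailed('Lionel Messi')
+            # res.lang == 'ar' and res.ar_page holds the Arabic title,
+            # e.g. 'ليونيل ميسي', when an Arabic langlink exists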
+ + Args: + en_page_title (str): English page title + + Returns: + LangLinkResult: Object with language and page information + - If Arabic found: {lang='ar', ar_page=arabic_title} + - If English exists: {lang='en', en_page=english_title} + - If not found: empty object {} + """ + try: + import pywikibot + + # Create English Wikipedia site and get page + english_site = pywikibot.Site('en', 'wikipedia') + + # Clean up the page title + clean_title = en_page_title.strip() + if clean_title.startswith('[[') and clean_title.endswith(']]'): + clean_title = clean_title[2:-2] + if '|' in clean_title: + clean_title = clean_title.split('|')[0] # Take first part + + page = pywikibot.Page(english_site, clean_title) + + # Check if page exists on English Wikipedia + if not page.exists(): + logger.debug( + f"Page '{clean_title}' does not exist on EN Wikipedia") + return LangLinkResult() # Return empty object + + # Get langlinks and find Arabic version + langlinks = page.langlinks() + for langlink in langlinks: + if langlink.site.code == 'ar': + return LangLinkResult( + lang='ar', + ar_page=langlink.title.replace('_', ' ') + ) + + # No Arabic link found, but English page exists + logger.debug(f"No Arabic langlink found for: {clean_title}") + return LangLinkResult(lang='en', en_page=clean_title) + + except ImportError: + logger.warning("pywikibot not available for langlink retrieval") + return LangLinkResult() + except Exception as e: + logger.error( + f"Error getting Arabic langlink for '{en_page_title}': {e}") + return LangLinkResult() + + +class WikiLocalizer: + """ + Localizes wiki links and templates within a given wikitext. + """ + + def localize_content(self, content: str) -> WikiLocalizeResult: + """ + Localizes wiki links and templates in the provided wikitext content. + + Args: + content (str): The wikitext content to localize. + + Returns: + WikiLocalizeResult: The result of the localization process. 
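+
+        Example (illustrative; triggers live Wikipedia lookups)::
+
+            localizer = WikiLocalizer()
+            result = localizer.localize_content('[[Manchester United]] ...')
+            print(result.original_links_replaced)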
+ """ + localized_content = content + original_links_replaced = 0 + templates_localized = 0 + waou_templates_inserted = 0 + wow_templates = [] + errors = [] + + + parsed_content = wtp.parse(content) + + # Localize wikilinks + for link in parsed_content.wikilinks: + original_target = link.target + localization_result = (self + ._localize_wikilink(original_target, errors)) + if not localization_result.is_empty(): + # Use the localized page based on language + if localization_result.lang == 'ar' and localization_result.ar_page: + if localization_result.ar_page != original_target: + link.target = localization_result.ar_page + original_links_replaced += 1 + elif (localization_result.lang == 'en' and localization_result.en_page): + # Use واو template for English pages without Arabic equivalent + wow_templates.append(WOWTemplateItem( + link=link, + localization_result=localization_result + )) + + # Localize templates + # for template in parsed_content.templates: + # original_name = template.name + # localized_name, is_waou = \ + # self._localize_template(original_name, errors) + # if localized_name != original_name: + # template.name = localized_name + # templates_localized += 1 + # if is_waou: + # waou_templates_inserted += 1 + + localized_content = parsed_content.string + + # Handle WOW templates after link localization + for wow_template in wow_templates: + en_page = wow_template.localization_result.en_page + ar_text = wow_template.link.text + temp_template = f"{{{{وإو|{ar_text}|{en_page}}}}}" + localized_content = localized_content.replace(wow_template.link.string, temp_template) + + return WikiLocalizeResult( + localized_content=localized_content, + original_links_replaced=original_links_replaced, + templates_localized=templates_localized, + waou_templates_inserted=waou_templates_inserted, + wow_templates=wow_templates, + errors=errors + ) + + def _localize_wikilink(self, target: str, errors: List[str]) -> LangLinkResult: + """ + Localizes a single wikilink target. + + Returns: + LangLinkResult: Object with lang and page info + - If Arabic page found: {lang='ar', ar_page=arabic_title} + - If English exists: {lang='en', en_page=target} + - If not found: empty object {} + """ + # 1. Check in ar wiki directly first + arabic_page_title = WikipediaAPI.check_arabic_page_exists(target) + if arabic_page_title: + return LangLinkResult(lang='ar', ar_page=arabic_page_title) + + # 2. Check in en wiki with detailed results + langlink_result = WikipediaAPI.get_arabic_langlink_detailed(target) + if not langlink_result.is_empty(): + return langlink_result + + # If not found, return empty result + return LangLinkResult() + + def _localize_template(self, template_name: str, errors: List[str]) \ + -> (str, bool): + """ + Localizes a single template name. + Returns (localized_name, is_waou_template) + """ + is_waou = False + # 1. Check in ar wiki, use if found (and resolved) + arabic_template_page_title = \ + WikipediaAPI.check_arabic_page_exists(template_name) + if arabic_template_page_title: + return arabic_template_page_title, is_waou + + # 2. 
Check in en wiki with detailed results + langlink_result = (WikipediaAPI + .get_arabic_langlink_detailed(template_name)) + if not langlink_result.is_empty(): + if langlink_result.lang == 'ar' and langlink_result.ar_page: + return langlink_result.ar_page, is_waou + elif langlink_result.lang == 'en' and langlink_result.en_page: + return langlink_result.en_page, is_waou + + # If not found in en wiki, use واو template + is_waou = True + return f"واو|{template_name}", is_waou \ No newline at end of file