diff --git a/src/app.py b/src/app.py
index 8d606b17ae..09696b8b17 100644
--- a/src/app.py
+++ b/src/app.py
@@ -5,4 +5,65 @@
 import sqlite3
 import matplotlib.pyplot as plt
 import seaborn as sns
+import io
+import pandas as pd
+
+# Step 2: download the HTML
+url = "https://en.wikipedia.org/wiki/List_of_Spotify_streaming_records"
+response = requests.get(url)
+# Check the response
+print("Estado:", response.status_code)
+
+# Step 3: transform the HTML by extracting the tables with pandas
+html = io.StringIO(response.text)
+tables = pd.read_html(html)
+print(f"Se encontraron {len(tables)} tablas.")
+
+# Step 4: process the DataFrame
+# Data cleaning
+
+# Select the table to clean; assumes the first parsed table is the six-column most-streamed-songs table - TODO confirm against the live page
+df = tables[0].copy()
+df.columns = ["Rank", "Song", "Artist", "Streams (billions)", "Date released", "Reference"]
+
+# Remove bracketed notes
+df["Song"] = df["Song"].str.replace(r"\[.*?\]", "", regex=True)
+df["Artist"] = df["Artist"].str.replace(r"\[.*?\]", "", regex=True)
+
+# Keep only rows whose stream count is a plain integer or decimal number, so the float conversion below cannot fail
+df = df[df["Streams (billions)"].astype(str).str.contains(r"^\d+(?:\.\d+)?$", na=False)].copy()
+
+# Convert Streams to floats
+df["Streams (billions)"] = df["Streams (billions)"].astype(float)
+
+# Convert dates to datetime
+df["Date released"] = pd.to_datetime(df["Date released"], errors="coerce")
+
+df
+
+# Step 5: store the data in SQLite
+
+# Create the database
+conn = sqlite3.connect("spotify_top_songs.db")
+# Create table in SQLite
+df.to_sql("most_streamed", conn, if_exists="replace", index=False)
+cursor = conn.cursor()
+
+cursor.execute("SELECT COUNT(*) FROM most_streamed")
+print("Rows inserted:", cursor.fetchone()[0])
+
+conn.commit()
+conn.close()
+
+# Step 6: visualize the data
+
+# Chart 1: the 10 most-streamed songs
+top10 = df.nlargest(10, "Streams (billions)")
+plt.figure(figsize=(12, 6))
+sns.barplot(data=top10, x="Streams (billions)", y="Song", hue="Song", palette="viridis", legend=False)
+plt.title("Las 10 canciones más reproducidas en Spotify")
+plt.xlabel("Reproducciones (en miles de millones)")
+plt.ylabel("Canción")
+plt.tight_layout()
+plt.show()
 
diff --git a/src/explore.es.ipynb b/src/explore.es.ipynb
index e4854634c7..8488c20ee1 100644
--- a/src/explore.es.ipynb
+++ b/src/explore.es.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -22,8 +22,87 @@
     "import sqlite3\n",
     "import matplotlib.pyplot as plt\n",
     "import seaborn as sns\n",
-    "\n"
+    "\n",
+    "# Step 2: download the HTML\n",
+    "url = \"https://en.wikipedia.org/wiki/List_of_Spotify_streaming_records\"\n",
+    "response = requests.get(url)\n",
+    "# Check the response\n",
+    "print(\"Estado:\", response.status_code)\n",
+    "\n",
+    "# Step 3: transform the HTML by extracting the tables with pandas\n",
+    "html = io.StringIO(response.text)\n",
+    "tables = pd.read_html(html)\n",
+    "print(f\"Se encontraron {len(tables)} tablas.\")\n",
+    "\n",
+    "# Step 4: process the DataFrame\n",
+    "# Data cleaning\n",
+    "\n",
+    "# Select the table to clean; assumes the first parsed table is the six-column most-streamed-songs table - TODO confirm against the live page\n",
+    "df = tables[0].copy()\n",
+    "df.columns = [\"Rank\", \"Song\", \"Artist\", \"Streams (billions)\", \"Date released\", \"Reference\"]\n",
+    "\n",
+    "# Remove bracketed notes\n",
+    "df[\"Song\"] = df[\"Song\"].str.replace(r\"\\[.*?\\]\", \"\", regex=True)\n",
+    "df[\"Artist\"] = df[\"Artist\"].str.replace(r\"\\[.*?\\]\", \"\", regex=True)\n",
+    "\n",
+    "# Keep only rows whose stream count is a plain integer or decimal number, so the float conversion below cannot fail\n",
+    "df = df[df[\"Streams (billions)\"].astype(str).str.contains(r\"^\\d+(?:\\.\\d+)?$\", na=False)].copy()\n",
+    "\n",
+    "# Convert Streams to floats\n",
+    "df[\"Streams (billions)\"] = df[\"Streams (billions)\"].astype(float)\n",
+    "\n",
+    "# Convert dates to datetime\n",
+    "df[\"Date released\"] = pd.to_datetime(df[\"Date released\"], errors=\"coerce\")\n",
+    "\n",
+    "df\n",
+    "\n",
+    "# Step 5: store the data in SQLite\n",
+    "\n",
+    "# Create the database\n",
+    "conn = sqlite3.connect(\"spotify_top_songs.db\")\n",
+    "# Create table in SQLite\n",
+    "df.to_sql(\"most_streamed\", conn, if_exists=\"replace\", index=False)\n",
+    "cursor = conn.cursor()\n",
+    "\n",
+    "cursor.execute(\"SELECT COUNT(*) FROM most_streamed\")\n",
+    "print(\"Rows inserted:\", cursor.fetchone()[0])\n",
+    "\n",
+    "conn.commit()\n",
+    "conn.close()\n",
+    "\n",
+    "# Step 6: visualize the data\n",
+    "\n",
+    "# Chart 1: the 10 most-streamed songs\n",
+    "top10 = df.nlargest(10, \"Streams (billions)\")\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "sns.barplot(data=top10, x=\"Streams (billions)\", y=\"Song\", hue=\"Song\", palette=\"viridis\", legend=False)\n",
+    "plt.title(\"Las 10 canciones más reproducidas en Spotify\")\n",
+    "plt.xlabel(\"Reproducciones (en miles de millones)\")\n",
+    "plt.ylabel(\"Canción\")\n",
+    "plt.tight_layout()\n",
+    "plt.show()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/src/solution.es.ipynb b/src/solution.es.ipynb
index 0b50b26c29..c146040e0d 100644
--- a/src/solution.es.ipynb
+++ b/src/solution.es.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -66,7 +66,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -102,7 +102,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -114,7 +114,7 @@
     }
    ],
    "source": [
-    "# Extraer las tablas con pandas\n",
+    "# Step 3: transform the HTML by extracting the tables with pandas\n",
     "html = io.StringIO(response.text)\n",
     "tables = pd.read_html(html)\n",
     "print(f\"Se encontraron {len(tables)} tablas.\")"
@@ -122,7 +122,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -240,7 +240,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -290,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -300,7 +300,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -311,7 +311,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -347,7 +347,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -377,7 +377,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -406,7 +406,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {