Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,62 @@
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import io
import pandas as pd


# Step 2: Download the HTML
url = "https://en.wikipedia.org/wiki/List_of_Spotify_streaming_records"
# A descriptive User-Agent and a timeout keep the request from being rejected
# as an anonymous bot and from hanging forever on a stalled connection.
headers = {"User-Agent": "spotify-streaming-records-scraper/1.0 (educational project)"}
response = requests.get(url, headers=headers, timeout=30)
# Check the response
print("Estado:", response.status_code)
# Fail loudly on HTTP errors instead of feeding an error page to the parser.
response.raise_for_status()

# Step 3: Transform the HTML by extracting its tables with pandas
html = io.StringIO(response.text)
tables = pd.read_html(html)
print(f"Se encontraron {len(tables)} tablas.")

# Step 4: Process the DataFrame
# Data cleaning

# Pick the table to clean; without this assignment `df` is undefined and the
# script stops with a NameError.
# NOTE(review): assumes the first table on the page is the six-column
# "most-streamed songs" table -- confirm against the live page.
df = tables[0].copy()

df.columns = ["Rank", "Song", "Artist", "Streams (billions)", "Date released", "Reference"]

# Remove bracketed footnote markers such as "[a]" or "[12]"
df["Song"] = df["Song"].str.replace(r"\[.*?\]", "", regex=True)
df["Artist"] = df["Artist"].str.replace(r"\[.*?\]", "", regex=True)

# Keep only rows whose stream count is a plain number; this drops rows whose
# stream cell holds any other text, so the float conversion below cannot fail.
df = df[df["Streams (billions)"].astype(str).str.contains(r"^\d+(?:\.\d+)?$", na=False)].copy()

# Convert the stream counts to floats
df["Streams (billions)"] = df["Streams (billions)"].astype(float)

# Convert the release dates to datetime; unparseable values become NaT
df["Date released"] = pd.to_datetime(df["Date released"], errors="coerce")

# Step 5: Store the data in SQLite

# Create (or open) the database
conn = sqlite3.connect("spotify_top_songs.db")
try:
    # Create the table in SQLite, replacing any previous run's data
    df.to_sql("most_streamed", conn, if_exists="replace", index=False)
    cursor = conn.cursor()

    cursor.execute("SELECT COUNT(*) FROM most_streamed")
    print("Rows inserted:", cursor.fetchone()[0])

    conn.commit()
finally:
    # Always release the database handle, even if the insert or query fails.
    conn.close()

# Step 6: Visualize the data

# Chart 1: the 10 most-streamed songs
top10 = df.nlargest(10, "Streams (billions)")
plt.figure(figsize=(12, 6))
sns.barplot(data=top10, x="Streams (billions)", y="Song", hue="Song", palette="viridis", legend=False)
plt.title("Las 10 canciones más reproducidas en Spotify")
plt.xlabel("Reproducciones (en miles de millones)")
plt.ylabel("Canción")
plt.tight_layout()
plt.show()
80 changes: 78 additions & 2 deletions src/explore.es.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -22,8 +22,84 @@
"import sqlite3\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n"
"\n",
" # Paso 2 Descargar el HTML\n",
"url = \"https://en.wikipedia.org/wiki/List_of_Spotify_streaming_records\"\n",
"response = requests.get(url)\n",
"# Verificar la respuesta\n",
"print(\"Estado:\", response.status_code)\n",
"\n",
"# Paso 3: Transforma el HTML al Extraer las tablas con pandas\n",
"html = io.StringIO(response.text)\n",
"tables = pd.read_html(html)\n",
"print(f\"Se encontraron {len(tables)} tablas.\")\n",
"\n",
"#Paso 4: Procesar el DataFrame\n",
"# Limpieza de datos\n",
"\n",
"df.columns = [\"Rank\", \"Song\", \"Artist\", \"Streams (billions)\", \"Date released\", \"Reference\"]\n",
"\n",
"# Se eliminan notas entre corchetes\n",
"df[\"Song\"] = df[\"Song\"].str.replace(r\"\\[.*?\\]\", \"\", regex=True)\n",
"df[\"Artist\"] = df[\"Artist\"].str.replace(r\"\\[.*?\\]\", \"\", regex=True)\n",
"\n",
"df = df[df[\"Streams (billions)\"].astype(str).str.contains(r\"^\\d+(?:\\.\\d+)?$\", na=False)].copy()\n",
"\n",
"# Se convierten Streams a números flotantes\n",
"df[\"Streams (billions)\"] = df[\"Streams (billions)\"].astype(float)\n",
"\n",
"# Se convierten fechas a datetime\n",
"df[\"Date released\"] = pd.to_datetime(df[\"Date released\"], errors=\"coerce\")\n",
"\n",
"df\n",
"\n",
"#Paso 5: Se almacenan los datos en sqlite\n",
"\n",
"# Create the database\n",
"conn = sqlite3.connect(\"spotify_top_songs.db\")\n",
"# Create table in SQLite\n",
"df.to_sql(\"most_streamed\", conn, if_exists=\"replace\", index=False)\n",
"cursor = conn.cursor()\n",
"\n",
"cursor.execute(\"SELECT COUNT(*) FROM most_streamed\")\n",
"print(\"Rows inserted:\", cursor.fetchone()[0])\n",
"\n",
"conn.commit()\n",
"conn.close()\n",
"\n",
"#Paso 6: Visualizar los datos\n",
"\n",
"# Gráfico 1: Las 10 canciones más reproducidas\n",
"top10 = df.nlargest(10, \"Streams (billions)\")\n",
"plt.figure(figsize=(12, 6))\n",
"sns.barplot(data=top10, x=\"Streams (billions)\", y=\"Song\", hue=\"Song\", palette=\"viridis\", legend=False)\n",
"plt.title(\"Las 10 canciones más reproducidas en Spotify\")\n",
"plt.xlabel(\"Reproducciones (en miles de millones)\")\n",
"plt.ylabel(\"Canción\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
24 changes: 12 additions & 12 deletions src/solution.es.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -102,7 +102,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -114,15 +114,15 @@
}
],
"source": [
"# Extraer las tablas con pandas\n",
"# Paso 3: Transforma el HTML al extraer las tablas con pandas\n",
"html = io.StringIO(response.text)\n",
"tables = pd.read_html(html)\n",
"print(f\"Se encontraron {len(tables)} tablas.\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -240,7 +240,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -290,7 +290,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -300,7 +300,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -311,7 +311,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -347,7 +347,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -377,7 +377,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -406,7 +406,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand Down