-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMap Batch Upload.py
More file actions
178 lines (145 loc) · 6.28 KB
/
Map Batch Upload.py
File metadata and controls
178 lines (145 loc) · 6.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import os
import time
import pandas as pd
import pywikibot
import xml.etree.ElementTree as ET
from pywikibot.exceptions import APIError
# ================= CONFIGURATION =================
# 1. Metadata File Location (Replace with your actual CSV file path)
# The CSV must contain these columns: 'local_filepath', 'filename', 'description'.
# An optional 'block' column, when present, is appended to the edit summary.
METADATA_FILE = r"path/to/your/metadata_file.csv"
# 2. Log Files (To keep track of upload history)
# LOG_SUCCESS doubles as the resume checkpoint: filenames listed there are
# skipped on the next run.
LOG_SUCCESS = "upload_success.txt" # List of successfully uploaded files
LOG_FAILED = "upload_failed.txt" # List of permanently failed files
LOG_BAD_SVG = "upload_bad_svg.txt" # List of invalid or corrupted SVG files
# 3. Upload Settings
BATCH_LIMIT = 5000 # Maximum number of files to upload in one execution
SLEEP_TIME = 1 # Pause (in seconds) after each successful upload
MAX_RETRIES = 3 # Number of retries if network/API fails
# 4. Default Edit Summary
# Change this to match your project's goal.
DEFAULT_EDIT_SUMMARY = "Uploading automated batch files"
# =================================================
def is_valid_svg(filepath):
    """Return True if *filepath* exists, is non-empty, and parses as XML.

    Note: only XML well-formedness is checked — a well-formed non-SVG XML
    file would also pass; the root tag is not inspected.

    :param filepath: local path to the candidate SVG file
    :return: True if the file looks usable, False otherwise
    """
    try:
        if not os.path.exists(filepath):
            return False
        if os.path.getsize(filepath) == 0:
            return False
        ET.parse(filepath)  # raises ParseError on malformed XML
        return True
    # Narrowed from a bare `except Exception`: only filesystem errors and
    # parse failures mean "invalid file"; anything else is a real bug and
    # should surface rather than be silently treated as a bad SVG.
    except (OSError, ET.ParseError):
        return False
def _append_line(path, line):
    """Append *line* plus a newline to the UTF-8 log file at *path*."""
    with open(path, "a", encoding="utf-8") as f:
        f.write(line + "\n")


def main():
    """Batch-upload SVG files listed in METADATA_FILE to Wikimedia Commons.

    Resumes from LOG_SUCCESS, validates each SVG locally before uploading,
    retries transient failures up to MAX_RETRIES times, and records every
    outcome in the success/failed/bad-SVG log files.
    """
    # ---------------------------------------------------------
    # Step 1: Login to Wikimedia Commons
    # ---------------------------------------------------------
    try:
        site = pywikibot.Site('commons', 'commons')
        site.login()
        print(f"Logged in as: {site.user()}")
    except Exception as e:
        print(f"Login Failed: {e}")
        return

    # ---------------------------------------------------------
    # Step 2: Read Metadata CSV
    # ---------------------------------------------------------
    print("Reading metadata file...")
    try:
        df = pd.read_csv(METADATA_FILE, encoding='utf-8')
        print(f"Total files in list: {len(df)}")
    except Exception as e:
        print(f"Error reading CSV: {e}\nPlease check if METADATA_FILE path is correct.")
        return

    # ---------------------------------------------------------
    # Step 3: Load Upload History (For Resume Capability)
    # ---------------------------------------------------------
    uploaded_files = set()
    if os.path.exists(LOG_SUCCESS):
        with open(LOG_SUCCESS, "r", encoding="utf-8") as f:
            uploaded_files = set(line.strip() for line in f)
    print(f"Previously uploaded: {len(uploaded_files)} files. Starting from next...")

    # ---------------------------------------------------------
    # Step 4: Main Upload Loop
    # ---------------------------------------------------------
    count = 0
    for index, row in df.iterrows():
        # Check batch limit
        if count >= BATCH_LIMIT:
            print(f"\nBatch limit of {BATCH_LIMIT} reached. Stopping safely.")
            break

        # Extract data from CSV
        local_filepath = str(row['local_filepath']).strip()
        target_filename = str(row['filename']).strip()
        description_text = str(row['description'])

        # BUG FIX: a missing/empty 'block' cell comes back as NaN, which is
        # truthy, so the old check produced "(Block/Category: nan)" in the
        # edit summary. Guard with pd.notna() as well.
        block_name = row['block'] if 'block' in row and pd.notna(row['block']) else None

        # Skip logic: Ignore if already uploaded
        if target_filename in uploaded_files:
            continue

        # File validation
        if not is_valid_svg(local_filepath):
            print(f"Skipping Bad/Missing File: {local_filepath}")
            _append_line(LOG_BAD_SVG, target_filename)
            continue

        # Start uploading
        print(f"[{count + 1}] Uploading: {target_filename} ... ", end="")
        retries = 0
        success = False
        while retries < MAX_RETRIES:
            try:
                file_page = pywikibot.FilePage(site, target_filename)

                # Check: If file already exists on server
                if file_page.exists():
                    print("ALREADY EXISTS (Skipping)")
                    _append_line(LOG_SUCCESS, target_filename)
                    uploaded_files.add(target_filename)
                    success = True
                    break

                # Prepare dynamic edit summary
                edit_summary = (f"{DEFAULT_EDIT_SUMMARY} (Block/Category: {block_name})"
                                if block_name else DEFAULT_EDIT_SUMMARY)

                # Upload command
                site.upload(
                    filepage=file_page,
                    source_filename=local_filepath,
                    comment=edit_summary,
                    text=description_text,
                    ignore_warnings=True,
                    report_success=False
                )

                # On success
                print("DONE")
                _append_line(LOG_SUCCESS, target_filename)
                uploaded_files.add(target_filename)
                count += 1
                success = True
                # Pause before next upload to stay polite to the API
                time.sleep(SLEEP_TIME)
                break
            except APIError as e:
                if "fileexists" in str(e):
                    print("EXISTS (API Check)")
                    _append_line(LOG_SUCCESS, target_filename)
                    # BUG FIX: also record in the in-memory set so a later
                    # duplicate row in this same run is skipped instead of
                    # re-attempted.
                    uploaded_files.add(target_filename)
                    success = True
                    break
                print(f" API Error: {e.code}")
                retries += 1
                time.sleep(5)  # short back-off for transient API errors
            except Exception as e:
                print(f" Error: {e}")
                retries += 1
                time.sleep(30)  # longer back-off for network-level failures

        # If all retries fail
        if not success:
            print(f" FAILED permanently: {target_filename}")
            _append_line(LOG_FAILED, target_filename)

    print("\nBatch processing completed.")


if __name__ == "__main__":
    main()