-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
205 lines (168 loc) · 7.98 KB
/
utils.py
File metadata and controls
205 lines (168 loc) · 7.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import configparser
import os
import logging
import time
import random
import functools
def retry_on_exception(max_attempts_cfg: int, delay_seconds_cfg: float, logger_instance: logging.Logger, exceptions_to_catch_cfg: tuple = (Exception,)):
    """
    Build a decorator that retries the decorated function when it raises.

    The wrapped function is called until it returns or the attempt budget is
    exhausted. Between failed attempts the wrapper sleeps for
    ``delay_seconds_cfg`` seconds. Exceptions that are not listed in
    ``exceptions_to_catch_cfg`` propagate immediately without a retry.

    Args:
        max_attempts_cfg (int): Maximum number of attempts. Values below 1 are
            treated as 1 so the function is always called at least once.
        delay_seconds_cfg (float): Delay in seconds between retry attempts.
        logger_instance (logging.Logger): Logger used to report failed attempts.
        exceptions_to_catch_cfg (tuple): Exception types that trigger a retry.
            Defaults to (Exception,).

    Returns:
        function: A decorator that adds the retry logic to a function.

    Raises:
        Exception: The exception raised by the last attempt, re-raised
            unchanged once every attempt has failed.
    """
    # A non-positive budget would otherwise skip the call entirely and make the
    # wrapper silently return None.
    total_attempts = max(1, max_attempts_cfg)

    def decorator(func):
        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            attempt = 0
            while True:
                attempt += 1
                try:
                    return func(*args, **kwargs)
                except exceptions_to_catch_cfg as e:
                    if attempt >= total_attempts:
                        logger_instance.error(f"All {total_attempts} attempts for {func.__name__} failed. Last error: {e}")
                        # Bare raise keeps the original traceback intact.
                        raise
                    # Only announce a retry when one is actually going to happen.
                    logger_instance.warning(f"Attempt {attempt}/{total_attempts} for {func.__name__} failed: {e}. Retrying in {delay_seconds_cfg}s...")
                    time.sleep(delay_seconds_cfg)
        return _wrapper
    return decorator
def load_config(config_path='config.ini'):
    """
    Load the configuration from the specified INI file.

    The file is opened explicitly instead of going through
    ``ConfigParser.read()``, which silently skips files it cannot open; this
    way missing-file and permission problems surface as the documented
    exceptions.

    Args:
        config_path (str): Path to the configuration file. Defaults to 'config.ini'.

    Returns:
        configparser.ConfigParser: The loaded configuration object.

    Raises:
        FileNotFoundError: If the config file doesn't exist.
        PermissionError: If the config file can't be accessed due to permissions.
        configparser.Error: If the file cannot be parsed, cannot be read for
            another reason, or contains no sections.
    """
    config = configparser.ConfigParser()

    try:
        # Default text encoding is used, matching ConfigParser.read().
        with open(config_path) as config_file:
            config.read_file(config_file, source=config_path)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Configuration file not found: {config_path}") from e
    except PermissionError as e:
        raise PermissionError(f"Permission denied when accessing config file {config_path}: {e}") from e
    except configparser.ParsingError as e:
        raise configparser.Error(f"Error parsing config file {config_path}: {e}") from e
    except Exception as e:
        # Catch-all for any other unexpected issue (e.g. the path is a
        # directory, a decoding failure, or a duplicate section/option).
        raise configparser.Error(f"Unexpected error reading config file {config_path}: {e}") from e

    # Checked outside the try block so this error is not re-wrapped by the
    # catch-all handler above.
    if not config.sections():
        raise configparser.Error(f"Config file {config_path} appears to be empty or invalid. No sections found.")

    return config
def setup_logger(name='threads_scraper', log_file_path_str='scraper.log', level_str='INFO'):
    """
    Set up a logger with console and file handlers.

    The logger level is (re)applied on every call. Handlers are only attached
    when the logger has none yet, so calling this function repeatedly with the
    same name does not duplicate output.

    Args:
        name (str): Name of the logger. Defaults to 'threads_scraper'.
        log_file_path_str (str): Path to the log file. Missing parent
            directories are created. Defaults to 'scraper.log'.
        level_str (str): Logging level name (e.g., 'INFO', 'DEBUG'), case
            insensitive. Unknown or non-string values fall back to INFO.
            Defaults to 'INFO'.

    Returns:
        logging.Logger: Configured logger instance.
    """
    logger = logging.getLogger(name)

    log_levels = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }
    # str() keeps the INFO fallback working for None or other non-string input.
    actual_log_level = log_levels.get(str(level_str).upper(), logging.INFO)
    logger.setLevel(actual_log_level)

    # Already configured by an earlier call: do not add duplicate handlers.
    if logger.handlers:
        return logger

    formatter = logging.Formatter('%(asctime)s - %(name)s - [%(levelname)s] - %(module)s.%(funcName)s:%(lineno)d - %(message)s')

    # Build the file handler before attaching anything. If the directory or
    # file cannot be created, the logger is left without handlers, so a later
    # call retries the full setup instead of being skipped by the guard above.
    log_dir = os.path.dirname(log_file_path_str)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    # Explicit UTF-8 so non-ASCII messages are written the same way on every
    # platform, regardless of the locale's default encoding.
    file_handler = logging.FileHandler(log_file_path_str, mode='a', encoding='utf-8')
    file_handler.setFormatter(formatter)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger
def polite_wait(min_sec, max_sec, logger):
    """
    Sleep for a randomly chosen duration between min_sec and max_sec.

    Args:
        min_sec (float): Lower bound of the pause, in seconds.
        max_sec (float): Upper bound of the pause, in seconds.
        logger (logging.Logger): Logger that receives a debug message with
            the chosen duration before the pause starts.
    """
    pause_seconds = random.uniform(min_sec, max_sec)
    message = "Waiting for {:.2f} seconds...".format(pause_seconds)
    logger.debug(message)
    time.sleep(pause_seconds)
def save_data_to_json(data_to_save: dict, username: str, output_dir_path: str, logger: logging.Logger, config: configparser.ConfigParser):
    """
    Save data to a JSON file with a timestamp in the filename.

    The file is named ``<sanitized username>_<YYYYmmdd_HHMMSS>.json`` and is
    written as UTF-8 with two-space indentation. The data is serialized in
    memory before anything is created on disk, so a payload that cannot be
    converted to JSON never leaves a truncated file behind. Failures are
    logged and reported through the return value rather than raised.

    Args:
        data_to_save (dict): The data to save.
        username (str): The username for the filename. Characters other than
            alphanumerics, '-' and '_' are replaced with '_'.
        output_dir_path (str): The directory to save the file in. It is
            created if it does not exist.
        logger (logging.Logger): Logger instance for logging.
        config (configparser.ConfigParser): Configuration object. Currently
            not read by this function; kept for interface compatibility.

    Returns:
        str: The path to the saved file, or None if saving failed.
    """
    import json
    from datetime import datetime

    logger.info(f"Preparing to save data for user: {username}")

    # Input validation
    if not isinstance(data_to_save, dict):
        logger.error(f"Invalid data format for user {username}: expected dict, got {type(data_to_save)}")
        return None
    if not username or not isinstance(username, str):
        logger.error(f"Invalid username: {username}")
        return None

    try:
        # Serialize first: if the payload is not JSON-compatible we bail out
        # before creating the directory or an empty/partial output file.
        try:
            serialized = json.dumps(data_to_save, indent=2, ensure_ascii=False)
        except (TypeError, ValueError) as e:
            logger.error(f"Data for user {username} is not JSON-serializable: {e}")
            return None

        os.makedirs(output_dir_path, exist_ok=True)

        file_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Replace any characters that could be problematic in a filename.
        safe_username = ''.join(c if c.isalnum() or c in '-_' else '_' for c in username)
        output_filename = f"{safe_username}_{file_timestamp}.json"
        full_file_path = os.path.join(output_dir_path, output_filename)

        try:
            with open(full_file_path, 'w', encoding='utf-8') as f:
                f.write(serialized)
        except PermissionError as e:
            logger.error(f"Permission denied when writing data for {username} to {full_file_path}: {e}")
            return None
        except IOError as e:
            logger.error(f"I/O error when saving data for {username} to {full_file_path}: {e}")
            return None

        logger.info(f"Successfully saved data for {username} to: {full_file_path}")
        return full_file_path
    except OSError as e:
        # Write errors are handled above, so this is reached for directory
        # creation failures.
        logger.error(f"OS error when creating directory {output_dir_path} for user {username}: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error saving data for user {username}: {e}", exc_info=True)
        return None