-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbicycle_data_processing.py
More file actions
334 lines (268 loc) · 12 KB
/
bicycle_data_processing.py
File metadata and controls
334 lines (268 loc) · 12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
# Configure matplotlib for CJK output: use Chinese-capable sans-serif fonts
# and keep the minus sign renderable while such a font is active.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei'],
    'axes.unicode_minus': False,
})
def load_and_clean_data(file_path):
    """Load the raw bicycle trip CSV and report its basic shape.

    Despite the name, no cleaning happens here; filtering is done later
    by ``filter_outliers``.

    Args:
        file_path: Path to a monthly trip-data CSV file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    print("正在加载数据...")
    frame = pd.read_csv(file_path)
    shape = frame.shape
    columns = list(frame.columns)
    print(f"原始数据形状: {shape}")
    print(f"列名: {columns}")
    return frame
def process_time_features(df):
    """Parse the start/end timestamps and derive time-based feature columns.

    Adds, in place on ``df``: ride duration in seconds, start/end hour,
    weekday index (0=Monday .. 6=Sunday), weekday name, calendar date,
    and a binary ``is_workday`` flag (Monday-Friday -> 1).

    Args:
        df: Trip DataFrame with ``started_at`` / ``ended_at`` columns.

    Returns:
        The same DataFrame with the new feature columns attached.
    """
    print("\n正在处理时间字段...")

    # Parse both timestamp columns once and keep handles to the series.
    df['started_at'] = pd.to_datetime(df['started_at'])
    df['ended_at'] = pd.to_datetime(df['ended_at'])
    started = df['started_at']
    ended = df['ended_at']

    # Ride duration in whole seconds (float).
    df['ride_duration_seconds'] = (ended - started).dt.total_seconds()

    # Features derived from the ride's start time.
    df['start_hour'] = started.dt.hour
    df['start_weekday'] = started.dt.weekday  # 0=Monday, 6=Sunday
    df['start_weekday_name'] = started.dt.day_name()
    df['start_date'] = started.dt.date
    # Monday-Friday (weekday < 5) counts as a workday.
    df['is_workday'] = (df['start_weekday'] < 5).astype(int)

    # Features derived from the ride's end time.
    df['end_hour'] = ended.dt.hour
    df['end_weekday'] = ended.dt.weekday
    df['end_date'] = ended.dt.date

    print("时间特征提取完成!")
    print(f"时间范围: {started.min()} 到 {started.max()}")
    return df
def filter_outliers(df):
    """Drop rides with implausible durations or missing station ids.

    Keeps only rides whose duration lies in (0, 14400] seconds (at most
    4 hours) and whose start and end station ids are both present.

    Args:
        df: Trip DataFrame that already has ``ride_duration_seconds``.

    Returns:
        A filtered copy; the input DataFrame is not modified.
    """
    print("\n正在过滤异常值...")
    before = len(df)

    # Plausibility window: strictly positive, at most 4h (4*60*60 = 14400s).
    duration = df['ride_duration_seconds']
    plausible = (duration > 0) & (duration <= 14400)
    kept = df[plausible].copy()

    # A ride missing either station id cannot be attributed to a station.
    kept = kept.dropna(subset=['start_station_id', 'end_station_id'])

    after = len(kept)
    removed = before - after
    print(f"原始记录数: {before:,}")
    print(f"过滤后记录数: {after:,}")
    print(f"移除记录数: {removed:,} ({removed/before*100:.2f}%)")

    # Break the removals down by reason (computed on the original frame).
    print("\n异常值统计:")
    print(f"骑行时长 ≤ 0秒的记录: {(df['ride_duration_seconds'] <= 0).sum():,}")
    print(f"骑行时长 > 4小时的记录: {(df['ride_duration_seconds'] > 14400).sum():,}")
    print(f"缺失站点信息的记录: {df[['start_station_id', 'end_station_id']].isnull().any(axis=1).sum():,}")
    return kept
def calculate_station_metrics(df):
    """Aggregate flow metrics per station, per calendar date, per hour.

    For each (station, date, hour) cell, counts rides leaving the station
    (outflow), rides arriving (inflow), their difference ``net_flow``
    (outflow - inflow), and a five-level categorical ``flow_status``.

    NOTE(review): the labels treat strongly negative net_flow (more
    arrivals than departures) as a bike shortage — confirm this sign
    convention is the intended one.

    Args:
        df: Cleaned trip DataFrame with the time-feature columns.

    Returns:
        A DataFrame keyed by (station_id, station_name, date, hour).
    """
    print("\n正在计算站点指标...")

    # Departures: each ride counts once at its start station/hour.
    departures = df.groupby(
        ['start_station_id', 'start_station_name', 'start_date', 'start_hour']
    ).agg({
        'ride_id': 'count',
        'is_workday': 'first',
        'start_weekday': 'first',
    }).reset_index()
    departures.columns = ['station_id', 'station_name', 'date', 'hour',
                          'outflow_count', 'is_workday', 'weekday']

    # Arrivals: keyed by the ride's end station/hour; the workday and
    # weekday flags still come from the ride's start time.
    arrivals = df.groupby(
        ['end_station_id', 'end_station_name', 'end_date', 'end_hour']
    ).agg({
        'ride_id': 'count',
        'is_workday': 'first',
        'start_weekday': 'first',
    }).reset_index()
    arrivals.columns = ['station_id', 'station_name', 'date', 'hour',
                        'inflow_count', 'is_workday', 'weekday']

    # Outer join so cells with only departures or only arrivals survive.
    metrics = pd.merge(
        departures,
        arrivals,
        on=['station_id', 'station_name', 'date', 'hour'],
        how='outer',
        suffixes=('_out', '_in'),
    )

    # A missing side of the join means zero rides on that side.
    metrics['outflow_count'] = metrics['outflow_count'].fillna(0)
    metrics['inflow_count'] = metrics['inflow_count'].fillna(0)

    # Coalesce the duplicated flag columns, then drop the temporaries.
    metrics['is_workday'] = metrics['is_workday_out'].fillna(metrics['is_workday_in'])
    metrics['weekday'] = metrics['weekday_out'].fillna(metrics['weekday_in'])
    metrics = metrics.drop(
        ['is_workday_out', 'is_workday_in', 'weekday_out', 'weekday_in'], axis=1)

    metrics['net_flow'] = metrics['outflow_count'] - metrics['inflow_count']

    def label_net_flow(net):
        # Five-level scale with hourly thresholds at ±2 and ±5 bikes.
        if net < -5:
            return "严重缺车"
        if net < -2:
            return "轻微缺车"
        if net <= 2:
            return "平衡"
        if net <= 5:
            return "轻微堆积"
        return "严重堆积"

    metrics['flow_status'] = metrics['net_flow'].apply(label_net_flow)
    print(f"站点指标计算完成! 共 {len(metrics):,} 条记录")
    return metrics
def generate_summary_statistics(df, station_metrics):
    """Print a console report summarizing the cleaned trips and station flows.

    Purely informational: everything goes to stdout, nothing is returned
    and neither input frame is modified.

    Args:
        df: Cleaned trip DataFrame (post ``filter_outliers``).
        station_metrics: Output of ``calculate_station_metrics``.
    """
    print("\n=== 数据处理汇总报告 ===")
    total_rides = len(df)

    # Headline counts and duration statistics.
    print(f"\n📊 基本统计:")
    print(f"总骑行次数: {total_rides:,}")
    print(f"独特站点数: {df['start_station_id'].nunique():,}")
    print(f"数据时间跨度: {df['started_at'].min().strftime('%Y-%m-%d')} 到 {df['started_at'].max().strftime('%Y-%m-%d')}")
    print(f"平均骑行时长: {df['ride_duration_seconds'].mean()/60:.1f} 分钟")
    print(f"中位数骑行时长: {df['ride_duration_seconds'].median()/60:.1f} 分钟")

    # Rider-type breakdown (member vs casual).
    print(f"\n👥 用户类型分布:")
    for user_type, count in df['member_casual'].value_counts().items():
        print(f"{user_type}: {count:,} ({count/total_rides*100:.1f}%)")

    # Bike-type breakdown.
    print(f"\n🚲 自行车类型分布:")
    for bike_type, count in df['rideable_type'].value_counts().items():
        print(f"{bike_type}: {count:,} ({count/total_rides*100:.1f}%)")

    # Workday vs weekend split.
    print(f"\n⏰ 时间分布:")
    workday_rides = df[df['is_workday'] == 1]['ride_id'].count()
    weekend_rides = df[df['is_workday'] == 0]['ride_id'].count()
    print(f"工作日骑行: {workday_rides:,} ({workday_rides/total_rides*100:.1f}%)")
    print(f"周末骑行: {weekend_rides:,} ({weekend_rides/total_rides*100:.1f}%)")

    # Five busiest start hours.
    busiest_hours = df.groupby('start_hour')['ride_id'].count().sort_values(ascending=False)
    print(f"\n🔥 最繁忙时段 (前5):")
    for hour, count in busiest_hours.head().items():
        print(f"{hour:02d}:00-{hour+1:02d}:00: {count:,} 次骑行")

    # Distribution of hourly flow-status labels.
    print(f"\n🚉 站点流量统计:")
    total_cells = len(station_metrics)
    for status, count in station_metrics['flow_status'].value_counts().items():
        print(f"{status}: {count:,} ({count/total_cells*100:.1f}%)")

    # Five busiest stations by combined in+out traffic.
    print(f"\n📍 最繁忙站点 (按总流量):")
    per_station = station_metrics.groupby(['station_id', 'station_name']).agg({
        'outflow_count': 'sum',
        'inflow_count': 'sum',
    }).reset_index()
    per_station['total_flow'] = per_station['outflow_count'] + per_station['inflow_count']
    for _, row in per_station.nlargest(5, 'total_flow').iterrows():
        print(f"{row['station_name']}: {row['total_flow']:,} (出: {row['outflow_count']:,}, 入: {row['inflow_count']:,})")
def save_results(df, station_metrics, output_dir):
    """Write the cleaned trips and station metrics to CSV files.

    Creates ``output_dir`` if needed and writes three files into it:
    cleaned_trip_data.csv, station_hourly_metrics.csv and
    station_daily_summary.csv. utf-8-sig encoding is used so Excel
    auto-detects the encoding.

    Args:
        df: Cleaned trip DataFrame.
        station_metrics: Hourly per-station metrics DataFrame.
        output_dir: Destination directory path.
    """
    print(f"\n正在保存结果到: {output_dir}...")
    os.makedirs(output_dir, exist_ok=True)

    trips_path = os.path.join(output_dir, 'cleaned_trip_data.csv')
    df.to_csv(trips_path, index=False, encoding='utf-8-sig')
    print(f"✅ 清洗后的行程数据已保存到: {trips_path}")

    hourly_path = os.path.join(output_dir, 'station_hourly_metrics.csv')
    station_metrics.to_csv(hourly_path, index=False, encoding='utf-8-sig')
    print(f"✅ 站点小时指标已保存到: {hourly_path}")

    # Roll the hourly metrics up to one row per station per day.
    daily = station_metrics.groupby(['station_id', 'station_name', 'date']).agg({
        'outflow_count': 'sum',
        'inflow_count': 'sum',
        'net_flow': 'sum',
        'is_workday': 'first',
    }).reset_index()
    # Daily thresholds are wider than the hourly ones (±5 / ±10 bikes).
    daily['flow_status'] = daily['net_flow'].apply(
        lambda x: "严重缺车" if x < -10 else "轻微缺车" if x < -5 else "平衡" if abs(x) <= 5 else "轻微堆积" if x <= 10 else "严重堆积"
    )
    daily_path = os.path.join(output_dir, 'station_daily_summary.csv')
    daily.to_csv(daily_path, index=False, encoding='utf-8-sig')
    print(f"✅ 站点每日汇总已保存到: {daily_path}")
def process_single_month(file_path, month_folder):
    """Run the full processing pipeline on one monthly CSV file.

    Loads, enriches, filters and aggregates the month's trips, prints the
    summary report, and saves the outputs into ``month_folder``.

    Args:
        file_path: Path to the month's trip-data CSV.
        month_folder: Directory to receive this month's output CSVs.

    Returns:
        Tuple ``(df_clean, station_metrics)`` for further use by the caller.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"📅 正在处理: {os.path.basename(file_path)}")
    print(f"{banner}")

    raw = load_and_clean_data(file_path)           # 1. load the CSV
    enriched = process_time_features(raw)          # 2. derive time features
    clean = filter_outliers(enriched)              # 3. drop implausible rides
    metrics = calculate_station_metrics(clean)     # 4. per-station aggregation
    generate_summary_statistics(clean, metrics)    # 5. console report
    save_results(clean, metrics, month_folder)     # 6. persist to month folder

    print(f"\n✅ {os.path.basename(file_path)} 处理完成!")
    return clean, metrics
def main():
    """Entry point: discover monthly Divvy CSVs under ``data/`` and process each.

    Output for month YYYYMM goes to ``cleaned_data/YYYYMM/``. A failure in
    one month is logged with its traceback and does not abort the others.
    """
    print("🚲 自行车数据处理系统")
    print("=" * 60)

    output_base_dir = 'cleaned_data'
    os.makedirs(output_base_dir, exist_ok=True)

    # Discover the monthly files, sorted so months are processed in order.
    data_files = sorted(glob.glob('data/*-divvy-tripdata.csv'))
    if not data_files:
        print("❌ 未找到数据文件! 请确保data文件夹中有CSV文件。")
        return
    print(f"\n找到 {len(data_files)} 个数据文件:")
    for f in data_files:
        print(f"  - {os.path.basename(f)}")

    all_results = {}
    for file_path in data_files:
        # File names look like 202507-divvy-tripdata.csv; the leading
        # YYYYMM token names the per-month output folder.
        filename = os.path.basename(file_path)
        month = filename.split('-')[0]
        month_folder = os.path.join(output_base_dir, month)
        try:
            df_clean, station_metrics = process_single_month(file_path, month_folder)
            all_results[month] = {
                'df': df_clean,
                'metrics': station_metrics
            }
        except Exception as e:
            # BUGFIX: this message previously printed the literal placeholder
            # "(unknown)" instead of naming the file that failed.
            print(f"\n❌ 处理 {filename} 时出错: {str(e)}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{'='*60}")
    print("🎉 所有数据处理完成!")
    print(f"{'='*60}")
    print(f"\n处理结果已保存到以下文件夹:")
    for month in sorted(all_results.keys()):
        month_folder = os.path.join(output_base_dir, month)
        print(f"  📁 {month_folder}/")
        print(f"     - cleaned_trip_data.csv")
        print(f"     - station_hourly_metrics.csv")
        print(f"     - station_daily_summary.csv")


if __name__ == "__main__":
    main()