'''python
Repository for CODSOFT Data Science Internship tasks import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.linear_model import LinearRegression, Ridge, Lasso from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error from sklearn.impute import SimpleImputer
DATASET_PATH = 'cleaned_movie_data.csv' # REPLACE WITH YOUR ACTUAL PATH
try: df = pd.read_csv(DATASET_PATH) print("Dataset loaded successfully!") except FileNotFoundError: print(f"Error: Dataset not found at {DATASET_PATH}. Please check the file path.") exit()
print("\n--- Exploratory Data Analysis ---") print("\nFirst 5 rows:") print(df.head()) print("\nDataset info:") df.info() print("\nSummary statistics:") print(df.describe())
print("\nMissing values per column:") print(df.isnull().sum())
rating_column_name = 'Rating' # <--- CHANGE THIS IF YOUR RATING COLUMN IS NAMED DIFFERENTLY
plt.figure(figsize=(8, 6)) sns.histplot(df[rating_column_name], bins=30, kde=True) # Use the correct column name plt.title('Distribution of Movie Ratings') plt.xlabel('Rating') plt.ylabel('Frequency') plt.show() plt.savefig('movie_ratings_distribution.png')
print("\n--- Feature Engineering ---")
TARGET_COLUMN = rating_column_name # Use the corrected name FEATURE_COLUMNS = [col for col in df.columns if col != TARGET_COLUMN]
CATEGORICAL_FEATURES = [col for col in FEATURE_COLUMNS if df[col].dtype == 'object'] NUMERICAL_FEATURES = [col for col in FEATURE_COLUMNS if df[col].dtype != 'object']
print("\nCategorical features:", CATEGORICAL_FEATURES) print("Numerical features:", NUMERICAL_FEATURES)
print("\n--- Data Preprocessing ---")
numerical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), # Handle missing numerical values ('scaler', StandardScaler()) ])
categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')), # Handle missing categorical values ('onehot', OneHotEncoder(handle_unknown='ignore')) ])
preprocessor = ColumnTransformer( transformers=[ ('num', numerical_transformer, NUMERICAL_FEATURES), ('cat', categorical_transformer, CATEGORICAL_FEATURES) ], remainder='passthrough' # Drop other columns if any )
print("\n--- Model Selection and Training ---")
X = df[FEATURE_COLUMNS] y = df[TARGET_COLUMN] # Use the corrected name X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models = { 'Linear Regression': LinearRegression(), 'Ridge Regression': Ridge(), 'Lasso Regression': Lasso(), 'Random Forest': RandomForestRegressor(random_state=42), 'Gradient Boosting': GradientBoostingRegressor(random_state=42) }
results = {} for name, model in models.items(): pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)]) pipeline.fit(X_train, y_train) y_pred = pipeline.predict(X_test) mse = mean_squared_error(y_test, y_pred) rmse = np.sqrt(mse) r2 = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test, y_pred) results[name] = {'MSE': mse, 'RMSE': rmse, 'R-squared': r2, 'MAE': mae}
print("\nInitial Model Evaluation:") for name, metrics in results.items(): print(f"\n{name}:") for metric, value in metrics.items(): print(f"{metric}: {value:.4f}")
print("\n--- Hyperparameter Tuning (Optional) ---")
param_grid_rf = { 'regressor__n_estimators': [100, 200, 300], 'regressor__max_depth': [None, 5, 10], 'regressor__min_samples_split': [2, 5, 10] }
grid_search_rf = GridSearchCV(Pipeline(steps=[('preprocessor', preprocessor), ('regressor', RandomForestRegressor(random_state=42))]), param_grid_rf, cv=3, # 3-fold cross-validation scoring='neg_mean_squared_error', n_jobs=-1) grid_search_rf.fit(X_train, y_train)
print("\nBest parameters for Random Forest:", grid_search_rf.best_params_) best_rf_model = grid_search_rf.best_estimator_ y_pred_best_rf = best_rf_model.predict(X_test) mse_best_rf = mean_squared_error(y_test, y_pred_best_rf) rmse_best_rf = np.sqrt(mse_best_rf) r2_best_rf = r2_score(y_test, y_pred_best_rf) mae_best_rf = mean_absolute_error(y_test, y_pred_best_rf)
print("\nBest Random Forest Model Evaluation:") print(f"Mean Squared Error (MSE): {mse_best_rf:.4f}") print(f"Root Mean Squared Error (RMSE): {rmse_best_rf:.4f}") print(f"R-squared (R²): {r2_best_rf:.4f}") print(f"Mean Absolute Error (MAE): {mae_best_rf:.4f}")
print("\n--- Model Evaluation and Interpretation ---")
if 'Random Forest' in models:
try:
feature_importances = best_rf_model.named_steps['regressor'].feature_importances_
feature_names = preprocessor.transformers_[0][2] +
list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(CATEGORICAL_FEATURES))
sorted_indices = np.argsort(feature_importances)[::-1]
plt.figure(figsize=(12, 6))
plt.title("Feature Importances (Random Forest)")
plt.bar(range(X_train.shape[1]), feature_importances[sorted_indices], align="center")
plt.xticks(range(X_train.shape[1]), np.array(feature_names)[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()
except Exception as e:
print(f"Could not plot feature importances: {e}")
'''python
Results for movie ratings prediction:
--- Exploratory Data Analysis ---
First 5 rows: Name Year Duration ... Actor 1 Actor 2 Actor 3 0 2019 120 ... Manmauji Birbal Rajendra Bhatia 1 #Gadhvi (He thought he was Gandhi) 2019 109 ... Rasika Dugal Vivek Ghamande Arvind Jangid 2 #Homecoming 2021 90 ... Sayani Gupta Plabita Borthakur Roy Angana 3 #Yaaram 2019 110 ... Prateik Ishita Raj Siddhant Kapoor 4 ...And Once Again 2010 105 ... Rajat Kapoor Rituparna Sengupta Antara Mali
[5 rows x 10 columns]
Dataset info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 15503 entries, 0 to 15502 Data columns (total 10 columns):
0 Name 15503 non-null object 1 Year 15503 non-null int64 2 Duration 15503 non-null int64 3 Genre 15503 non-null object 4 Rating 15503 non-null float64 5 Votes 15503 non-null int64 6 Director 15503 non-null object 7 Actor 1 15503 non-null object 8 Actor 2 15503 non-null object 9 Actor 3 15503 non-null object dtypes: float64(1), int64(3), object(6) memory usage: 1.2+ MB
Summary statistics: Year Duration Rating Votes count 15503.000000 15503.000000 15503.000000 15503.000000 mean 1988.094240 123.795265 5.919100 994.118235 std 25.645665 20.169313 0.990702 8348.028063 min 1913.000000 2.000000 1.100000 5.000000 25% 1969.000000 120.000000 6.000000 8.000000 50% 1992.000000 120.000000 6.000000 8.000000 75% 2011.000000 129.000000 6.000000 59.000000 max 2022.000000 321.000000 10.000000 591417.000000
Missing values per column:
Name 0
Year 0
Duration 0
Genre 0
Rating 0
Votes 0
Director 0
Actor 1 0
Actor 2 0
Actor 3 0
dtype: int64
