diff --git a/NDV_Code_By_RibkaA_Linear_Regression/LinearRegression.py b/NDV_Code_By_RibkaA_Linear_Regression/LinearRegression.py new file mode 100644 index 000000000..2d30381c3 --- /dev/null +++ b/NDV_Code_By_RibkaA_Linear_Regression/LinearRegression.py @@ -0,0 +1,80 @@ +# Import necessary libraries +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from sklearn.linear_model import LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error, r2_score + +# Load dataset +# If using CSV: df = pd.read_csv('Salary_Data.csv') +# Else, create sample dataset manually +data = { + 'YearsExperience': [1.1, 1.3, 1.5, 2.0, 2.2, 2.9, 3.0, 3.2, 3.2, 3.7, + 3.9, 4.0, 4.0, 4.1, 4.5, 4.9, 5.1, 5.3, 5.9, 6.0, + 6.8, 7.1, 7.9, 8.2, 8.7, 9.0, 9.5, 9.6, 10.3, 10.5], + 'Salary': [39343, 46205, 37731, 43525, 39891, 56642, 60150, 54445, 64445, 57189, + 63218, 55794, 56957, 57081, 61111, 67938, 66029, 83088, 81363, 93940, + 91738, 98273, 101302, 113812, 109431, 105582, 116969, 112635, 122391, 121872] +} +df = pd.DataFrame(data) + +# Display first 5 rows +print("Dataset Preview:") +print(df.head()) + +# Visualize data with scatter plot +plt.figure(figsize=(8, 5)) +sns.scatterplot(x='YearsExperience', y='Salary', data=df) +plt.title("Years of Experience vs Salary") +plt.xlabel("Years of Experience") +plt.ylabel("Salary") +plt.grid(True) +plt.show() + +# Define features and target +X = df[['YearsExperience']] +y = df['Salary'] + +# Split data into training and testing sets (80/20) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# Create and train Linear Regression model +model = LinearRegression() +model.fit(X_train, y_train) + +# Predict on test set +y_pred = model.predict(X_test) + +# Evaluate model performance +mse = mean_squared_error(y_test, y_pred) +r2 = r2_score(y_test, y_pred) +print(f"\nMean Squared Error: {mse:.2f}") +print(f"R² Score: {r2:.2f}") + +# Plot regression line over scatter plot +plt.figure(figsize=(8, 5)) +sns.scatterplot(x='YearsExperience', y='Salary', data=df, label='Actual') +plt.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted Line') +plt.title("Linear Regression Fit") +plt.xlabel("Years of Experience") +plt.ylabel("Salary") +plt.legend() +plt.grid(True) +plt.show() + +# Compare predictions vs actual values using bar chart +comparison_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred}) +comparison_df.plot(kind='bar', figsize=(10, 5)) +plt.title("Actual vs Predicted Salaries") +plt.xlabel("Sample Index") +plt.ylabel("Salary") +plt.grid(True) +plt.tight_layout() +plt.show() + +# Optional: Predict salary for user input +exp = float(input("Enter years of experience to predict salary: ")) +pred_salary = model.predict([[exp]]) +print(f"Predicted Salary for {exp} years of experience: ₹{pred_salary[0]:,.2f}")