diff --git a/DataFrame/.classpath b/DataFrame/.classpath new file mode 100644 index 0000000..57bca72 --- /dev/null +++ b/DataFrame/.classpath @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/DataFrame/.project b/DataFrame/.project new file mode 100644 index 0000000..fbdc09f --- /dev/null +++ b/DataFrame/.project @@ -0,0 +1,17 @@ + + + DataFrame + + + + + + org.eclipse.jdt.core.javabuilder + + + + + + org.eclipse.jdt.core.javanature + + diff --git a/DataFrame/.settings/org.eclipse.core.resources.prefs b/DataFrame/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..99f26c0 --- /dev/null +++ b/DataFrame/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,2 @@ +eclipse.preferences.version=1 +encoding/=UTF-8 diff --git a/DataFrame/.settings/org.eclipse.jdt.core.prefs b/DataFrame/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..8c9943d --- /dev/null +++ b/DataFrame/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,14 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled +org.eclipse.jdt.core.compiler.codegen.targetPlatform=17 +org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve +org.eclipse.jdt.core.compiler.compliance=17 +org.eclipse.jdt.core.compiler.debug.lineNumber=generate +org.eclipse.jdt.core.compiler.debug.localVariable=generate +org.eclipse.jdt.core.compiler.debug.sourceFile=generate +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=warning +org.eclipse.jdt.core.compiler.release=enabled +org.eclipse.jdt.core.compiler.source=17 diff --git a/DataFrame/bin/Pandas/DataFrame.class b/DataFrame/bin/Pandas/DataFrame.class new file mode 100644 index 0000000..5277a21 Binary files /dev/null and b/DataFrame/bin/Pandas/DataFrame.class differ diff --git 
/**
 * A small, pandas-inspired, row-oriented table.
 *
 * <p>Rows are stored as maps from column name to cell value, and the set of
 * known column names is tracked separately in insertion order. Every row held
 * by the frame contains a key for every registered column (missing cells are
 * stored as {@code null}).
 *
 * <p>This class is not thread-safe.
 */
public class DataFrame {

    /** Number of rows returned by the no-argument {@link #head()} and {@link #tail()}. */
    private static final int DEFAULT_PREVIEW_ROWS = 5;

    /** Row storage; each map is one row keyed by column name. */
    public List<Map<String, Object>> data;

    /** Registered column names, kept in insertion order. */
    public Set<String> columns;

    /** Creates an empty frame with no rows and no columns. */
    public DataFrame() {
        this.data = new ArrayList<>();
        // LinkedHashSet keeps the order in which columns were added (e.g. the
        // CSV header order) so tabular output is stable and predictable.
        this.columns = new LinkedHashSet<>();
    }

    /**
     * Registers a column and sets it to {@code defaultValue} in every existing row.
     *
     * @param columnName   name of the column to add (or overwrite)
     * @param defaultValue value stored in every existing row for this column
     */
    public void addColumn(String columnName, Object defaultValue) {
        columns.add(columnName);
        for (Map<String, Object> row : data) {
            row.put(columnName, defaultValue);
        }
    }

    /**
     * Replaces every cell in {@code columnName} equal to {@code oldValue} with {@code newValue}.
     *
     * @param columnName column to scan
     * @param oldValue   value to look for (compared with {@link Objects#equals})
     * @param newValue   replacement value
     */
    public void replace(String columnName, Object oldValue, Object newValue) {
        for (Map<String, Object> row : data) {
            if (row.containsKey(columnName) && Objects.equals(row.get(columnName), oldValue)) {
                row.put(columnName, newValue);
            }
        }
    }

    /**
     * Returns the column names sorted alphabetically.
     *
     * @return a new, sorted list of column names
     */
    public List<String> columns() {
        List<String> columnList = new ArrayList<>(columns);
        Collections.sort(columnList);
        return columnList;
    }

    /**
     * Appends a row.
     *
     * <p>The row is copied, so the caller's map is never modified and immutable
     * maps such as {@code Map.of(...)} are accepted. Keys that are not yet
     * registered become new columns (existing rows get {@code null} for them),
     * and registered columns missing from the row are filled with {@code null}.
     *
     * @param row cell values keyed by column name
     * @throws NullPointerException if {@code row} is {@code null}
     */
    public void addRow(Map<String, Object> row) {
        Objects.requireNonNull(row, "row");
        Map<String, Object> copy = new HashMap<>(row);
        for (String key : copy.keySet()) {
            if (columns.add(key)) {
                // Newly seen column: keep the "every row has every column" invariant.
                for (Map<String, Object> existing : data) {
                    existing.putIfAbsent(key, null);
                }
            }
        }
        for (String column : columns) {
            copy.putIfAbsent(column, null);
        }
        data.add(copy);
    }

    /**
     * Returns the dimensions of the frame.
     *
     * @return a two-element array {@code {rowCount, columnCount}}
     */
    public int[] getShape() {
        return new int[] {data.size(), columns.size()};
    }

    /**
     * Removes a column from the frame and from every row. Prints a message to
     * standard output (and changes nothing) when the column does not exist.
     *
     * @param colName name of the column to remove
     */
    public void deleteColumn(String colName) {
        if (columns.contains(colName)) {
            columns.remove(colName);
            for (Map<String, Object> entity : data) {
                entity.remove(colName);
            }
        } else {
            System.out.println("Column " + colName + " does not exist.");
        }
    }

    /**
     * Removes the row at {@code index}.
     *
     * @param index zero-based row index
     * @throws IndexOutOfBoundsException if {@code index} is not a valid row index
     */
    public void deleteRow(int index) {
        if (index >= 0 && index < data.size()) {
            data.remove(index);
        } else {
            throw new IndexOutOfBoundsException("Index " + index + " is out of range.");
        }
    }

    /**
     * Returns the values of one column, in row order.
     *
     * @param columnName name of a registered column
     * @return a new list holding one value per row
     * @throws IllegalArgumentException if the column is not registered
     */
    public List<Object> getColumn(String columnName) {
        if (!columns.contains(columnName)) {
            throw new IllegalArgumentException("Column does not exist");
        }

        List<Object> columnData = new ArrayList<>(data.size());
        for (Map<String, Object> row : data) {
            columnData.add(row.get(columnName));
        }
        return columnData;
    }

    /**
     * Renames a column, keeping its position among the other columns.
     *
     * @param oldName current column name
     * @param newName new column name
     * @throws IllegalArgumentException if {@code oldName} is not registered or
     *                                  {@code newName} is already registered
     */
    public void renameColumn(String oldName, String newName) {
        if (!columns.contains(oldName)) {
            throw new IllegalArgumentException("Column '" + oldName + "' does not exist.");
        }
        if (columns.contains(newName)) {
            throw new IllegalArgumentException("Column '" + newName + "' already exists.");
        }

        // Rebuild the set so the renamed column stays where the old one was
        // instead of moving to the end.
        List<String> reordered = new ArrayList<>(columns.size());
        for (String column : columns) {
            reordered.add(column.equals(oldName) ? newName : column);
        }
        columns.clear();
        columns.addAll(reordered);

        for (Map<String, Object> row : data) {
            if (row.containsKey(oldName)) {
                row.put(newName, row.remove(oldName));
            }
        }
    }

    /**
     * Returns the row at {@code index}. The returned map is the frame's own row,
     * so changes to it are visible in the frame.
     *
     * @param index zero-based row index
     * @return the live row map
     * @throws IndexOutOfBoundsException if {@code index} is not a valid row index
     */
    public Map<String, Object> getRow(int index) {
        if (index < 0 || index >= data.size()) {
            throw new IndexOutOfBoundsException("Row index out of bounds");
        }
        return data.get(index);
    }

    /**
     * Returns a new frame holding copies of the rows whose {@code columnName}
     * cell equals {@code value}. The result has all columns of this frame.
     *
     * @param columnName column to test
     * @param value      value to match (compared with {@link Objects#equals})
     * @return a new, independent frame with the matching rows
     */
    public DataFrame filter(String columnName, Object value) {
        DataFrame filtered = new DataFrame();
        // Carry over every column, not just the one filtered on; otherwise the
        // result's shape, column list and tabular output lose the other columns.
        filtered.columns.addAll(columns);
        filtered.columns.add(columnName);

        for (Map<String, Object> row : data) {
            if (Objects.equals(row.get(columnName), value)) {
                filtered.addRow(row); // addRow stores a copy
            }
        }
        return filtered;
    }

    /**
     * Appends the contents of a CSV file to this frame.
     *
     * <p>The first line is the header. Fields may be enclosed in double quotes,
     * in which case they may contain commas and doubled quotes ({@code ""}),
     * matching what {@link #toCSV(String)} writes. Unquoted fields are trimmed.
     * All values are stored as strings. The file is read as UTF-8. Blank lines
     * are skipped when the file has more than one column. Records spanning
     * several physical lines are not supported.
     *
     * @param filePath path of the CSV file
     * @throws IOException              if the file cannot be read
     * @throws IllegalStateException    if the file is empty
     * @throws IllegalArgumentException if a row has a different number of fields than the header
     */
    public void read_csv(String filePath) throws IOException {
        try (BufferedReader br = new BufferedReader(
                new FileReader(filePath, java.nio.charset.StandardCharsets.UTF_8))) {

            String headerLine = br.readLine();
            if (headerLine == null) {
                throw new IllegalStateException("CSV file is empty");
            }
            if (headerLine.startsWith("\uFEFF")) {
                headerLine = headerLine.substring(1); // drop a UTF-8 byte-order mark
            }

            List<String> headers = parseCsvLine(headerLine);
            for (String header : headers) {
                // Only register new columns: addColumn would otherwise reset the
                // values of rows that are already in the frame.
                if (!columns.contains(header)) {
                    addColumn(header, null);
                }
            }

            String line;
            while ((line = br.readLine()) != null) {
                if (headers.size() > 1 && line.isBlank()) {
                    continue;
                }
                List<String> values = parseCsvLine(line);
                if (values.size() != headers.size()) {
                    throw new IllegalArgumentException("Row length does not match header length");
                }

                Map<String, Object> row = new HashMap<>();
                for (int i = 0; i < headers.size(); i++) {
                    row.put(headers.get(i), values.get(i));
                }
                addRow(row);
            }
        }
    }

    /** Splits one CSV line into fields, honouring double-quoted fields and {@code ""} escapes. */
    private static List<String> parseCsvLine(String line) {
        List<String> fields = new ArrayList<>();
        StringBuilder current = new StringBuilder();
        boolean inQuotes = false;
        boolean wasQuoted = false;

        for (int i = 0; i < line.length(); i++) {
            char ch = line.charAt(i);
            if (inQuotes) {
                if (ch != '"') {
                    current.append(ch);
                } else if (i + 1 < line.length() && line.charAt(i + 1) == '"') {
                    current.append('"'); // doubled quote inside a quoted field
                    i++;
                } else {
                    inQuotes = false;
                }
            } else if (ch == ',') {
                fields.add(wasQuoted ? current.toString() : current.toString().trim());
                current.setLength(0);
                wasQuoted = false;
            } else if (ch == '"' && !wasQuoted && current.toString().isBlank()) {
                current.setLength(0); // discard whitespace before the opening quote
                inQuotes = true;
                wasQuoted = true;
            } else if (!(wasQuoted && Character.isWhitespace(ch))) {
                // Whitespace between a closing quote and the next comma is ignored.
                current.append(ch);
            }
        }
        fields.add(wasQuoted ? current.toString() : current.toString().trim());
        return fields;
    }

    /**
     * Returns the first {@code num} rows. A non-positive {@code num} returns the
     * last {@code -num} rows instead.
     *
     * @param num number of rows wanted
     * @return a new list (the rows themselves are shared with the frame)
     */
    public List<Map<String, Object>> head(int num) {
        if (num > 0) {
            return new ArrayList<>(data.subList(0, Math.min(data.size(), num)));
        }
        // -Integer.MIN_VALUE overflows back to a negative number, so clamp it.
        return tail(num == Integer.MIN_VALUE ? Integer.MAX_VALUE : -num);
    }

    /**
     * Returns the first five rows (fewer if the frame is shorter).
     *
     * @return a new list of rows
     */
    public List<Map<String, Object>> head() {
        return head(DEFAULT_PREVIEW_ROWS);
    }

    /**
     * Returns the last {@code num} rows. A non-positive {@code num} yields an
     * empty list; a {@code num} larger than the frame yields every row.
     *
     * @param num number of rows wanted
     * @return a new list (the rows themselves are shared with the frame)
     */
    public List<Map<String, Object>> tail(int num) {
        int totalRows = data.size();
        int count = Math.max(0, Math.min(num, totalRows));
        // Copy instead of returning a subList view: a view would break with
        // ConcurrentModificationException as soon as rows are added or deleted.
        return new ArrayList<>(data.subList(totalRows - count, totalRows));
    }

    /**
     * Returns the last five rows (fewer if the frame is shorter).
     *
     * @return a new list of rows
     */
    public List<Map<String, Object>> tail() {
        return tail(DEFAULT_PREVIEW_ROWS);
    }

    /**
     * Prints the given rows to standard output as a left-aligned table using
     * this frame's columns, one row per line. Prints {@code No data available}
     * when there are no rows.
     *
     * @param rows rows to print, typically from {@link #head()} or {@link #tail()}
     */
    public void displayDataTabular(List<Map<String, Object>> rows) {
        if (rows == null || rows.isEmpty()) {
            System.out.println("No data available");
            return;
        }

        List<String> columnList = new ArrayList<>(columns);
        StringBuilder formatBuilder = new StringBuilder();
        StringBuilder separator = new StringBuilder();
        for (String column : columnList) {
            // Width is at least 1: "%-0s" is not a valid format specifier.
            int width = Math.max(1, column.length());
            for (Map<String, Object> row : rows) {
                width = Math.max(width, String.valueOf(row.get(column)).length());
            }
            formatBuilder.append("%-").append(width).append("s | ");
            separator.append("-".repeat(width + 3));
        }
        // Terminate every printed line; without this all rows run together.
        String rowFormat = formatBuilder.append("%n").toString();

        System.out.println();
        System.out.format(rowFormat, columnList.toArray());
        System.out.println(separator);

        for (Map<String, Object> row : rows) {
            Object[] cells = new Object[columnList.size()];
            for (int i = 0; i < cells.length; i++) {
                cells[i] = String.valueOf(row.get(columnList.get(i)));
            }
            System.out.format(rowFormat, cells);
        }
    }

    /**
     * Prints a summary to standard output: row count, and for each column the
     * number of non-null cells and the common runtime type of its values
     * ({@code Object} when mixed, {@code Unknown} when all cells are null),
     * followed by a rough size estimate of 8 bytes per cell.
     */
    public void info() {
        if (data == null || data.isEmpty()) {
            System.out.println("No data available");
            return;
        }

        int totalRows = data.size();
        System.out.println("Total rows: " + totalRows);

        // Registered columns first (in order), then any extra keys found in rows.
        Set<String> columnNames = new LinkedHashSet<>(columns);
        for (Map<String, Object> row : data) {
            columnNames.addAll(row.keySet());
        }

        System.out.println("Column Information:");
        for (String column : columnNames) {
            int nonNullCount = 0;
            Class<?> columnType = null;
            for (Map<String, Object> row : data) {
                Object value = row.get(column);
                if (value == null) {
                    continue;
                }
                nonNullCount++;
                if (columnType == null) {
                    columnType = value.getClass();
                } else if (!columnType.equals(value.getClass())) {
                    columnType = Object.class;
                }
            }
            System.out.printf(" - %s: %d non-null, type: %s%n", column, nonNullCount,
                    (columnType == null) ? "Unknown" : columnType.getSimpleName());
        }

        // Widen before multiplying so large frames do not overflow int.
        long estimatedMemoryUsage = (long) totalRows * columnNames.size() * 8L;
        System.out.println("Estimated memory usage: " + estimatedMemoryUsage + " bytes");
    }

    /**
     * Writes the frame to {@code fileName + ".csv"} as UTF-8, replacing any
     * existing file. Columns are written in alphabetical order; {@code null}
     * cells become empty fields; fields containing a comma or a double quote
     * are quoted. Prints the absolute path of the file once it is written.
     *
     * @param fileName target path without the {@code .csv} extension
     * @throws IOException if the file cannot be written
     */
    public void toCSV(String fileName) throws IOException {
        String path = fileName + ".csv";
        List<String> columnNames = columns();

        try (FileWriter writer = new FileWriter(path, java.nio.charset.StandardCharsets.UTF_8)) {
            List<String> header = new ArrayList<>(columnNames.size());
            for (String column : columnNames) {
                header.add(escapeCsv(column));
            }
            writer.write(String.join(",", header) + "\n");

            for (Map<String, Object> row : data) {
                List<String> values = new ArrayList<>(columnNames.size());
                for (String column : columnNames) {
                    Object value = row.get(column);
                    values.add(value == null ? "" : escapeCsv(value.toString()));
                }
                writer.write(String.join(",", values) + "\n");
            }
        }
        System.out.println("File created at: " + new File(path).getAbsolutePath());
    }

    /** Quotes a CSV field when it contains a comma or a double quote. */
    private static String escapeCsv(String value) {
        if (value.contains(",") || value.contains("\"")) {
            return "\"" + value.replace("\"", "\"\"") + "\"";
        }
        return value;
    }

    /**
     * Sorts the rows by one column. {@code null} cells sort first when
     * ascending and last when descending. If the sort fails, the row order is
     * left unchanged.
     *
     * @param columnName column to sort by
     * @param ascending  {@code true} for ascending order, {@code false} for descending
     * @throws IllegalArgumentException if the column is not registered or its
     *                                  values cannot be compared with each other
     */
    public void sort_values(String columnName, boolean ascending) {
        if (!columns.contains(columnName)) {
            throw new IllegalArgumentException("Column " + columnName + " not found");
        }

        Comparator<Map<String, Object>> comparator = (row1, row2) ->
                compareCells(row1.get(columnName), row2.get(columnName), ascending, columnName);

        // Sort a copy so a failing comparison cannot leave the frame half-sorted.
        List<Map<String, Object>> sorted = new ArrayList<>(data);
        sorted.sort(comparator);
        data.clear();
        data.addAll(sorted);
    }

    /** Compares two cells for {@link #sort_values}, applying the null and direction rules. */
    private static int compareCells(Object value1, Object value2, boolean ascending, String columnName) {
        if (value1 == null && value2 == null) {
            return 0;
        }
        if (value1 == null) {
            return ascending ? -1 : 1;
        }
        if (value2 == null) {
            return ascending ? 1 : -1;
        }
        if (!(value1 instanceof Comparable) || !(value2 instanceof Comparable)) {
            throw new IllegalArgumentException("Values in column " + columnName + " are not comparable");
        }

        // Swap the operands for descending order rather than negating the
        // result, which would be wrong for Integer.MIN_VALUE.
        @SuppressWarnings("unchecked") // mismatched types are reported via the catch below
        Comparable<Object> left = (Comparable<Object>) (ascending ? value1 : value2);
        Object right = ascending ? value2 : value1;
        try {
            return left.compareTo(right);
        } catch (ClassCastException e) {
            // Mixed types (e.g. Integer vs String) have no natural ordering.
            throw new IllegalArgumentException("Values in column " + columnName + " are not comparable", e);
        }
    }

    /**
     * Removes the rows from {@code startIndex} to {@code endIndex}, both inclusive.
     *
     * @param startIndex zero-based index of the first row to remove
     * @param endIndex   zero-based index of the last row to remove
     * @throws IllegalArgumentException if the range is empty, reversed or out of bounds
     */
    public void deleteRows(int startIndex, int endIndex) {
        if (startIndex < 0 || endIndex >= data.size() || startIndex > endIndex) {
            throw new IllegalArgumentException("Invalid index range for deletion");
        }
        data.subList(startIndex, endIndex + 1).clear();
    }

    /**
     * Returns the frame's row list (live, not a copy).
     *
     * @return the internal list of rows
     */
    public List<Map<String, Object>> getData() {
        return data;
    }

    /**
     * Returns the frame's column set (live, not a copy).
     *
     * @return the internal set of column names, in insertion order
     */
    public Set<String> getColumns() {
        return columns;
    }
}
Get Shape"); + System.out.println("9. Get Info"); + System.out.println("10. Replace Columns value"); + System.out.println("11. Get all Column Names"); + System.out.println("12. Get Default Head"); + System.out.println("13. Get Custom Head"); + System.out.println("14. Get Default Tail"); + System.out.println("15. Get Custom Tail"); + System.out.println("16. Generate CSV"); + System.out.println("17. Exit"); + System.out.println(); + + System.out.println("Enter your choice : "); + int choice = sc.nextInt(); + + switch(choice) + { + + case 1: + System.out.println(); + System.out.println("----- Sort by Column values -----"); + System.out.println("Enter the column name you want to sort : "); + df.sort_values(sc.next(), false); + break; + + case 2: + System.out.println(); + System.out.println("----- Delete Column -----"); + System.out.println("Enter the column name you want to delete : "); + df.deleteColumn(sc.next()); + break; + + case 3: + System.out.println(); + System.out.println("----- Delete Row -----"); + System.out.println("Enter the sample index you want to delete : "); + df.deleteRow(sc.nextInt()); + break; + + case 4: + System.out.println(); + System.out.println("----- Delete Rows in Range -----"); + System.out.println("Enter start index : "); + int start = sc.nextInt(); + System.out.println("Enter end index : "); + int end = sc.nextInt(); + df.deleteRows(start, end); + break; + + case 5: + System.out.println(); + System.out.println("----- Get Column -----"); + System.out.println("Enter the Column name you want to view:"); + List column = df.getColumn(sc.next()); + Iterator it = column.iterator(); + + while (it.hasNext()) + { + System.out.println(it.next()); + } + break; + + case 6: + System.out.println(); + System.out.println("----- Get Row -----"); + System.out.println("Enter the row index you want to view:"); + Map row = df.getRow(sc.nextInt()); + for (Map.Entry set : row.entrySet()) { + + System.out.println(set.getKey() + " : "+ set.getValue()); + } + 
break; + + case 7: + System.out.println(); + System.out.println("----- Rename Column -----"); + System.out.println("Enter the initial column name : "); + String ini = sc.next(); + String commit = sc.next(); + df.renameColumn(ini, commit); + break; + + case 8: + System.out.println(); + System.out.println("----- Get Shape -----"); + int arr[] = df.getShape(); + System.out.println("Number of Rows:" + arr[0]); + System.out.println("Number of Columns:" + arr[1]); + break; + + case 9: + System.out.println(); + System.out.println("----- Get Info -----"); + df.info(); + break; + + case 10: + System.out.println(); + System.out.println("----- Replace Columns value -----"); + System.out.println("Enter Column Name : "); + String colname = sc.next(); + System.out.println("Enter the String you want to replace : "); + Object replace = sc.next(); + System.out.println("Enter the value you want to replace with"); + Object with = sc.next(); + + df.replace(colname, replace, with); + break; + + case 11: + System.out.println(); + System.out.println("----- Get all Column Names -----"); + System.out.println("Get Column Names"); + Set colname1 = df.getColumns(); + Iterator namesIterator = colname1.iterator(); + while(namesIterator.hasNext()) { + System.out.println(namesIterator.next()); + } + break; + + case 12: + System.out.println(); + System.out.println("----- Get Default Head -----"); + df.displayDataTabular(df.head()); + System.out.println(); + break; + + case 13: + System.out.println(); + System.out.println("----- Get Custom Head -----"); + System.out.println("Enter Number of Samples: "); + df.displayDataTabular(df.head(sc.nextInt())); + System.out.println(); + break; + + case 14: + System.out.println(); + System.out.println("----- Get Default Tail -----"); + df.displayDataTabular(df.tail()); + System.out.println(); + break; + + case 15: + System.out.println(); + System.out.println("----- Get Custom Head -----"); + System.out.println("Enter Number of Samples: "); + 
df.displayDataTabular(df.tail(sc.nextInt())); + System.out.println(); + break; + + case 16: + System.out.println(); + System.out.println("----- Generate CSV -----"); + System.out.println("Enter the file name: "); + df.toCSV(sc.next()); + break; + + case 17: + System.out.println(); + System.out.println("----- Exit -----"); + System.out.println("Thank You"); + a = 0; + break; + + } + } + + + } + +} diff --git a/DataFrame/src/PandasImplementation/package-info.java b/DataFrame/src/PandasImplementation/package-info.java new file mode 100644 index 0000000..1b02e18 --- /dev/null +++ b/DataFrame/src/PandasImplementation/package-info.java @@ -0,0 +1 @@ +package PandasImplementation; \ No newline at end of file diff --git a/DataFrame/src/module-info.java b/DataFrame/src/module-info.java new file mode 100644 index 0000000..d549775 --- /dev/null +++ b/DataFrame/src/module-info.java @@ -0,0 +1,9 @@ +/** + * + */ +/** + * @author richa + * + */ +module DataFrame { +} \ No newline at end of file diff --git a/README.md b/README.md index 910ff0f..96e244a 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,42 @@ -# Buffer-5.0 -Buffer is a Data Structures and Algorithms Project Series, in which students can participate as mentees in teams of 2-4. Under Buffer 5.0, the themes on which students can create a project are: +# RiRa25-TechTribe-24-PandasLibraryinJava -1. Public Welfare -2. Tourism -3. College level applications -4. 
Custom Data structure +Domain: Custom Data Structures + +# "Developing a Pandas Library in Java for Data Manipulation and Analysis" + +There are two Java Files: + +Path of Implemented Java file on GitHub : DataFrame -> src -> Pandas -> DataFrame.java + +Path of Main Java file on GitHub : DataFrame -> src -> PandasImplementation -> implementation.java + +15 days Report Part 1: https://drive.google.com/file/d/1R18rocuTFUHpl4HoBmyhC43PM5xzve0i/view?usp=sharing + +15 days Report Part 2: https://drive.google.com/file/d/1r1XOYNLR-jNCFrGRiiXkLr8ie9ou0C76/view?usp=sharing + +Project Report: https://drive.google.com/file/d/1gQ5SXzsoPDYgvIvs_0UGrSu2wuYWXHU0/view?usp=sharing + +Video Link: https://drive.google.com/file/d/14mhADGuM3zFoIYLIVzyW4ji9yQbVPAPh/view?usp=sharing + +Drive Link: https://drive.google.com/drive/folders/1Ix1RRPqXSxsXQb-TE0vpQPoTAEkEM_DV?usp=sharing + +# Description: + +This project aims to create a DataFrame library in Java, offering functionalities similar to Python's Pandas library. The Java DataFrame will allow users to perform key data manipulation tasks such as sorting, filtering, adding/removing columns and rows, reading and generating CSV files, and more. + +Python's Pandas DataFrame is a widely used data structure that allows users to work with two-dimensional data, similar to SQL tables or Excel spreadsheets. The DataFrame's flexibility and ease of use have made it a preferred choice for data science and analysis. + +By developing a DataFrame library for Java, this project will provide Java developers with a powerful tool for data manipulation and analysis. This will enable Java developers to conduct data science tasks without needing to switch to other programming languages. + +# Data Structures used: + +1. List of Maps (`List<Map<String, Object>>`): The primary data structure for storing rows of the DataFrame. Each map represents a single row, with keys as column names and values as data. + +2. 
Set of Strings (`Set<String>`): A set is used to maintain a unique collection of column names. This set helps ensure that column names are unique and provides a way to track which columns are part of the DataFrame. + +3. ArrayList (`ArrayList`): Used in various operations, such as maintaining a list of column names in a specific order or returning a subset of rows. The `ArrayList` is preferred for its dynamic size and random-access performance. + +4. HashSet (`HashSet`): Used to store column names for fast membership checks. The `HashSet` provides quick lookup for column names, helping ensure uniqueness. + +5. HashMap (`HashMap`): The `HashMap` is used to represent individual rows within the DataFrame. It allows efficient key-based retrieval of column values and supports dynamic insertion and removal of key-value pairs. -This repository is created for all the teams to be able to upload their final project source code. While submitting, note that all the submission guidelines given are followed, and all the files are named appropiately. Also ensure that your README file contains the links of the progress reports and the drive link containing the video of the project.