From c5048c6b6e13d4151b3ba3c4ac6a73b94eb24d26 Mon Sep 17 00:00:00 2001 From: Dima Davydov Date: Wed, 27 Nov 2024 13:54:50 +0300 Subject: [PATCH 1/7] do method for dividing a line --- .../api_openai/OpenAIEmbeddingsAPI.java | 64 +++++++++++++++++++ .../configurations/RedisConfig.java | 24 +++++++ .../controller/DataController.java | 8 ++- .../controllers/EmbeddingController.java | 36 +++++++++++ .../models/CompanyEmbeddingsData.java | 13 ++++ .../models/EmbeddingData.java | 15 +++++ .../services/EmbeddingService.java | 47 ++++++++++++++ .../services/TextServiceImpl.java | 31 ++++++++- src/main/resources/application.properties | 6 ++ 9 files changed, 239 insertions(+), 5 deletions(-) create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/api_openai/OpenAIEmbeddingsAPI.java create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/configurations/RedisConfig.java create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/controllers/EmbeddingController.java create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/models/CompanyEmbeddingsData.java create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/models/EmbeddingData.java create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/services/EmbeddingService.java diff --git a/src/main/java/ru/hackteam/window_of_knowledge/api_openai/OpenAIEmbeddingsAPI.java b/src/main/java/ru/hackteam/window_of_knowledge/api_openai/OpenAIEmbeddingsAPI.java new file mode 100644 index 0000000..3a07cc5 --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/api_openai/OpenAIEmbeddingsAPI.java @@ -0,0 +1,64 @@ +package ru.hackteam.window_of_knowledge.api_openai; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; + + +import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; 
+import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + + +@Service +public class OpenAIEmbeddingsAPI { + private final ObjectMapper objectMapper; + private final String API_KEY = System.getenv("API_KEY"); + @Value(value = "${api.url.embeddings}") + private String API_URL; + + private final HttpClient httpClient; + + + public OpenAIEmbeddingsAPI() { + this.httpClient = HttpClient.newHttpClient(); + this.objectMapper = new ObjectMapper(); + } + + + public double[] getEmbeddings(String text) throws IOException, InterruptedException { + Map requestBody = new HashMap<>(); + requestBody.put("input", text); + requestBody.put("model", "text-embedding-ada-002"); + + String jsonBody = objectMapper.writeValueAsString(requestBody); + + HttpRequest request = HttpRequest.newBuilder() + .uri(URI.create(API_URL)) + .header("Content-Type", "application/json") + .header("Authorization", "Bearer " + API_KEY) + .POST(HttpRequest.BodyPublishers.ofString(jsonBody, StandardCharsets.UTF_8)) + .build(); + + HttpResponse response = httpClient.send(request, HttpResponse.BodyHandlers.ofString()); + + if (response.statusCode() != 200) { + throw new RuntimeException("Ошибка при запросе к OpenAI API: " + response.body()); + } + + // Разбор JSON-ответа + Map responseMap = objectMapper.readValue(response.body(), Map.class); + Map data = (Map) ((java.util.List) responseMap.get("data")).get(0); + java.util.List embeddingList = (java.util.List) data.get("embedding"); + + // Преобразование списка в массив + return embeddingList.stream().mapToDouble(Double::doubleValue).toArray(); + + + } +} diff --git a/src/main/java/ru/hackteam/window_of_knowledge/configurations/RedisConfig.java b/src/main/java/ru/hackteam/window_of_knowledge/configurations/RedisConfig.java new file mode 100644 index 0000000..4334e96 --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/configurations/RedisConfig.java @@ -0,0 +1,24 @@ +package 
ru.hackteam.window_of_knowledge.configurations; + + +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.data.redis.connection.RedisConnectionFactory; +import org.springframework.data.redis.connection.lettuce.LettuceConnectionFactory; +import org.springframework.data.redis.core.RedisTemplate; + +@Configuration +public class RedisConfig { + @Bean + public RedisTemplate redisTemplateEmbeddings(RedisConnectionFactory redisConnectionFactory) { + + LettuceConnectionFactory factory = new LettuceConnectionFactory(); + factory.setDatabase(0); // Подключение к базе 0 + factory.afterPropertiesSet(); + + RedisTemplate template = new RedisTemplate<>(); + template.setConnectionFactory(redisConnectionFactory); + return template; + } + +} diff --git a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java index 14800ed..3354ca1 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java @@ -4,8 +4,10 @@ import org.springframework.http.MediaType; import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; -import ru.hackteam.window_of_knowledge.model.TextData; -import ru.hackteam.window_of_knowledge.service.*; +import ru.hackteam.window_of_knowledge.models.TextData; +import ru.hackteam.window_of_knowledge.services.*; + +import java.util.List; @RestController @RequestMapping(path = "data") @@ -34,7 +36,7 @@ public String saveTexFiletFormat(@RequestParam MultipartFile avatar) { } @PostMapping(path = "text") - public String saveTextFormat(@RequestBody TextData textData) { + public List saveTextFormat(@RequestBody TextData textData) { return textServiceImpl.saveTextToBd(textData); } diff --git 
a/src/main/java/ru/hackteam/window_of_knowledge/controllers/EmbeddingController.java b/src/main/java/ru/hackteam/window_of_knowledge/controllers/EmbeddingController.java new file mode 100644 index 0000000..93ed622 --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/controllers/EmbeddingController.java @@ -0,0 +1,36 @@ +package ru.hackteam.window_of_knowledge.controllers; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.web.bind.annotation.*; +import ru.hackteam.window_of_knowledge.services.EmbeddingService; + +import java.io.IOException; +import java.util.Map; + +@RestController +@RequestMapping("/api/embeddings") +public class EmbeddingController { + + private final EmbeddingService embeddingService; + + @Autowired + public EmbeddingController(EmbeddingService embeddingService) { + this.embeddingService = embeddingService; + } + + @PostMapping("/process") + public String processText(@RequestParam String id, @RequestParam String text) { + try { + embeddingService.processText(id, text); + return "Text processed and saved!"; + } catch (IOException | InterruptedException e) { + e.printStackTrace(); + return "Error: " + e.getMessage(); + } + } + + @GetMapping("/{id}") + public Map getCompanyData(@PathVariable String id) { + return embeddingService.getCompanyData(id); + } +} diff --git a/src/main/java/ru/hackteam/window_of_knowledge/models/CompanyEmbeddingsData.java b/src/main/java/ru/hackteam/window_of_knowledge/models/CompanyEmbeddingsData.java new file mode 100644 index 0000000..91140d5 --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/models/CompanyEmbeddingsData.java @@ -0,0 +1,13 @@ +package ru.hackteam.window_of_knowledge.models; + +import lombok.Getter; +import lombok.Setter; + +import java.util.List; + +@Setter +@Getter +public class CompanyEmbeddingsData { + private String id; + private List texts; +} diff --git a/src/main/java/ru/hackteam/window_of_knowledge/models/EmbeddingData.java 
b/src/main/java/ru/hackteam/window_of_knowledge/models/EmbeddingData.java new file mode 100644 index 0000000..8c6ab03 --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/models/EmbeddingData.java @@ -0,0 +1,15 @@ +package ru.hackteam.window_of_knowledge.models; + +import lombok.Getter; +import lombok.Setter; + +import java.util.List; + + + +@Getter +@Setter +public class EmbeddingData { + private String text; + private List embedding; +} diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/EmbeddingService.java b/src/main/java/ru/hackteam/window_of_knowledge/services/EmbeddingService.java new file mode 100644 index 0000000..f41ac6c --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/EmbeddingService.java @@ -0,0 +1,47 @@ +package ru.hackteam.window_of_knowledge.services; + +import org.springframework.data.redis.core.RedisTemplate; +import org.springframework.stereotype.Service; +import ru.hackteam.window_of_knowledge.api_openai.OpenAIEmbeddingsAPI; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + + +@Service +public class EmbeddingService { + private final OpenAIEmbeddingsAPI openAIEmbeddingsAPI; + private final RedisTemplate redisTemplate; + + public EmbeddingService(OpenAIEmbeddingsAPI openAIEmbeddingsAPI, RedisTemplate redisTemplate) { + this.openAIEmbeddingsAPI = openAIEmbeddingsAPI; + this.redisTemplate = redisTemplate; + } + + + public void processText(String id, String text) throws InterruptedException, IOException { + double[] embeddings = openAIEmbeddingsAPI.getEmbeddings(text); + saveToRedis(id, text, embeddings); + } + + private void saveToRedis(String id, String text, double[] embeddings) { + + Map companyData = (Map) redisTemplate.opsForValue().get(id); + if (companyData == null) { + companyData = new HashMap<>(); + companyData.put("id", id); + companyData.put("texts", new HashMap()); + } + + Map texts = (Map) companyData.get("texts"); + texts.put(text, embeddings); + 
+ redisTemplate.opsForValue().set(id, companyData); + } + + public Map getCompanyData(String id) { + return (Map) redisTemplate.opsForValue().get(id); + } +} + diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/TextServiceImpl.java b/src/main/java/ru/hackteam/window_of_knowledge/services/TextServiceImpl.java index c9da1e6..8c1403e 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/TextServiceImpl.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/TextServiceImpl.java @@ -3,9 +3,36 @@ import org.springframework.stereotype.Service; import ru.hackteam.window_of_knowledge.models.TextData; +import java.util.ArrayList; +import java.util.List; + @Service public class TextServiceImpl { - public String saveTextToBd(TextData textData) { - return "Ваш текстовый файл добавлен с форматом: " + textData; + public List saveTextToBd(TextData textData) { + String text = textData.getTextData(); + String[] words = text.split("\\s+"); // Разделение строки на слова + List chunks = new ArrayList<>(); + + StringBuilder chunk = new StringBuilder(); + int wordCount = 0; + + for (String word : words) { + chunk.append(word).append(" "); + wordCount++; + + // Если достигли 200 слов или конец текста, добавляем в список + if (wordCount == 200) { + chunks.add(chunk.toString().trim()); + chunk.setLength(0); // Очистка StringBuilder + wordCount = 0; + } + } + + // Добавляем последний оставшийся кусок, если он не пуст + if (chunk.length() > 0) { + chunks.add(chunk.toString().trim()); + } + + return chunks; } } diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index a4fd1eb..3bdaa3c 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -1 +1,7 @@ spring.application.name=WindowOfKnowledge +api.url.embeddings=https://api.openai.com/v1/embeddings +spring.jpa.properties.hibernate.dialect=org.hibernate.dialect.PostgreSQLDialect 
+spring.data.redis.host=localhost +spring.data.redis.port=6379 +spring.data.redis.database=0 +spring.autoconfigure.exclude=org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration From bd1fec03093550bb8eecd1203698be5ce211065b Mon Sep 17 00:00:00 2001 From: Dima Davydov Date: Wed, 27 Nov 2024 17:16:17 +0300 Subject: [PATCH 2/7] did method for getting text from pdf --- pom.xml | 180 ++++++++++-------- .../controller/DataController.java | 16 +- .../services/ExcelServiceImpl.java | 127 +++++++++++- .../services/ExtractData.java | 2 + .../services/NotionServiceImpl.java | 1 + .../services/PdfService.java | 45 +++++ .../services/TextFileServiceImpl.java | 1 + 7 files changed, 282 insertions(+), 90 deletions(-) create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java diff --git a/pom.xml b/pom.xml index b6c1c56..65cdb5b 100644 --- a/pom.xml +++ b/pom.xml @@ -1,60 +1,72 @@ - 4.0.0 - - org.springframework.boot - spring-boot-starter-parent - 3.4.0 - - - ru.hackteam - WindowOfKnowledge - 0.0.1-SNAPSHOT - WindowOfKnowledge - Demo project for Spring Boot - - - - - - - - - - - - - - - 17 - - - - org.springframework.boot - spring-boot-starter-web - + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 + + org.springframework.boot + spring-boot-starter-parent + 3.4.0 + + + ru.hackteam + WindowOfKnowledge + 0.0.1-SNAPSHOT + WindowOfKnowledge + Demo project for Spring Boot + + + + + + + + + + + + + + + 17 + + + + org.springframework.boot + spring-boot-starter-web + + + + org.postgresql + postgresql + runtime + + + org.projectlombok + lombok + true + + + org.springframework.boot + spring-boot-starter-test + test + + + org.springdoc + springdoc-openapi-starter-webmvc-ui + 2.6.0 + + + org.apache.pdfbox + pdfbox + 2.0.29 + + + + org.apache.poi + poi-ooxml + 5.2.3 + - - org.postgresql - postgresql - runtime - - - org.projectlombok - lombok - true - - - org.springframework.boot - 
spring-boot-starter-test - test - - - org.springdoc - springdoc-openapi-starter-webmvc-ui - 2.6.0 - org.springframework.boot spring-boot-starter-data-jpa @@ -62,39 +74,39 @@ compile - - com.vladmihalcea - hibernate-types-52 - 2.21.1 - + + com.vladmihalcea + hibernate-types-52 + 2.21.1 + - - org.springframework.boot - spring-boot-starter-data-redis - - - com.fasterxml.jackson.core - jackson-databind - + + org.springframework.boot + spring-boot-starter-data-redis + + + com.fasterxml.jackson.core + jackson-databind + - + - - - - org.springframework.boot - spring-boot-maven-plugin - - - - org.projectlombok - lombok - - - - - - + + + + org.springframework.boot + spring-boot-maven-plugin + + + + org.projectlombok + lombok + + + + + + diff --git a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java index 3354ca1..ed6ff12 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java @@ -16,11 +16,15 @@ public class DataController { private ExtractData extractData; @Autowired public TextServiceImpl textServiceImpl; + @Autowired + public ExcelServiceImpl excelServiceImpl; + + @Autowired + private PdfService pdfService; @PostMapping(value = "excel", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) - public String saveExcelFormat(@RequestParam MultipartFile avatar) { - extractData = new ExcelServiceImpl(); - return extractData.saveDataToBd(avatar); + public String saveExcelFormat(@RequestParam MultipartFile avatar, @RequestParam(required = false) String startCell1, @RequestParam(required = false) String startCell2) { + return excelServiceImpl.saveDataToBd(avatar, startCell1, startCell2); } @PostMapping(value = "notion", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) @@ -40,6 +44,12 @@ public List saveTextFormat(@RequestBody TextData textData) { return 
textServiceImpl.saveTextToBd(textData); } + + @PostMapping(value = "pdf", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public String convertPdf(@RequestParam("file") MultipartFile file, @RequestParam(value = "startPage", required = false, defaultValue = "0") Integer startPage, + @RequestParam(value = "endPage", required = false, defaultValue = "0") Integer endPage) { + return pdfService.convertPdfToText(file, startPage, endPage); + } } diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/ExcelServiceImpl.java b/src/main/java/ru/hackteam/window_of_knowledge/services/ExcelServiceImpl.java index b368605..cf6c5ea 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/ExcelServiceImpl.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/ExcelServiceImpl.java @@ -1,11 +1,132 @@ package ru.hackteam.window_of_knowledge.services; +import org.apache.poi.ss.usermodel.*; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; +import java.io.InputStream; +import java.io.FileWriter; +import java.io.IOException; + @Service -public class ExcelServiceImpl implements ExtractData { - public String saveDataToBd(MultipartFile avatar) { - return "Ваш excel файл добавлен с форматом: " + avatar; +public class ExcelServiceImpl { + + public String saveDataToBd(MultipartFile avatar, String startCell, String endCell) { + StringBuilder resultText = new StringBuilder(); // Используем StringBuilder для накопления данных + + // Указываем путь для сохранения текста + String textFilePath = "output.txt"; + + try (InputStream fis = avatar.getInputStream(); + Workbook workbook = new XSSFWorkbook(fis); + FileWriter writer = new FileWriter(textFilePath)) { + + // Получаем первый лист Excel + Sheet sheet = workbook.getSheetAt(0); + + // Проверяем, были ли указаны начальная и конечная ячейки + if (startCell != null && !startCell.isEmpty() && endCell != null && 
!endCell.isEmpty()) { + // Преобразуем строки вида "E3", "G4" в индексы + CellReference startCellRef = new CellReference(startCell); + CellReference endCellRef = new CellReference(endCell); + + int startRow = startCellRef.getRow(); + int startCol = startCellRef.getCol(); + int endRow = endCellRef.getRow(); + int endCol = endCellRef.getCol(); + + // Итерация по строкам и столбцам в указанном диапазоне + for (int rowIndex = startRow; rowIndex <= endRow; rowIndex++) { + Row row = sheet.getRow(rowIndex); + if (row != null) { // Проверяем на null, если строка пуста + for (int colIndex = startCol; colIndex <= endCol; colIndex++) { + Cell cell = row.getCell(colIndex); + if (cell != null) { // Проверяем, если ячейка не null + String cellValue = getCellValueAsString(cell); + if (!cellValue.isEmpty()) { + resultText.append(cellValue).append("\t"); + } + } + } + writer.write("\n"); // Переход на новую строку + } + } + } else { + // Если ячейки не указаны, обрабатываем весь файл + for (Row row : sheet) { + for (Cell cell : row) { + if (cell != null) { + String cellValue = getCellValueAsString(cell); + if (!cellValue.isEmpty()) { + resultText.append(cellValue).append("\t"); + } + } + } + writer.write("\n"); // Переход на новую строку + } + } + + System.out.println("Excel файл успешно преобразован в текст!"); + return resultText.toString(); + + } catch (IOException e) { + e.printStackTrace(); + return "Ошибка при обработке файла!"; + } + } + + // Вынесенный метод для преобразования значения ячейки в строку + private static String getCellValueAsString(Cell cell) { + switch (cell.getCellType()) { + case STRING: + return cell.getStringCellValue(); + case NUMERIC: + if (DateUtil.isCellDateFormatted(cell)) { + return cell.getDateCellValue().toString(); + } else { + return String.valueOf(cell.getNumericCellValue()); + } + case BOOLEAN: + return String.valueOf(cell.getBooleanCellValue()); + case FORMULA: + return cell.getCellFormula(); + default: + return ""; + } + } + + // 
Преобразование строки вида "E3" в индекс строки и столбца + public static class CellReference { + private final int row; + private final int col; + + public CellReference(String cellRef) { + // Разделяем строку на букву (столбец) и цифры (строка) + String columnPart = cellRef.replaceAll("[^A-Za-z]", ""); + String rowPart = cellRef.replaceAll("[^0-9]", ""); + + // Преобразуем буквы в индекс столбца (A -> 0, B -> 1, ..., Z -> 25) + col = convertColStringToIndex(columnPart); + // Строка в Excel - это просто число (считается с 1), поэтому уменьшаем на 1 + row = Integer.parseInt(rowPart) - 1; + } + + public int getRow() { + return row; + } + + public int getCol() { + return col; + } + + // Преобразуем буквы в индекс столбца (A = 0, B = 1, ...) + private int convertColStringToIndex(String colString) { + int colIndex = 0; + for (int i = 0; i < colString.length(); i++) { + colIndex = colIndex * 26 + (colString.charAt(i) - 'A' + 1); + } + return colIndex - 1; // Индексация с 0 + } } } diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/ExtractData.java b/src/main/java/ru/hackteam/window_of_knowledge/services/ExtractData.java index f3e988a..ca62012 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/ExtractData.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/ExtractData.java @@ -4,4 +4,6 @@ public interface ExtractData { String saveDataToBd(MultipartFile avatar); + +// String saveDataToBd(MultipartFile avatar, String startCell, String endCell); } diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/NotionServiceImpl.java b/src/main/java/ru/hackteam/window_of_knowledge/services/NotionServiceImpl.java index 5aa9754..9dd9692 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/NotionServiceImpl.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/NotionServiceImpl.java @@ -7,4 +7,5 @@ public class NotionServiceImpl implements ExtractData { public String saveDataToBd(MultipartFile 
avatar) { return "Ваш notion файл добавлен с форматом: " + avatar; } + } diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java b/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java new file mode 100644 index 0000000..5282163 --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java @@ -0,0 +1,45 @@ +package ru.hackteam.window_of_knowledge.services; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import java.io.IOException; +import java.io.InputStream; + +@Service +public class PdfService { + + public String convertPdfToText(MultipartFile pdfFile, int startPage, int endPage) { + String extractedText = ""; + + try (InputStream inputStream = pdfFile.getInputStream(); + PDDocument document = PDDocument.load(inputStream)) { + + PDFTextStripper pdfStripper = new PDFTextStripper(); + + // Если startPage и endPage не указаны, обрабатываем весь файл + if (startPage != 0) { + pdfStripper.setStartPage(startPage); + } else { + pdfStripper.setStartPage(1); + } + + if (endPage != 0) { + pdfStripper.setEndPage(endPage); + } else { + pdfStripper.setEndPage(document.getNumberOfPages()); + } + + // Извлечение текста из PDF + extractedText = pdfStripper.getText(document); + + } catch (IOException e) { + e.printStackTrace(); + return "Ошибка при обработке PDF файла: " + e.getMessage(); + } + + return extractedText; + } +} diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java b/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java index a8ea727..6c2ad74 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java @@ -8,4 +8,5 @@ public class TextFileServiceImpl implements 
ExtractData { public String saveDataToBd(MultipartFile avatar) { return "Ваш text файл добавлен с форматом: " + avatar; } + } From d72e49f4611dfbdf04f619d27250dfc93820f941 Mon Sep 17 00:00:00 2001 From: Dima Davydov Date: Thu, 28 Nov 2024 09:32:54 +0300 Subject: [PATCH 3/7] did method for getting list string from pdf --- .../controller/DataController.java | 4 +-- .../services/PdfService.java | 35 ++++++++++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java index ed6ff12..b4c83ae 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java @@ -46,8 +46,8 @@ public List saveTextFormat(@RequestBody TextData textData) { @PostMapping(value = "pdf", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) - public String convertPdf(@RequestParam("file") MultipartFile file, @RequestParam(value = "startPage", required = false, defaultValue = "0") Integer startPage, - @RequestParam(value = "endPage", required = false, defaultValue = "0") Integer endPage) { + public List convertPdf(@RequestParam("file") MultipartFile file, @RequestParam(value = "startPage", required = false, defaultValue = "0") Integer startPage, + @RequestParam(value = "endPage", required = false, defaultValue = "0") Integer endPage) { return pdfService.convertPdfToText(file, startPage, endPage); } } diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java b/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java index 5282163..f86c010 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java @@ -7,11 +7,13 @@ import java.io.IOException; import java.io.InputStream; +import 
java.util.ArrayList; +import java.util.List; @Service public class PdfService { - public String convertPdfToText(MultipartFile pdfFile, int startPage, int endPage) { + public List convertPdfToText(MultipartFile pdfFile, int startPage, int endPage) { String extractedText = ""; try (InputStream inputStream = pdfFile.getInputStream(); @@ -35,11 +37,36 @@ public String convertPdfToText(MultipartFile pdfFile, int startPage, int endPage // Извлечение текста из PDF extractedText = pdfStripper.getText(document); - } catch (IOException e) { + } + catch (IOException e) { e.printStackTrace(); - return "Ошибка при обработке PDF файла: " + e.getMessage(); + System.out.println(e.getMessage()); } - return extractedText; + String[] words = extractedText.split("\\s+"); // Разделение строки на слова + List chunks = new ArrayList<>(); + + StringBuilder chunk = new StringBuilder(); + int wordCount = 0; + + for (String word : words) { + chunk.append(word).append(" "); + wordCount++; + + // Если достигли 200 слов или конец текста, добавляем в список + if (wordCount == 200) { + chunks.add(chunk.toString().trim()); + chunk.setLength(0); // Очистка StringBuilder + wordCount = 0; + } + + } + + if (chunk.length() > 0) { + chunks.add(chunk.toString().trim()); + } + + return chunks; } } + From e116be8cbe12f6a2f6472db3885c86e86a3e921c Mon Sep 17 00:00:00 2001 From: Dima Davydov Date: Thu, 28 Nov 2024 10:10:34 +0300 Subject: [PATCH 4/7] did method for getting list string from TXT --- .../controller/DataController.java | 8 ++- .../services/TextFileServiceImpl.java | 58 +++++++++++++++++-- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java index b4c83ae..cdaabe5 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java 
@@ -19,6 +19,9 @@ public class DataController { @Autowired public ExcelServiceImpl excelServiceImpl; + @Autowired + public TextFileServiceImpl textFileService; + @Autowired private PdfService pdfService; @@ -34,9 +37,8 @@ public String saveNotionFormat(@RequestParam MultipartFile avatar) { } @PostMapping(value = "text-file", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) - public String saveTexFiletFormat(@RequestParam MultipartFile avatar) { - extractData = new TextFileServiceImpl(); - return extractData.saveDataToBd(avatar); + public List saveTexFiletFormat(@RequestParam MultipartFile avatar) { + return textFileService.saveDataToBd(avatar); } @PostMapping(path = "text") diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java b/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java index 6c2ad74..a6ca442 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/TextFileServiceImpl.java @@ -3,10 +3,60 @@ import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + @Service -public class TextFileServiceImpl implements ExtractData { - public String saveDataToBd(MultipartFile avatar) { - return "Ваш text файл добавлен с форматом: " + avatar; - } +public class TextFileServiceImpl { + + public List saveDataToBd(MultipartFile txtFile) { + List textChunks = new ArrayList<>(); + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(txtFile.getInputStream()))) { + StringBuilder textBuilder = new StringBuilder(); + String line; + + // Считываем весь текст из файла + while ((line = reader.readLine()) != null) { + textBuilder.append(line).append(" "); + } + + // Разделяем текст на слова + String fullText = 
textBuilder.toString().trim(); + String[] words = fullText.split("\\s+"); + if (words.length <= 200) { + // Если меньше 200 слов, добавляем весь текст как одну часть + textChunks.add(fullText); + } else { + // Если больше 200 слов, делим на части по 200 слов + StringBuilder chunk = new StringBuilder(); + int wordCount = 0; + + for (String word : words) { + chunk.append(word).append(" "); + wordCount++; + + if (wordCount == 200) { + textChunks.add(chunk.toString().trim()); + chunk.setLength(0); // очищаем для следующей части + wordCount = 0; + } + } + + // Добавляем остаток слов, если он есть + if (chunk.length() > 0) { + textChunks.add(chunk.toString().trim()); + } + } + } catch (IOException e) { + e.printStackTrace(); + textChunks.add("Ошибка при чтении файла: " + e.getMessage()); + } + + return textChunks; + } } From accfd91e45b21202c8a418fd21ef6dc9cc370fba Mon Sep 17 00:00:00 2001 From: Dima Davydov Date: Thu, 28 Nov 2024 11:26:22 +0300 Subject: [PATCH 5/7] did method for getting list string from Docx --- .../controller/DataController.java | 12 ++++ .../services/DocxService.java | 63 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/services/DocxService.java diff --git a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java index cdaabe5..996e8b3 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java @@ -19,6 +19,9 @@ public class DataController { @Autowired public ExcelServiceImpl excelServiceImpl; + @Autowired + public DocxService docxService; + @Autowired public TextFileServiceImpl textFileService; @@ -52,6 +55,15 @@ public List convertPdf(@RequestParam("file") MultipartFile file, @Reques @RequestParam(value = "endPage", required = false, defaultValue = "0") Integer endPage) 
{ return pdfService.convertPdfToText(file, startPage, endPage); } + + @PostMapping(value = "docx-file", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) + public List saveDocxFiletFormat(@RequestParam MultipartFile avatar) { + return docxService.processDocxFile(avatar); + } + + + + } diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/DocxService.java b/src/main/java/ru/hackteam/window_of_knowledge/services/DocxService.java new file mode 100644 index 0000000..939118f --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/DocxService.java @@ -0,0 +1,63 @@ +package ru.hackteam.window_of_knowledge.services; + +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.springframework.stereotype.Service; +import org.springframework.web.multipart.MultipartFile; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +@Service +public class DocxService { + + public List processDocxFile(MultipartFile docxFile) { + List textChunks = new ArrayList<>(); + + try (InputStream inputStream = docxFile.getInputStream(); + XWPFDocument document = new XWPFDocument(inputStream)) { + + StringBuilder textBuilder = new StringBuilder(); + + // Извлекаем текст из всех параграфов документа + for (XWPFParagraph paragraph : document.getParagraphs()) { + textBuilder.append(paragraph.getText()).append(" "); + } + + // Получаем полный текст и делим его на слова + String fullText = textBuilder.toString().trim(); + String[] words = fullText.split("\\s+"); + + // Если меньше 200 слов, добавляем весь текст как одну часть + if (words.length <= 200) { + textChunks.add(fullText); + } else { + // Если больше 200 слов, делим на части по 200 слов + StringBuilder chunk = new StringBuilder(); + int wordCount = 0; + + for (String word : words) { + chunk.append(word).append(" "); + wordCount++; + if (wordCount == 200) { + textChunks.add(chunk.toString().trim()); 
+ chunk.setLength(0); // Очищаем для новой части + wordCount = 0; + } + } + + // Добавляем остаток слов, если он есть + if (chunk.length() > 0) { + textChunks.add(chunk.toString().trim()); + } + } + + } catch (IOException e) { + e.printStackTrace(); + textChunks.add("Ошибка при обработке файла: " + e.getMessage()); + } + + return textChunks; + } +} From 82153c8a881dbc913cf7aff3e10cdd71cf407c32 Mon Sep 17 00:00:00 2001 From: Dima Davydov Date: Thu, 28 Nov 2024 14:27:21 +0300 Subject: [PATCH 6/7] change structure --- .../DataController.java | 10 +- .../services/PdfService.java | 102 +++++++++--------- .../services/UrlTextService.java | 48 +++++++++ 3 files changed, 108 insertions(+), 52 deletions(-) rename src/main/java/ru/hackteam/window_of_knowledge/{controller => controllers}/DataController.java (90%) create mode 100644 src/main/java/ru/hackteam/window_of_knowledge/services/UrlTextService.java diff --git a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java b/src/main/java/ru/hackteam/window_of_knowledge/controllers/DataController.java similarity index 90% rename from src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java rename to src/main/java/ru/hackteam/window_of_knowledge/controllers/DataController.java index 996e8b3..e17f2ad 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/controller/DataController.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/controllers/DataController.java @@ -1,4 +1,4 @@ -package ru.hackteam.window_of_knowledge.controller; +package ru.hackteam.window_of_knowledge.controllers; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.MediaType; @@ -28,6 +28,9 @@ public class DataController { @Autowired private PdfService pdfService; + @Autowired + private UrlTextService urlTextService; + @PostMapping(value = "excel", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) public String saveExcelFormat(@RequestParam MultipartFile avatar, 
@RequestParam(required = false) String startCell1, @RequestParam(required = false) String startCell2) { return excelServiceImpl.saveDataToBd(avatar, startCell1, startCell2); @@ -61,6 +64,11 @@ public List saveDocxFiletFormat(@RequestParam MultipartFile avatar) { return docxService.processDocxFile(avatar); } + @PostMapping(path = "url") + public String saveUrlTest(@RequestParam String url){ + return urlTextService.getTextFromUrl(url); + } + diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java b/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java index f86c010..ee22a6e 100644 --- a/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/PdfService.java @@ -1,72 +1,72 @@ -package ru.hackteam.window_of_knowledge.services; + package ru.hackteam.window_of_knowledge.services; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.text.PDFTextStripper; -import org.springframework.stereotype.Service; -import org.springframework.web.multipart.MultipartFile; + import org.apache.pdfbox.pdmodel.PDDocument; + import org.apache.pdfbox.text.PDFTextStripper; + import org.springframework.stereotype.Service; + import org.springframework.web.multipart.MultipartFile; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; + import java.io.IOException; + import java.io.InputStream; + import java.util.ArrayList; + import java.util.List; -@Service -public class PdfService { + @Service + public class PdfService { - public List convertPdfToText(MultipartFile pdfFile, int startPage, int endPage) { - String extractedText = ""; + public List convertPdfToText(MultipartFile pdfFile, int startPage, int endPage) { + String extractedText = ""; - try (InputStream inputStream = pdfFile.getInputStream(); - PDDocument document = PDDocument.load(inputStream)) { + try (InputStream inputStream = 
pdfFile.getInputStream(); + PDDocument document = PDDocument.load(inputStream)) { - PDFTextStripper pdfStripper = new PDFTextStripper(); + PDFTextStripper pdfStripper = new PDFTextStripper(); - // Если startPage и endPage не указаны, обрабатываем весь файл - if (startPage != 0) { - pdfStripper.setStartPage(startPage); - } else { - pdfStripper.setStartPage(1); - } + // Если startPage и endPage не указаны, обрабатываем весь файл + if (startPage != 0) { + pdfStripper.setStartPage(startPage); + } else { + pdfStripper.setStartPage(1); + } + + if (endPage != 0) { + pdfStripper.setEndPage(endPage); + } else { + pdfStripper.setEndPage(document.getNumberOfPages()); + } + + // Извлечение текста из PDF + extractedText = pdfStripper.getText(document); - if (endPage != 0) { - pdfStripper.setEndPage(endPage); - } else { - pdfStripper.setEndPage(document.getNumberOfPages()); + } + catch (IOException e) { + e.printStackTrace(); + System.out.println(e.getMessage()); } - // Извлечение текста из PDF - extractedText = pdfStripper.getText(document); + String[] words = extractedText.split("\\s+"); // Разделение строки на слова + List chunks = new ArrayList<>(); - } - catch (IOException e) { - e.printStackTrace(); - System.out.println(e.getMessage()); - } + StringBuilder chunk = new StringBuilder(); + int wordCount = 0; - String[] words = extractedText.split("\\s+"); // Разделение строки на слова - List chunks = new ArrayList<>(); + for (String word : words) { + chunk.append(word).append(" "); + wordCount++; - StringBuilder chunk = new StringBuilder(); - int wordCount = 0; + // Если достигли 200 слов или конец текста, добавляем в список + if (wordCount == 200) { + chunks.add(chunk.toString().trim()); + chunk.setLength(0); // Очистка StringBuilder + wordCount = 0; + } - for (String word : words) { - chunk.append(word).append(" "); - wordCount++; + } - // Если достигли 200 слов или конец текста, добавляем в список - if (wordCount == 200) { + if (chunk.length() > 0) { 
chunks.add(chunk.toString().trim()); - chunk.setLength(0); // Очистка StringBuilder - wordCount = 0; } + return chunks; } - - if (chunk.length() > 0) { - chunks.add(chunk.toString().trim()); - } - - return chunks; } -} diff --git a/src/main/java/ru/hackteam/window_of_knowledge/services/UrlTextService.java b/src/main/java/ru/hackteam/window_of_knowledge/services/UrlTextService.java new file mode 100644 index 0000000..f767388 --- /dev/null +++ b/src/main/java/ru/hackteam/window_of_knowledge/services/UrlTextService.java @@ -0,0 +1,48 @@ +package ru.hackteam.window_of_knowledge.services; + +import org.springframework.stereotype.Service; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +@Service +public class UrlTextService { + + public String getTextFromUrl(String urlString) { + StringBuilder result = new StringBuilder(); + HttpURLConnection connection = null; + + try { + // Создаем URL-объект и открываем соединение + URL url = new URL(urlString); + connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + + // Проверяем статус ответа (200 — успешный запрос) + int responseCode = connection.getResponseCode(); + if (responseCode == HttpURLConnection.HTTP_OK) { + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(connection.getInputStream()))) { + + String line; + while ((line = reader.readLine()) != null) { + result.append(line).append("\n"); + } + } + } else { + result.append("Ошибка подключения: ").append(responseCode); + } + + } catch (IOException e) { + result.append("Ошибка при получении текста с URL: ").append(e.getMessage()); + } finally { + if (connection != null) { + connection.disconnect(); + } + } + + return result.toString(); + } +} \ No newline at end of file From 8bf154b920693e81f9adcd2ac7644c53266648cd Mon Sep 17 00:00:00 2001 From: "Dmitry Davydov 16 y.o" 
<166521432+SaikiKusuo16yearsold@users.noreply.github.com> Date: Thu, 28 Nov 2024 16:44:31 +0300 Subject: [PATCH 7/7] Update README.md --- README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6fa250a..48f20ba 100644 --- a/README.md +++ b/README.md @@ -1 +1,38 @@ -# HackathonWindowKnowledge \ No newline at end of file + +## API Reference + + +http://localhost:8080/question/ask/{assistant_id}/{conversation_id} - +Принимает id ассистента и id пользователя и даёт ответ +assistant_id - id ассистента +conversation_id - id пользователя + + + + +POST /data/pdf + +http://localhost:8080/data/pdf - принимает PDF-файл и преобразует его в текст, разбивая на чанки по 200 слов +pdf - PDF-файл +Принимает PDF-файл и преобразует его в текст, разбивая на чанки по 200 слов + + + +http://localhost:8080/data/text - принимает текст и создаёт список чанков по 200 слов + + + +http://localhost:8080/data/docx-file - принимает DOCX-файл и преобразует его в текст, разбивая на чанки по 200 слов +avatar - DOCX-файл +Принимает DOCX-файл и преобразует его в текст, разбивая на чанки по 200 слов + +http://localhost:8080/data/text-file - принимает текстовый файл и преобразует его в текст, разбивая на чанки по 200 слов +text - текстовый файл +Принимает текстовый файл и преобразует его в текст, разбивая на чанки по 200 слов + + + + + + +