diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..956a693 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +__pycache__ +.env +.git +.venv +.idea \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..91edc3d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.11-slim +LABEL maintainer="Dmitry Titenkov " +LABEL version="1.0" +LABEL description="Satire Pulp parser" +WORKDIR /app +COPY requirements.txt /app +RUN pip3 install -r /app/requirements.txt --no-cache-dir +COPY . . diff --git a/bot.py b/bot.py index 0c396eb..b696c54 100644 --- a/bot.py +++ b/bot.py @@ -1,13 +1,14 @@ import logging -from config import setup_logger -from dotenv import load_dotenv -from storage import ( +from bot_storage import ( get_all_users, get_last_sent_id, get_news_after_id, save_last_sent_news_id, ) +from config import setup_logger +from db_async import AsyncSessionLocal +from dotenv import load_dotenv from telegram import ( BotCommand, InlineKeyboardButton, @@ -67,22 +68,30 @@ async def send_news( async def auto_send_news(context: ContextTypes.DEFAULT_TYPE): - users = get_all_users() - if not users: - return - for chat_id in users: - last_id = get_last_sent_id(chat_id) - news_list = get_news_after_id(last_id) - if not news_list: + async with AsyncSessionLocal() as session: + users = await get_all_users(session) + if not users: return - for news_id, title, image, text, url in news_list: - try: - await send_news(chat_id, context, title, image, text, url) - save_last_sent_news_id(chat_id, news_id) - except Exception as e: - logger.error( - f"Ошибка при автоматической отправке новости: {e}" - ) + for chat_id in users: + last_id = await get_last_sent_id(chat_id, session) + news_list = await get_news_after_id(last_id, session) + if not news_list: + continue + for news in news_list: + try: + await send_news( + chat_id, + context, + news.title, + news.image, + news.text, + news.url, + ) + await save_last_sent_news_id(chat_id, news.id, session) + except Exception as e: + logger.error( + f"Ошибка при автоматической отправке новости: {e}" + ) async def menu(update: Update, context: ContextTypes.DEFAULT_TYPE): @@ -115,20 +124,28 @@ async def button_handler(update: Update, context: ContextTypes.DEFAULT_TYPE): logger.warning(f"callback_query не удалось ответить: {e}") if query.data == "send_news": chat_id = query.message.chat_id - last_id = get_last_sent_id(chat_id) - news_list = get_news_after_id(last_id) - if not news_list: - await query.message.reply_text("Новых новостей пока нет 🙁") - logger.info("Новых новостей нет") - return - for news_id, title, image, text, url in news_list: - try: - await send_news(chat_id, context, title, image, text, url) - save_last_sent_news_id(chat_id, news_id) - except Exception as e: - logger.error( - f"Ошибка при отправке новости '{title[:25]}': {e}" - ) + async with AsyncSessionLocal() as session: + last_id = await get_last_sent_id(chat_id, session) + news_list = await get_news_after_id(last_id, session) + if not news_list: + await query.message.reply_text("Новых новостей пока нет 🙁") + logger.info("Новых новостей нет") + return + for news in news_list: + try: + await send_news( + chat_id, + context, + news.title, + news.image, + news.text, + news.url, + ) + await save_last_sent_news_id(chat_id, news.id, session) + except Exception as e: + logger.error( + f"Ошибка при отправке новости '{news.title[:25]}': {e}" + ) elif query.data == "help": help_text = "Нажмите 📰 'Показать новости', чтобы получить новости." await query.message.reply_text(help_text) @@ -138,18 +155,28 @@ async def show_news_command( update: Update, context: ContextTypes.DEFAULT_TYPE ): chat_id = update.message.chat_id - last_id = get_last_sent_id(chat_id) - news_list = get_news_after_id(last_id) - if not news_list: - await update.message.reply_text("Новостей пока нет 🙁") - logger.info("Новостей нет") - return - try: - for news_id, title, image, text, url in news_list: - await send_news(chat_id, context, title, image, text, url) - save_last_sent_news_id(chat_id, news_id) - except Exception as e: - logger.error(f"Ошибка при отправке новости '{title[:25]}': {e}") + async with AsyncSessionLocal() as session: + last_id = await get_last_sent_id(chat_id, session) + news_list = await get_news_after_id(last_id, session) + if not news_list: + await update.message.reply_text("Новостей пока нет 🙁") + logger.info("Новостей нет") + return + try: + for news in news_list: + await send_news( + chat_id, + context, + news.title, + news.image, + news.text, + news.url, + ) + await save_last_sent_news_id(chat_id, news.id, session) + except Exception as e: + logger.error( + f"Ошибка при отправке новости '{news.title[:25]}': {e}" + ) async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE): diff --git a/bot_storage.py b/bot_storage.py new file mode 100644 index 0000000..5ba4e5d --- /dev/null +++ b/bot_storage.py @@ -0,0 +1,79 @@ +import logging + +from models import LastSentNews, News +from sqlalchemy import select +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +logger = logging.getLogger(__name__) + + +async def get_news_after_id(last_id: int, session: AsyncSession, n=10): + """ + Получение новостей если если id новости меньше чем last_id отправленной новости, + либо получение 10 последних новостей если last_id = 0 + """ + try: + if last_id == 0: + news = await session.execute( + select(News) + .where(News.id > last_id) + .order_by(News.id.desc()) + .limit(n) + ) + return news.scalars().all()[::-1] + else: + news = await session.execute( + select(News).where(News.id > last_id).order_by(News.id.asc()) + ) + return news.scalars().all() + except SQLAlchemyError as e: + logger.error(f"Ошибка при получении списка новостей: {e}") + raise + + +async def get_last_sent_id(chat_id: int, session: AsyncSession): + """Получение last_id последней отправленной новости""" + try: + last_sent = await session.execute( + select(LastSentNews).where(LastSentNews.chat_id == chat_id) + ) + last_sent_id = last_sent.scalar_one_or_none() + if last_sent_id: + return last_sent_id.last_news_id + else: + return 0 + except SQLAlchemyError as e: + logger.error( + f"Ошибка при получении id последней отправленной новости: {e}" + ) + + +async def save_last_sent_news_id( + chat_id: int, last_id: int, session: AsyncSession +): + """Сохранение last_id последней отправленной новости""" + try: + last_sent = await session.execute( + select(LastSentNews).where(LastSentNews.chat_id == chat_id) + ) + last_sent = last_sent.scalar_one_or_none() + if last_sent: + last_sent.last_news_id = last_id + else: + last_sent = LastSentNews(chat_id=chat_id, last_news_id=last_id) + session.add(last_sent) + await session.commit() + except SQLAlchemyError as e: + await session.rollback() + logger.error(f"Ошибка при сохранении last_news_id: {e}") + raise + + +async def get_all_users(session: AsyncSession): + """Получение id юзера""" + try: + users = await session.execute(select(LastSentNews.chat_id)) + return users.scalars().all() + except SQLAlchemyError as e: + logger.error(f"Ошибка получения chat_id пользователей: {e}") diff --git a/db_async.py b/db_async.py new file mode 100644 index 0000000..5d00ea3 --- /dev/null +++ b/db_async.py @@ -0,0 +1,12 @@ +import os + +from dotenv import load_dotenv +from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine + +load_dotenv() + + +engine = create_async_engine(os.getenv("DATABASE_URL_ASYNC"), echo=False) + + +AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False) diff --git a/db_sync.py b/db_sync.py new file mode 100644 index 0000000..1423a78 --- /dev/null +++ b/db_sync.py @@ -0,0 +1,12 @@ +import os + +from dotenv import load_dotenv +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +load_dotenv() + + +engine = create_engine(os.getenv("DATABASE_URL_SYNC")) + +SessionLocal = sessionmaker(engine) diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..2827f07 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + db: + image: postgres:15.0-alpine + volumes: + - postgres_data:/var/lib/postgresql/data/ + ports: + - "5432:5432" + env_file: + - ./.env + + bot: + build: . + restart: always + depends_on: + - db + env_file: + - ./.env + command: python3 main.py + + scheduler: + build: . + restart: always + command: python3 scheduler.py + depends_on: + - db + - bot + env_file: + - ./.env + +volumes: + postgres_data: \ No newline at end of file diff --git a/init_db.py b/init_db.py new file mode 100644 index 0000000..7fe950b --- /dev/null +++ b/init_db.py @@ -0,0 +1,22 @@ +import asyncio +import logging + +from config import setup_logger +from db_async import engine +from dotenv import load_dotenv +from models import Base, LastSentNews, News # noqa + +load_dotenv() + +setup_logger() +logger = logging.getLogger(__name__) + + +async def init(): + """Создание таблиц в базе""" + async with engine.begin() as conn: + await conn.run_sync(Base.metadata.create_all) + logger.info("...Таблицы созданы...") + + +asyncio.run(init()) diff --git a/main.py b/main.py index e39e29f..306aea6 100644 --- a/main.py +++ b/main.py @@ -12,7 +12,6 @@ ) from config import setup_logger from dotenv import load_dotenv -from storage import init_db from telegram.ext import ( ApplicationBuilder, CallbackQueryHandler, @@ -26,7 +25,6 @@ def main(): - init_db() app = ApplicationBuilder().token(os.getenv("TELEGRAM_TOKEN")).build() app.job_queue.run_repeating(auto_send_news, interval=600, first=10) app.add_handler(CommandHandler("start", menu)) diff --git a/models.py b/models.py new file mode 100644 index 0000000..89c0fe4 --- /dev/null +++ b/models.py @@ -0,0 +1,30 @@ +from datetime import datetime + +from sqlalchemy import BigInteger, DateTime, Integer, String, Text, func +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column + + +class Base(DeclarativeBase): + pass + + +class News(Base): + __tablename__ = "news" + + id: Mapped[int] = mapped_column(primary_key=True) + url: Mapped[str] = mapped_column(String, unique=True, nullable=False) + title: Mapped[str | None] = mapped_column(String, nullable=True) + image: Mapped[str | None] = mapped_column(String, nullable=True) + text: Mapped[str | None] = mapped_column(Text, nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), default=func.now(), nullable=False + ) + + +class LastSentNews(Base): + __tablename__ = "last_sent_news" + + chat_id: Mapped[int] = mapped_column(BigInteger, primary_key=True) + last_news_id: Mapped[int] = mapped_column( + Integer, nullable=True, default=0 + ) diff --git a/requirements.txt b/requirements.txt index ab8e9a0..9c4da35 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ anyio==4.12.1 APScheduler==3.11.2 +asyncpg==0.31.0 attrs==25.4.0 Automat==25.4.16 black==26.1.0 @@ -13,6 +14,7 @@ cssselect==1.3.0 defusedxml==0.7.1 filelock==3.20.3 flake8==7.3.0 +greenlet==3.3.1 h11==0.16.0 httpcore==1.0.9 httpx==0.28.1 @@ -33,6 +35,7 @@ pathspec==1.0.4 platformdirs==4.5.1 pluggy==1.6.0 Protego==0.5.0 +psycopg2-binary==2.9.11 pyasn1==0.6.2 pyasn1_modules==0.4.2 pycodestyle==2.14.0 @@ -51,6 +54,7 @@ requests-file==3.0.1 schedule==1.2.2 Scrapy==2.14.1 service-identity==24.2.0 +SQLAlchemy==2.0.46 tldextract==5.3.1 Twisted==25.5.0 typing_extensions==4.15.0 diff --git a/satire_pulp_parser/spiders/satire_pulp.py b/satire_pulp_parser/spiders/satire_pulp.py index 435170e..b4ab849 100644 --- a/satire_pulp_parser/spiders/satire_pulp.py +++ b/satire_pulp_parser/spiders/satire_pulp.py @@ -1,12 +1,9 @@ import scrapy -from storage import init_db, is_news_exists, save_news +from db_sync import SessionLocal +from spider_storage import is_news_exists, save_news class SatirePulpSpider(scrapy.Spider): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - init_db() - name = "satire_pulp" allowed_domains = ["panorama.pub"] start_urls = ["https://panorama.pub"] @@ -15,9 +12,10 @@ def parse(self, response): news_links = response.css("div.shrink-0 li a::attr(href)").getall() for link in news_links: full_link = response.urljoin(link) - if is_news_exists(full_link): - self.logger.info(f"Новость уже есть: {full_link}") - continue + with SessionLocal() as session: + if is_news_exists(full_link, session): + self.logger.info(f"Новость уже есть: {full_link}") + continue yield scrapy.Request(url=full_link, callback=self.parse_news) @@ -32,7 +30,8 @@ def parse_news(self, response): else: image = None - save_news(response.url, final_title, image, final_text) + with SessionLocal() as session: + save_news(response.url, final_title, image, final_text, session) yield { "title": title, "text": final_text, diff --git a/scheduler.py b/scheduler.py index 6a6dfb7..b7b6b32 100644 --- a/scheduler.py +++ b/scheduler.py @@ -26,11 +26,12 @@ def run_spider(): def run_scheduler(): + """Запуск планировщика, паук запускаеется каждые 20 минут""" scheduler = BlockingScheduler() try: logger.info("...Планировщик запущен...") run_spider() - scheduler.add_job(run_spider, "interval", minutes=30) + scheduler.add_job(run_spider, "interval", minutes=20) scheduler.start() except Exception as e: logger.error(f"Ошибкка в планировщике: {e}") diff --git a/spider_storage.py b/spider_storage.py new file mode 100644 index 0000000..2e9bfc4 --- /dev/null +++ b/spider_storage.py @@ -0,0 +1,27 @@ +import logging + +from models import News +from sqlalchemy.exc import SQLAlchemyError + +logger = logging.getLogger(__name__) + + +def is_news_exists(url: str, session): + """Проверка новости в базе по URL""" + try: + news = session.query(News).filter(News.url == url) + return news.first() is not None + except SQLAlchemyError as e: + logger.error(f"Ошибка проверки новости в базе: {e}") + raise + + +def save_news(url: str, title: str, image: str, text: str, session): + """Созранение новости в базе""" + try: + news = News(url=url, title=title, image=image, text=text) + session.add(news) + session.commit() + except SQLAlchemyError as e: + logger.error(f"Ошибка сохранения новости в бд: {e}") + raise diff --git a/storage.py b/storage.py deleted file mode 100644 index 9952c1d..0000000 --- a/storage.py +++ /dev/null @@ -1,96 +0,0 @@ -import sqlite3 -from datetime import datetime, timezone - -DB_NAME = "satire_pulp.db" - - -def get_connection(): - return sqlite3.connect(DB_NAME) - - -def init_db(): - with get_connection() as conn: - conn.execute(""" - CREATE TABLE IF NOT EXISTS news ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - url TEXT UNIQUE, - title TEXT, - image TEXT, - text TEXT, - create_at TEXT - ) -""") - conn.execute(""" - CREATE TABLE IF NOT EXISTS last_sent_news ( - chat_id INTEGER PRIMARY KEY, - last_news_id INTEGER - ) -""") - conn.commit() - - -def is_news_exists(url): - with get_connection() as conn: - cursor = conn.execute( - """ - SELECT 1 FROM news WHERE url = ? -""", - (url,), - ) - return cursor.fetchone() is not None - - -def save_news(url, title, image, text): - with get_connection() as conn: - conn.execute( - """ - INSERT INTO news (url, title, image, text, create_at) VALUES (?, ?, ?, ?, ?) -""", - (url, title, image, text, datetime.now(timezone.utc).isoformat()), - ) - conn.commit() - - -def get_news_after_id(last_id, n=10): - with get_connection() as conn: - cursor = conn.execute( - """ - SELECT id, title, image, text, url FROM news WHERE id > ? ORDER BY id DESC LIMIT ? -""", - (last_id, n), - ) - return cursor.fetchall()[::-1] - - -def get_last_sent_id(chat_id): - with get_connection() as conn: - cursor = conn.execute( - """ - SELECT last_news_id FROM last_sent_news WHERE chat_id = ? -""", - (chat_id,), - ) - row = cursor.fetchone() - if row: - return row[0] - else: - return 0 - - -def save_last_sent_news_id(chat_id, last_id): - with get_connection() as conn: - conn.execute( - """ - INSERT INTO last_sent_news (chat_id, last_news_id) VALUES (?, ?) - ON CONFLICT(chat_id) DO UPDATE SET last_news_id = excluded.last_news_id""", - (chat_id, last_id), - ) - conn.commit() - - -def get_all_users(): - with get_connection() as conn: - cursor = conn.execute(""" - SELECT chat_id FROM last_sent_news -""") - return [row[0] for row in cursor.fetchall()]