Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 24 additions & 43 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,9 @@
"""
Instagram Profile Scraper
by SoClose Society — https://soclose.co
Digital solutions & software development studio.

Scrapes Instagram profile links from any feed page using Selenium
browser automation. Exports unique profile URLs to CSV format.

Part of the SoClose open-source automation toolkit:
https://github.com/soclosesociety

Usage:
python main.py

Environment Variables:
INSTA_USERNAME - Instagram username or email
INSTA_PASSWORD - Instagram password

License: MIT — See LICENSE file for details.
Contact: contact@soclose.co

DISCLAIMER: This tool is provided for educational purposes only.
Scraping Instagram may violate their Terms of Service.
Use responsibly and at your own risk.
"""

import csv
import logging
import os
import random
import sys
import time
time
from pathlib import Path

from bs4 import BeautifulSoup
Expand All @@ -45,6 +19,20 @@
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

def is_valid_username(username: str) -> bool:
    """Validate an Instagram username (or email) supplied for login.

    Instagram handles are 3-30 characters and may contain letters, digits,
    underscores, and periods.  The previous ``isalnum()`` check wrongly
    rejected legitimate handles such as ``john.doe`` or ``some_user`` — and
    also every email address, even though the credential prompt explicitly
    accepts "username/email".

    Args:
        username: Candidate username or email entered by the user.

    Returns:
        True if the value looks structurally valid, False otherwise.
    """
    if not (3 <= len(username) <= 30):
        return False
    # Alphanumerics plus the separators Instagram allows in handles ("." and
    # "_"), and the extra characters needed for email logins ("@", "+", "-").
    return all(ch.isalnum() or ch in "._@+-" for ch in username)

def is_valid_password(password: str) -> bool:
    """Validate an Instagram password.

    Returns:
        True when the password meets the six-character minimum length,
        False otherwise.
    """
    return len(password) >= 6

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
Expand All @@ -61,12 +49,10 @@
SCROLL_AMOUNT = 600 # Pixels to scroll down per iteration
SAVE_INTERVAL = 50 # Save to CSV every N iterations


# ---------------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------------


def create_driver() -> webdriver.Chrome:
"""Create and configure a Chrome WebDriver instance."""
options = webdriver.ChromeOptions()
Expand All @@ -80,19 +66,20 @@ def create_driver() -> webdriver.Chrome:
driver.maximize_window()
return driver


def get_credentials() -> tuple[str, str]:
    """Retrieve and validate Instagram credentials.

    Credentials come from the INSTA_USERNAME / INSTA_PASSWORD environment
    variables when set; otherwise the user is prompted interactively.  The
    interactive password prompt uses ``getpass`` so the password is not
    echoed to the terminal.  Exits the process with status 1 when either
    value fails validation.

    Returns:
        A ``(username, password)`` tuple.
    """
    import getpass  # local import: only needed on the interactive path

    username = os.getenv("INSTA_USERNAME") or input("Enter Instagram username/email: ").strip()
    password = os.getenv("INSTA_PASSWORD") or getpass.getpass("Enter Instagram password: ").strip()

    if not is_valid_username(username):
        logger.error("Invalid username. Please check the format.")
        sys.exit(1)
    if not is_valid_password(password):
        logger.error("Invalid password. Password must be at least 6 characters long.")
        sys.exit(1)

    return username, password


def login(driver: webdriver.Chrome, username: str, password: str) -> bool:
"""Log in to Instagram and return True on success."""
logger.info("Navigating to Instagram login page...")
Expand Down Expand Up @@ -122,14 +109,12 @@ def login(driver: webdriver.Chrome, username: str, password: str) -> bool:
logger.info("Login successful.")
return True


# URL path prefixes belonging to Instagram site pages (explore, settings,
# legal, etc.) rather than user profiles; links whose href starts with one
# of these is presumably filtered out during extraction — the filtering
# code itself is outside this view, so confirm against the caller.
EXCLUDED_PATHS = {
    "/explore/", "/accounts/", "/reels/", "/stories/", "/direct/",
    "/directory/", "/developer/", "/about/", "/legal/", "/privacy/",
    "/terms/", "/session/", "/emails/", "/settings/", "/nametag/",
}


def extract_profile_links(html: str) -> set[str]:
"""Extract Instagram profile links from page HTML source."""
soup = BeautifulSoup(html, "lxml")
Expand All @@ -141,7 +126,6 @@ def extract_profile_links(html: str) -> set[str]:
links.add(href)
return links


def save_to_csv(links: list[str], filepath: Path) -> None:
"""Save profile links to a CSV file."""
with open(filepath, "w", newline="", encoding="utf-8") as f:
Expand All @@ -151,15 +135,15 @@ def save_to_csv(links: list[str], filepath: Path) -> None:
writer.writerow([f"https://www.instagram.com{link}"])
logger.info("Saved %d links to %s", len(links), filepath)


def scrape_profiles(driver: webdriver.Chrome, output_file: Path) -> list[str]:
"""Scroll the feed and collect unique profile links."""
all_links: set[str] = set()
stale_count = 0
iteration = 0

logger.info("Starting scrape — scroll the page or let the script run.")
logger.info("Press Ctrl+C to stop early and save results.\n")
logger.info("Press Ctrl+C to stop early and save results.
")

try:
while stale_count < MAX_STALE_ITERATIONS:
Expand Down Expand Up @@ -199,12 +183,10 @@ def scrape_profiles(driver: webdriver.Chrome, output_file: Path) -> list[str]:

return sorted(all_links)


# ---------------------------------------------------------------------------
# Main Entry Point
# ---------------------------------------------------------------------------


def main() -> None:
"""Main entry point for the Instagram Profile Scraper."""
logger.info("Instagram Profile Scraper — by SoClose Society (soclose.co)")
Expand Down Expand Up @@ -235,6 +217,5 @@ def main() -> None:
driver.quit()
logger.info("Browser closed.")


# Script entry point: run the scraper exactly once when executed directly.
# (The source showed main() duplicated, which would start the whole scrape
# twice in a row.)
if __name__ == "__main__":
    main()