BNLP is a natural language processing toolkit for the Bengali language. It helps you tokenize Bengali text, embed Bengali words and documents, perform Bengali part-of-speech tagging and named entity recognition, and clean Bangla text for Bengali NLP purposes.
- Tokenization
- Embeddings
- Part of speech tagging
- Named Entity Recognition
- Text Cleaning
- Corpus
- Letters, vowels, punctuations, stopwords
- Command Line Interface (CLI)
- Pipeline API
- Batch Processing
- Async Model Loading
- Spell Checking
- Language Detection
pip install bnlp_toolkit
Or upgrade:
pip install -U bnlp_toolkit
- Python: 3.8, 3.9, 3.10, 3.11
- OS: Linux, Windows, Mac
git clone https://github.com/sagorbrur/bnlp.git
cd bnlp
python setup.py install
from bnlp import BasicTokenizer
tokenizer = BasicTokenizer()
raw_text = "আমি বাংলায় গান গাই।"
tokens = tokenizer(raw_text)
print(tokens)
# output: ["আমি", "বাংলায়", "গান", "গাই", "।"]
BNLP provides a command-line interface for quick text processing without writing Python code.
# Tokenize text
bnlp tokenize "আমি বাংলায় গান গাই।"
# Output: ['আমি', 'বাংলায়', 'গান', 'গাই', '।']
# Named Entity Recognition
bnlp ner "সজীব ঢাকায় থাকেন।"
# Part-of-Speech Tagging
bnlp pos "আমি ভাত খাই।"
# Get word embeddings (similar words)
bnlp embedding "বাংলা" --similar
# Clean text
bnlp clean "hello@example.com আমি বাংলায়" --remove-email
# Download models
bnlp download all # Download all models
bnlp download word2vec # Download specific model
# List available models
bnlp list-models
# Access corpus data
bnlp corpus stopwords
bnlp corpus letters
| Command | Description |
|---|---|
| tokenize | Tokenize Bengali text (supports: basic, nltk, sentencepiece) |
| ner | Named Entity Recognition |
| pos | Part-of-Speech tagging |
| embedding | Word embeddings (supports: word2vec, fasttext, glove) |
| clean | Text cleaning and normalization |
| download | Download pre-trained models |
| list-models | List all available models |
| corpus | Access Bengali corpus data (stopwords, letters, digits, etc.) |
# Get help
bnlp --help
bnlp tokenize --help
# Output as JSON
bnlp tokenize "আমি বাংলায় গান গাই।" --json
# Use different tokenizer
bnlp tokenize "আমি বাংলায় গান গাই।" --type nltk
# Sentence tokenization
bnlp tokenize "আমি বাংলায় গান গাই। তুমি কি গাও?" --type nltk --sentence
# Get similar words with custom count
bnlp embedding "বাংলা" --similar --topn 5
Chain multiple NLP operations together using the Pipeline API.
from bnlp import Pipeline, CleanText, BasicTokenizer
# Create a pipeline
pipeline = Pipeline([
CleanText(remove_url=True, remove_punct=True),
BasicTokenizer(),
])
# Process text through the pipeline
result = pipeline("আমি বাংলায় গান গাই।")
print(result)
# Output: ['আমি', 'বাংলায়', 'গান', 'গাই']
# Get detailed results with intermediate outputs
result = pipeline.run("আমি বাংলায় গান গাই।", return_details=True)
print(result.intermediate_results)
from bnlp import create_tokenization_pipeline, create_ner_pipeline, create_pos_pipeline
# Tokenization pipeline
tokenizer_pipeline = create_tokenization_pipeline(clean=True, tokenizer_type="basic")
tokens = tokenizer_pipeline("আমি বাংলায় গান গাই।")
# NER pipeline
ner_pipeline = create_ner_pipeline(clean=True)
entities = ner_pipeline("সজীব ঢাকায় থাকেন।")
# POS pipeline
pos_pipeline = create_pos_pipeline(clean=True)
tags = pos_pipeline("আমি ভাত খাই।")
Process multiple texts efficiently using batch processing utilities.
from bnlp import BasicTokenizer, tokenize_batch, tag_batch, clean_batch
from bnlp import BengaliNER, CleanText
# Batch tokenization
tokenizer = BasicTokenizer()
texts = ["আমি বাংলায় গান গাই।", "তুমি কোথায় যাও?", "সে বই পড়ে।"]
results = tokenize_batch(tokenizer.tokenize, texts)
print(results)
# Output: [['আমি', 'বাংলায়', ...], ['তুমি', 'কোথায়', ...], ['সে', 'বই', ...]]
# Batch NER tagging
ner = BengaliNER()
texts = ["সজীব ঢাকায় থাকেন।", "রবীন্দ্রনাথ ঠাকুর কলকাতায় জন্মগ্রহণ করেন।"]
results = tag_batch(ner.tag, texts)
# Batch text cleaning
cleaner = CleanText(remove_url=True, remove_email=True)
texts = ["email@example.com আমি", "https://example.com তুমি"]
results = clean_batch(cleaner, texts)
from bnlp import BatchProcessor, BasicTokenizer
tokenizer = BasicTokenizer()
batch = BatchProcessor(tokenizer.tokenize, max_workers=4)
texts = ["আমি বাংলায় গান গাই।"] * 100
results = batch.process(texts, show_progress=True)
Load large models in the background without blocking your application.
from bnlp import AsyncModelLoader, BengaliWord2Vec
# Create async loader with callbacks
def on_progress(progress):
    print(f"Loading: {progress.progress * 100:.0f}% - {progress.message}")
loader = AsyncModelLoader(
BengaliWord2Vec,
on_progress=on_progress,
on_complete=lambda m: print("Model ready!")
)
# Start loading in background
loader.start_loading()
# Do other work while model loads...
print("Doing other work...")
# Get model when needed (blocks until ready)
model = loader.get_model()
vector = model.get_word_vector("বাংলা")
from bnlp import LazyModelLoader, BengaliWord2Vec
# Model not loaded yet
lazy_model = LazyModelLoader(BengaliWord2Vec)
# Model loads on first access
model = lazy_model.get()
vector = model.get_word_vector("বাংলা")
from bnlp import load_model_async, BengaliWord2Vec
# One-liner to start async loading
loader = load_model_async(BengaliWord2Vec)
# Get model when ready
model = loader.get_model()
Fast and accurate Bengali spell checking using the SymSpell algorithm.
from bnlp import BengaliSpellChecker
# Create spell checker
checker = BengaliSpellChecker()
# Check if a word is spelled correctly
print(checker.is_correct("আমি")) # True
print(checker.is_correct("আমর")) # False (misspelled)
# Get spelling suggestions
suggestions = checker.suggestions("আমর")
print(suggestions)
# Output: [('আমি', 1), ('আমার', 2), ...]
# Check text for errors
text = "আমর বাংলায় গান গাই।"
errors = checker.check(text)
for error in errors:
    print(f"{error.word} -> {error.best_correction}")
# Output: আমর -> আমি
# Automatically correct text
corrected = checker.correct("আমর বাংলায় গান গাই।")
print(corrected)
# Output: আমি বাংলায় গান গাই।
from bnlp import BengaliSpellChecker
# Add custom words
checker = BengaliSpellChecker()
checker.add_word("কাস্টমশব্দ", frequency=1000)
checker.add_words(["নতুনশব্দ", "আরেকটি"], default_frequency=500)
# Or pass custom words during initialization
custom_words = {"বিএনএলপি": 1000, "এনএলপি": 900}
checker = BengaliSpellChecker(custom_words=custom_words)
# Load dictionary from file
checker.load_dictionary("my_dictionary.txt")
from bnlp import BengaliSpellChecker
# Customize edit distance (default: 2)
checker = BengaliSpellChecker(max_edit_distance=1) # Faster, fewer suggestions
# Get word probability
prob = checker.word_probability("আমি")
print(prob) # Higher = more common word
Fast and accurate language detection using FastText's language identification model. Supports 176 languages including Bengali.
# Install with language detection support
pip install bnlp_toolkit[langdetect]
from bnlp import LanguageDetector, detect_language, is_bengali
# Create detector (downloads model automatically on first use)
detector = LanguageDetector()
# Detect language
result = detector.detect("আমি বাংলায় গান গাই")
print(result.language) # 'bn'
print(result.confidence) # ~0.99
print(result.is_bengali) # True
# Get multiple predictions
result = detector.detect("আমি বাংলায় গান গাই", top_k=3)
print(result.all_predictions)
# Output: [('bn', 0.99), ('hi', 0.005), ...]
# Check if text is Bengali
print(detector.is_bengali("আমি বাংলায় গান গাই")) # True
print(detector.is_bengali("Hello world")) # False
# Detect English
result = detector.detect("Hello, this is English text")
print(result.language) # 'en'
print(result.is_bengali) # False
from bnlp import detect_language, is_bengali
# Quick language detection
result = detect_language("আমি বাংলায় গান গাই")
print(result.language) # 'bn'
# Quick Bengali check
print(is_bengali("আমি বাংলায় গান গাই")) # True
print(is_bengali("Hello world")) # False
from bnlp import LanguageDetector
detector = LanguageDetector()
texts = ["আমি বাংলায় গান গাই", "Hello world", "Bonjour le monde"]
results = detector.detect_batch(texts)
for text, result in zip(texts, results):
    print(f"{text[:20]}... -> {result.language} ({result.confidence:.2f})")
Detect code-mixed text (e.g., Bengali-English):
from bnlp import LanguageDetector
detector = LanguageDetector()
mixed_text = "আমি today বাংলায় গান গাই। This is mixed text।"
languages = detector.detect_mixed(mixed_text)
print(languages)
# Output: {'bn': 0.5, 'en': 0.5}
from bnlp import LanguageDetector
# Custom confidence threshold
detector = LanguageDetector(threshold=0.7)
# Use your own model file
detector = LanguageDetector(model_path="/path/to/lid.176.ftz")
# Disable auto-download
detector = LanguageDetector(auto_download=False)
# Get language name
print(detector.get_language_name('bn')) # 'Bengali'
print(detector.get_language_name('en')) # 'English'
Full documentation is available here.
If you are using a previous version of bnlp, check the documentation archive.
Check CONTRIBUTING.md page for details.
- Semantics Lab
- All the developers who are contributing to enrich Bengali NLP.