Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 51 additions & 29 deletions ch3/3_1_wordsteam.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# This script give you idea how stemming has been placed by using NLTK and Polyglot libraries.
# This script gives you an idea of how stemming is performed using the NLTK library.
# It is part of morphological analysis

from nltk.stem import PorterStemmer
from polyglot.text import Text, Word
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

word = "unexpected"
text = "disagreement"
Expand All @@ -17,41 +16,64 @@
text9 = "expected"
words_derv = ["happiness", "unkind"]
word_infle = ["dogs", "expected"]
words = ["unexpected", "disagreement", "disagree", "agreement", "quirkiness", "canonical" "historical"]
words = ["unexpected", "disagreement", "disagree", "agreement", "quirkiness", "canonical", "historical"]

def stemmer_porter():
    """Demonstrate the Porter stemmer on derivational and inflectional morphemes.

    Reads the module-level sample strings (``text6``-``text9``, ``word``,
    ``text``-``text5``) and prints the Porter stem of every
    whitespace-separated token, grouped under section headings.
    Returns nothing; output goes to stdout.
    """
    port = PorterStemmer()

    print("\nDerivational Morphemes")
    print(" ".join(port.stem(i) for i in text6.split()))
    print(" ".join(port.stem(i) for i in text7.split()))

    print("\nInflectional Morphemes")
    print(" ".join(port.stem(i) for i in text8.split()))
    print(" ".join(port.stem(i) for i in text9.split()))

    print("\nSome examples")
    # The seven sample strings get identical treatment, so loop instead of
    # repeating the same print statement seven times.
    for sample in (word, text, text1, text2, text3, text4, text5):
        print(" ".join(port.stem(i) for i in sample.split()))


def alternative_stemmers():
    """Demonstrate the Snowball and Lancaster stemmers and compare all three.

    Stems the module-level ``words_derv`` (derivational) and ``word_infle``
    (inflectional) lists with each alternative stemmer, then prints a
    side-by-side Porter/Snowball/Lancaster comparison table for a fixed set
    of test words. Returns nothing; output goes to stdout.
    """
    print("\n" + "=" * 60)
    print("Alternative Stemmers (Snowball and Lancaster)")
    print("=" * 60)

    snowball = SnowballStemmer("english")
    lancaster = LancasterStemmer()

    print("\nDerivational Morphemes using Snowball Stemmer")
    for w in words_derv:
        print("{:<20}{}".format(w, snowball.stem(w)))

    print("\nInflectional Morphemes using Snowball Stemmer")
    for w in word_infle:
        print("{:<20}{}".format(w, snowball.stem(w)))

    print("\nDerivational Morphemes using Lancaster Stemmer")
    for w in words_derv:
        print("{:<20}{}".format(w, lancaster.stem(w)))

    print("\nInflectional Morphemes using Lancaster Stemmer")
    for w in word_infle:
        print("{:<20}{}".format(w, lancaster.stem(w)))

    print("\nComparison of all three stemmers:")
    print("{:<20}{:<20}{:<20}{:<20}".format("Word", "Porter", "Snowball", "Lancaster"))
    print("-" * 80)
    port = PorterStemmer()
    test_words = ["happiness", "unkind", "dogs", "expected", "running", "flies"]
    for w in test_words:
        print("{:<20}{:<20}{:<20}{:<20}".format(
            w,
            port.stem(w),
            snowball.stem(w),
            lancaster.stem(w),
        ))


if __name__ == "__main__":
    # Run both demos when executed as a script; polyglot_stem() was removed
    # along with the polyglot dependency.
    stemmer_porter()
    alternative_stemmers()
205 changes: 205 additions & 0 deletions ch3/MIGRATION_GUIDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# Migration Guide: From Polyglot to NLTK Stemmers

## Quick Fix for Existing Code

If you have code using polyglot for stemming, here's how to migrate to NLTK:

### Before (Broken - Polyglot)

```python
from polyglot.text import Word

word = "happiness"
w = Word(word, language="en")
print(w.morphemes)
```

### After (Working - NLTK)

```python
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")
word = "happiness"
print(stemmer.stem(word))
```

## Common Migration Patterns

### Pattern 1: Simple Stemming

**Polyglot:**
```python
from polyglot.text import Word

words = ["running", "flies", "happiness"]
for w in words:
word_obj = Word(w, language="en")
print(f"{w} -> {word_obj.morphemes}")
```

**NLTK:**
```python
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")
words = ["running", "flies", "happiness"]
for w in words:
print(f"{w} -> {stemmer.stem(w)}")
```

### Pattern 2: Batch Processing

**Polyglot:**
```python
from polyglot.text import Text

text = Text("The runners were running quickly")
for word in text.words:
print(f"{word} -> {word.morphemes}")
```

**NLTK:**
```python
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

stemmer = SnowballStemmer("english")
text = "The runners were running quickly"
words = word_tokenize(text)
for word in words:
print(f"{word} -> {stemmer.stem(word)}")
```

### Pattern 3: Multiple Languages

**Polyglot:**
```python
from polyglot.text import Word

word_en = Word("running", language="en")
word_es = Word("corriendo", language="es")
```

**NLTK:**
```python
from nltk.stem import SnowballStemmer

stemmer_en = SnowballStemmer("english")
stemmer_es = SnowballStemmer("spanish")

print(stemmer_en.stem("running"))
print(stemmer_es.stem("corriendo"))
```

## Supported Languages in NLTK Snowball Stemmer

NLTK's SnowballStemmer supports the following languages:

- Arabic
- Danish
- Dutch
- English
- Finnish
- French
- German
- Hungarian
- Italian
- Norwegian
- Portuguese
- Romanian
- Russian
- Spanish
- Swedish
- Turkish

## When to Use Each Stemmer

### Porter Stemmer
- **Use when**: You need the most widely-used stemming algorithm
- **Pros**: Well-documented, predictable behavior
- **Cons**: Older algorithm; known to mishandle some edge cases

### Snowball Stemmer
- **Use when**: You need improved accuracy over Porter
- **Pros**: Better handling of edge cases, multi-language support
- **Cons**: Slightly slower than Porter

### Lancaster Stemmer
- **Use when**: You need aggressive stemming
- **Pros**: Very fast, reduces words to minimal stems
- **Cons**: Can over-stem, reducing accuracy

## Complete Example: Chapter 3 Code

Here's the complete updated code for Chapter 3:

```python
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

def demonstrate_stemmers():
words = ["happiness", "unkind", "dogs", "expected", "running", "flies"]

porter = PorterStemmer()
snowball = SnowballStemmer("english")
lancaster = LancasterStemmer()

print("{:<15} {:<15} {:<15} {:<15}".format(
"Word", "Porter", "Snowball", "Lancaster"
))
print("-" * 60)

for word in words:
print("{:<15} {:<15} {:<15} {:<15}".format(
word,
porter.stem(word),
snowball.stem(word),
lancaster.stem(word)
))

if __name__ == "__main__":
demonstrate_stemmers()
```

## Troubleshooting

### Issue: ModuleNotFoundError: No module named 'nltk'

**Solution:**
```bash
pip install nltk
```

### Issue: Resource 'tokenizers/punkt' not found

**Solution:**
```python
import nltk
nltk.download('punkt')
```

### Issue: Need morphological analysis, not just stemming

**Solution:** Use spaCy for lemmatization:
```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The runners were running quickly")
for token in doc:
print(f"{token.text} -> {token.lemma_}")
```

## Additional Resources

- [NLTK Stemming Documentation](https://www.nltk.org/howto/stem.html)
- [Difference between Stemming and Lemmatization](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)
- [spaCy Lemmatization Guide](https://spacy.io/usage/linguistic-features#lemmatization)

## Need Help?

If you encounter issues with this migration:
1. Check that NLTK is installed: `pip list | grep nltk`
2. Verify Python version: `python --version` (Python 3.6+ recommended)
3. Review the NLTK documentation for your specific use case
4. Consider using spaCy for more advanced NLP tasks
Loading