Skip to content

Commit

Permalink
lazy load stopwords, add regex default to str.replace, fix packages v…
Browse files Browse the repository at this point in the history
…ersions
  • Loading branch information
jbesomi committed Jul 1, 2021
1 parent 8f64d3e commit cfcda43
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 17 deletions.
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@ python_requires = >=3.6.1
install_requires =
numpy>=1.17
scikit-learn>=0.22
spacy>=2.2.2
spacy<3.0.0
tqdm>=4.3
nltk>=3.3
plotly>=4.2.0
pandas>=1.0.2
wordcloud>=1.5.0
unidecode>=1.1.1
gensim>=3.6.0
gensim>=3.6.0,<4.0
matplotlib>=3.1.0
# TODO pick the correct version.
[options.extras_require]
Expand Down
27 changes: 13 additions & 14 deletions texthero/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@
import unidecode
from nltk.stem import PorterStemmer, SnowballStemmer

from texthero import stopwords as _stopwords

from typing import List, Callable


# Ignore gensim's annoying warnings
import warnings

Expand Down Expand Up @@ -69,9 +66,9 @@ def replace_digits(input: pd.Series, symbols: str = " ", only_blocks=True) -> pd

if only_blocks:
pattern = r"\b\d+\b"
return input.str.replace(pattern, symbols)
return input.str.replace(pattern, symbols, regex=True)
else:
return input.str.replace(r"\d+", symbols)
return input.str.replace(r"\d+", symbols, regex=True)


def remove_digits(input: pd.Series, only_blocks=True) -> pd.Series:
Expand Down Expand Up @@ -128,7 +125,7 @@ def replace_punctuation(input: pd.Series, symbol: str = " ") -> pd.Series:
dtype: object
"""

return input.str.replace(rf"([{string.punctuation}])+", symbol)
return input.str.replace(rf"([{string.punctuation}])+", symbol, regex=True)


def remove_punctuation(input: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -187,7 +184,7 @@ def remove_whitespace(input: pd.Series) -> pd.Series:
dtype: object
"""

return input.str.replace("\xa0", " ").str.split().str.join(" ")
return input.str.replace("\xa0", " ", regex=False).str.split().str.join(" ")


def _replace_stopwords(text: str, words: Set[str], symbol: str = " ") -> str:
Expand Down Expand Up @@ -249,6 +246,8 @@ def replace_stopwords(
"""

if stopwords is None:
from texthero import stopwords as _stopwords

stopwords = _stopwords.DEFAULT
return input.apply(_replace_stopwords, args=(stopwords, symbol))

Expand Down Expand Up @@ -444,7 +443,7 @@ def remove_round_brackets(s: pd.Series):
:meth:`remove_square_brackets`
"""
return s.str.replace(r"\([^()]*\)", "")
return s.str.replace(r"\([^()]*\)", "", regex=True)


def remove_curly_brackets(s: pd.Series):
Expand All @@ -466,7 +465,7 @@ def remove_curly_brackets(s: pd.Series):
:meth:`remove_square_brackets`
"""
return s.str.replace(r"\{[^{}]*\}", "")
return s.str.replace(r"\{[^{}]*\}", "", regex=True)


def remove_square_brackets(s: pd.Series):
Expand All @@ -490,7 +489,7 @@ def remove_square_brackets(s: pd.Series):
"""
return s.str.replace(r"\[[^\[\]]*\]", "")
return s.str.replace(r"\[[^\[\]]*\]", "", regex=True)


def remove_angle_brackets(s: pd.Series):
Expand All @@ -513,7 +512,7 @@ def remove_angle_brackets(s: pd.Series):
:meth:`remove_square_brackets`
"""
return s.str.replace(r"<[^<>]*>", "")
return s.str.replace(r"<[^<>]*>", "", regex=True)


def remove_brackets(s: pd.Series):
Expand Down Expand Up @@ -567,7 +566,7 @@ def remove_html_tags(s: pd.Series) -> pd.Series:
| &([a-z0-9]+|\#[0-9]{1,6}|\#x[0-9a-f]{1,6}); # Remove &nbsp;
"""

return s.str.replace(pattern, "")
return s.str.replace(pattern, "", regex=True)


def tokenize(s: pd.Series) -> pd.Series:
Expand Down Expand Up @@ -595,7 +594,7 @@ def tokenize(s: pd.Series) -> pd.Series:
rf"((\w)([{string.punctuation}])(?:\B|$)|(?:^|\B)([{string.punctuation}])(\w))"
)

return s.str.replace(pattern, r"\2 \3 \4 \5").str.split()
return s.str.replace(pattern, r"\2 \3 \4 \5", regex=True).str.split()


def tokenize_with_phrases(s: pd.Series, min_count: int = 5, threshold: int = 10):
Expand Down Expand Up @@ -661,7 +660,7 @@ def replace_urls(s: pd.Series, symbol: str) -> pd.Series:

pattern = r"http\S+"

return s.str.replace(pattern, symbol)
return s.str.replace(pattern, symbol, regex=True)


def remove_urls(s: pd.Series) -> pd.Series:
Expand Down
2 changes: 1 addition & 1 deletion texthero/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series:

return (
s.str.replace(
pattern, r"\2 \3"
pattern, r"\2 \3", regex=True
) # \2 and \3 keep the characters around the punctuation.
.str.split() # now split by space
.explode() # one word for each line
Expand Down

0 comments on commit cfcda43

Please sign in to comment.