Skip to content

Commit

Permalink
Use global regex for TweetTokenizer's remove_handles (#2795)
Browse files Browse the repository at this point in the history
  • Loading branch information
tomaarsen committed Sep 5, 2021
1 parent ec0c03c commit d4e8c3c
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions nltk/tokenize/casual.py
Expand Up @@ -177,6 +177,11 @@
# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")

# For stripping away handles from a tweet.
# Compiled once here so that remove_handles() can reuse the same pattern
# object on every call.
#
# Both alternatives begin with the same negative lookbehind: the '@' must
# not be directly preceded by a letter, digit, underscore, or one of the
# symbols ! @ # $ % & * (so an '@' embedded in a longer token, such as an
# e-mail address, is not treated as the start of a handle).
HANDLES_RE = regex.compile(
# Alternative 1: '@' followed by exactly 20 handle characters
# ([A-Za-z0-9_]), as long as the next character is not another '@'.
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|"
# Alternative 2: '@' followed by 1 to 19 handle characters, as long as
# what follows is not a (possibly empty) run of handle characters ending
# in '@'.
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)

######################################################################
# Functions for converting html entities
Expand Down Expand Up @@ -322,13 +327,8 @@ def remove_handles(text):
"""
Remove Twitter username handles from text.
"""
pattern = regex.compile(
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|"
r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
)
# Substitute handles with ' ' to ensure that text on either
# side of removed handles are tokenized correctly
return pattern.sub(" ", text)
# Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
return HANDLES_RE.sub(" ", text)


######################################################################
Expand Down

0 comments on commit d4e8c3c

Please sign in to comment.