diff --git a/nltk/tokenize/casual.py b/nltk/tokenize/casual.py index 7d0d17fc64..66f038105f 100644 --- a/nltk/tokenize/casual.py +++ b/nltk/tokenize/casual.py @@ -177,6 +177,11 @@ # These are for regularizing HTML entities to Unicode: ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);") +# For stripping away handles from a tweet: +HANDLES_RE = regex.compile( + r"(?