import re
import struct
import unicodedata

from .regex import (
    EMAIL_REGEX,
    EMOJI_REGEX,
    HASHTAG_REGEX,
    MULTIWHITESPACE_REGEX,
    NON_ALNUMWHITESPACE_REGEX,
    TELEGRAM_LINK_REGEX,
    URL_REGEX,
)


def add_surrogate(text):
    """Return *text* with every supplementary-plane character split in two.

    Each code point in the range U+10000..U+10FFFF is replaced by the two
    UTF-16 code units (a surrogate pair) that encode it, each emitted as its
    own ``str`` character. All other characters are passed through untouched.

    NOTE(review): the tail of this expression (from the ``struct.unpack``
    format string onwards) was missing from the damaged source and has been
    reconstructed from the surviving comment and call -- confirm against
    version control.
    """
    return ''.join(
        # SMP -> Surrogate Pairs (Telegram offsets are calculated with these).
        # See https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview for more.
        ''.join(chr(y) for y in struct.unpack('<HH', x.encode('utf-16le')))
        if (0x10000 <= ord(x) <= 0x10FFFF) else x
        for x in text
    )


# NOTE(review): the source this file was recovered from is damaged between
# ``add_surrogate`` and ``normalize_string``. Only the tail of a lost
# statement survived:
#
#     ..., '\g<2>', text, flags=re.MULTILINE)
#     return text
#
# One or more definitions are missing here (possibly the users of
# MULTIWHITESPACE_REGEX and NON_ALNUMWHITESPACE_REGEX, which are imported but
# otherwise unused). Restore them from version control; they were not
# re-invented here because their names and patterns cannot be recovered.


def normalize_string(string):
    """Turn *string* into a lower-case ASCII slug.

    The input is lower-cased and stripped, spaces become ``-``, and every
    character outside ``[a-zA-Z0-9_-]`` is removed. The result is then passed
    through NFKD normalisation and an ASCII encode/decode round trip.

    NOTE(review): because the regex already removes every non-ASCII
    character, the NFKD/ASCII step cannot change the string; accented letters
    are dropped rather than transliterated. Behaviour is kept as-is so that
    previously generated values stay stable -- confirm whether normalising
    *before* the regex was intended.
    """
    string = re.sub(
        r'[^a-zA-Z0-9_\-]+',
        '',
        string.lower().strip().replace(' ', '-'),
    )
    return (
        unicodedata.normalize('NFKD', string)
        .encode('ascii', 'ignore')
        .decode('utf-8')
    )


def remove_emails(text):
    """Return *text* with every match of ``EMAIL_REGEX`` removed."""
    return re.sub(EMAIL_REGEX, '', text)


def remove_emoji(text):
    """Return *text* with ``EMOJI_REGEX`` matches and U+FE0F removed."""
    text = re.sub(EMOJI_REGEX, '', text)
    # U+FE0F (variation selector-16) may be left behind once the emoji
    # it modified has been removed, so strip it separately.
    text = re.sub('\ufe0f', '', text)
    return text


def remove_hashtags(text):
    """Return *text* with every match of ``HASHTAG_REGEX`` removed."""
    return re.sub(HASHTAG_REGEX, '', text)


def remove_url(text):
    """Return *text* with every match of ``URL_REGEX`` removed."""
    return re.sub(URL_REGEX, '', text)


def replace_telegram_link(text):
    """Replace each ``TELEGRAM_LINK_REGEX`` match with ``@`` + its group 1."""
    return re.sub(TELEGRAM_LINK_REGEX, r'@\1', text)


def split_at(s, pos):
    """Truncate *s* near index *pos* and append ``'...'``.

    Strings shorter than *pos* are returned unchanged. Otherwise the function
    scans a window that starts 10 characters before *pos* (clamped to 0) and
    spans at most 20 characters, never reaching the final character of *s*.
    The string is cut just before the first separator found in that window;
    if there is none, it is cut at the start of the window.

    NOTE(review): a string whose length equals *pos* is still truncated
    (the guard is ``<``, not ``<=``) -- kept as-is, confirm intent.
    """
    if len(s) < pos:
        return s
    pos -= 10
    pos = max(0, pos)
    for p in range(pos, min(pos + 20, len(s) - 1)):
        if s[p] in [' ', '\n', '.', ',', ':', ';', '-']:
            return s[:p] + '...'
    return s[:pos] + '...'


def unwind_hashtags(text):
    """Replace each ``HASHTAG_REGEX`` match with its second capture group."""
    return re.sub(HASHTAG_REGEX, r'\2', text)