Code: Select all
import re
# Two regex fragments that are concatenated into one splitting pattern.
# NOTE(review): both literals look damaged, presumably by markup stripping when
# this snippet was copied -- TODO restore them from the original source:
#   * "(?|" and "(?)" are not group syntaxes accepted by Python's `re`, so
#     re.compile() below would raise re.error with the text as written;
#   * the second literal is broken across two physical lines, which a
#     non-triple-quoted raw string cannot be.
split_by_dot_pattern = r"(?|-(?=\s*\w+))"
split_by_tag_pattern = r"|(?)[\s+\n+]\s*(?=|)|[….?!](?=|
)"
# The second fragment begins with "|", so plain concatenation extends the
# alternation started by the first fragment.
sentence_split_pattern = split_by_dot_pattern + split_by_tag_pattern
# Compiled once at module level; passed to split_text_into_part() below.
sentence_split_regexp = re.compile(sentence_split_pattern)
def split_text_into_part(text, part_regexp):
    """
    Split ``text`` into the pieces lying between matches of ``part_regexp``.

    The matched separators themselves are dropped.  Every gap is reported,
    including empty ones (adjacent separators, or a separator at the very
    start or end), so the result always holds one more item than there are
    matches.

    :param text: the string to split
    :param part_regexp: compiled regular expression (any object exposing
        ``finditer``) whose matches act as separators
    :return: the parts in order of appearance; each is a dict with the keys
        ``left`` (start offset of the part in ``text``), ``length`` (number
        of characters in the part) and ``text`` (the part itself)
    :rtype: list[dict]
    """
    initial_position = 0
    parts = []
    for match in part_regexp.finditer(text):
        start, end = match.span()
        # Everything between the previous separator and this one is a part.
        parts.append({
            'left': initial_position,
            'length': start - initial_position,
            'text': text[initial_position:start],
        })
        # The next part begins right after the current separator.
        initial_position = end
    # Trailing piece after the last separator (the whole text if no match).
    parts.append({
        'left': initial_position,
        'length': len(text) - initial_position,
        'text': text[initial_position:],
    })
    return parts
# Demo input: the phrase " Show Image " with 10000 spaces on either side.
example_text = "".join([" " * 10000, " Show Image ", " " * 10000])
# Split the demo input with the module-level pattern and show the parts.
print(split_text_into_part(text=example_text, part_regexp=sentence_split_regexp))