# coding: utf-8
# 2021/5/18 @ tongshiwei
import base64
import numpy as np
import re
from contextlib import contextmanager
from ..constants import Symbol, TEXT_SYMBOL, FORMULA_SYMBOL, FIGURE_SYMBOL, QUES_MARK_SYMBOL, TAG_SYMBOL, SEP_SYMBOL
class TextSegment(str):
    """String subclass that tags a plain-text piece of an item (text outside ``$...$``)."""
class QuesMarkSegment(str):
    """String subclass that tags a question-mark placeholder (SIFBlank / SIFChoice)."""
class SepSegment(str):
    """String subclass that tags a separator marker (SIFSep)."""
class SegmentList(object):
    """
    Split an item into typed segments and keep a per-type index of them.

    Every ``$...$`` span is classified by its content (figure, formula figure,
    question mark, tag, separator or LaTeX formula); everything outside
    ``$...$`` becomes a text segment.

    Parameters
    ----------
    item
        the item text to split
    figures:dict
        forwarded as ``figure_instance`` to the figure segments

    Returns
    ----------
    list
        tokenized item

    Examples
    --------
    >>> test_item = "如图所示,则三角形$ABC$的面积是$\\SIFBlank$。$\\FigureID{1}$"
    >>> SegmentList(test_item)
    ['如图所示,则三角形', 'ABC', '的面积是', '\\\\SIFBlank', '。', \\FigureID{1}]

    Attributes
    ----------
    segments
        show all segments
    text_segments
        show text segments
    formula_segments
        show formula segments
    figure_segments
        show figure segments
    ques_mark_segments
        show question mark segments
    tag_segments
        show tag segments
    describe
        show number of each elements
    """

    def __init__(self, item, figures: dict = None):
        # All segments in item order; the lists below hold positions into it.
        self._segments = []
        self._text_segments = []
        self._formula_segments = []
        self._figure_segments = []
        self._ques_mark_segments = []
        self._tag_segments = []
        self._sep_segments = []
        # Positions currently visible through ``segments``; None means "all".
        self._seg_idx = None

        # Unwrap $\textf{content,style}$: re.split returns the captured
        # content between the surrounding pieces, so joining the result keeps
        # the content and drops the markup around it.
        item_no_textf = "".join(re.split(r"\$\\textf\{([^,]+?),b?d?i?t?u?w?}\$", item))

        # The capture group makes re.split keep the $...$ spans in the output.
        for segment in re.split(r"(\$.+?\$)", item_no_textf):
            if not segment:
                continue
            if not re.match(r"\$.+?\$", segment):
                self.append(TextSegment(segment))
                continue

            # segment[1:-1] strips the surrounding ``$`` delimiters.
            content = segment[1:-1]
            if re.match(r"\$\\FormFigureID\{.+?}\$", segment):
                self.append(FigureFormulaSegment(content, is_base64=False, figure_instance=figures))
            elif re.match(r"\$\\FormFigureBase64\{.+?}\$", segment):
                self.append(FigureFormulaSegment(content, is_base64=True, figure_instance=figures))
            elif re.match(r"\$\\FigureID\{.+?}\$", segment):
                self.append(FigureSegment(content, is_base64=False, figure_instance=figures))
            elif re.match(r"\$\\FigureBase64\{.+?}\$", segment):
                self.append(FigureSegment(content, is_base64=True, figure_instance=figures))
            elif re.match(r"\$\\(SIFBlank|SIFChoice)\$", segment):
                self.append(QuesMarkSegment(content))
            elif re.match(r"\$\\SIFTag\{.+?}\$", segment):
                self.append(TagSegment(content))
            elif re.match(r"\$\\SIFSep\$", segment):
                self.append(SepSegment(content))
            else:
                # Any other $...$ span is treated as a LaTeX formula.
                self.append(LatexFormulaSegment(content))

    def __repr__(self):
        return str(self.segments)

    def __len__(self):
        return len(self._segments)

    def append(self, segment) -> None:
        """
        Add a segment and record its position in the index of its type.

        Raises
        ------
        TypeError
            if the segment is not one of the known segment types
        """
        # len(self) is the position the new segment is about to occupy.
        position = len(self)
        if isinstance(segment, TextSegment):
            self._text_segments.append(position)
        elif isinstance(segment, (LatexFormulaSegment, FigureFormulaSegment)):
            self._formula_segments.append(position)
        elif isinstance(segment, FigureSegment):
            self._figure_segments.append(position)
        elif isinstance(segment, QuesMarkSegment):
            self._ques_mark_segments.append(position)
        elif isinstance(segment, TagSegment):
            self._tag_segments.append(position)
        elif isinstance(segment, SepSegment):
            self._sep_segments.append(position)
        else:
            raise TypeError("Unknown Segment Type: %s" % type(segment))
        self._segments.append(segment)

    @property
    def segments(self):
        """return all segments, or only the visible ones while a filter is active"""
        if self._seg_idx is None:
            return self._segments
        return [s for i, s in enumerate(self._segments) if i in self._seg_idx]

    @property
    def text_segments(self):
        """return text segments"""
        return [self._segments[i] for i in self._text_segments]

    @property
    def formula_segments(self):
        """return formula segments"""
        return [self._segments[i] for i in self._formula_segments]

    @property
    def figure_segments(self):
        """return figure segments"""
        return [self._segments[i] for i in self._figure_segments]

    @property
    def ques_mark_segments(self):
        """return question mark segments"""
        return [self._segments[i] for i in self._ques_mark_segments]

    @property
    def tag_segments(self):
        """return tag segments"""
        return [self._segments[i] for i in self._tag_segments]

    def _indices_by_kind(self):
        """Map each one-letter segment kind to its list of positions."""
        return {
            "t": self._text_segments,
            "f": self._formula_segments,
            "g": self._figure_segments,
            "m": self._ques_mark_segments,
            "a": self._tag_segments,
            "s": self._sep_segments,
        }

    def to_symbol(self, idx, symbol):
        """replace the segment at position ``idx`` with ``symbol``"""
        self._segments[idx] = symbol

    def symbolize(self, to_symbolize="fgm"):
        """
        Replace every segment of the designated kinds with its symbol, in place.

        It is a good way to protect or preserve the elements which we don't
        want to tokenize.

        Parameters
        ----------
        to_symbolize:
            "t": text
            "f": formula
            "g": figure
            "m": question mark
            "a": tag
            "s": sep

        Returns
        -------
        """
        symbol_names = (
            ("t", TEXT_SYMBOL),
            ("f", FORMULA_SYMBOL),
            ("g", FIGURE_SYMBOL),
            ("m", QUES_MARK_SYMBOL),
            ("a", TAG_SYMBOL),
            ("s", SEP_SYMBOL),
        )
        indices = self._indices_by_kind()
        for kind, symbol_name in symbol_names:
            if kind not in to_symbolize:
                continue
            for idx in indices[kind]:
                self.to_symbol(idx, Symbol(symbol_name))

    @contextmanager
    def filter(self, drop: (set, str) = "", keep: (set, str) = "*"):
        """
        Temporarily restrict ``segments`` to selected kinds of segment.

        Drop means not show. Keep means show. ``drop`` always wins over
        ``keep``. The previous view is restored when the ``with`` block exits,
        including when it exits through an exception.

        Parameters
        ----------
        drop: set or str
            The alphabet should be included in "tfgmas", which means drop selected segments out of return value.
        keep: set or str
            The alphabet should be included in "tfgmas", which means only keep selected segments in return value.
        """
        dropped = set(drop)
        if keep == "*":
            kept = set("tfgmas") - dropped
        else:
            # Subtract ``drop`` for set and str ``keep`` alike.
            kept = set(keep) - dropped

        visible = set()
        for kind, positions in self._indices_by_kind().items():
            if kind in kept:
                visible.update(positions)

        # Remember the enclosing view so nested filters unwind correctly.
        previous = self._seg_idx
        self._seg_idx = visible
        try:
            yield
        finally:
            self._seg_idx = previous

    def describe(self):
        """
        Show the number of text, formula, figure and question mark segments.

        Tag and separator segments are not included in the result.
        """
        return {
            "t": len(self._text_segments),
            "f": len(self._formula_segments),
            "g": len(self._figure_segments),
            "m": len(self._ques_mark_segments),
        }
def seg(item, figures=None, symbol=None):
    r"""
    It is an interface for SegmentList. It builds the segment list for an item
    and, when ``symbol`` is given, replaces the designated kinds of segment
    with their symbols.

    Parameters
    ----------
    item
        the item text, passed to SegmentList
    figures
        passed to SegmentList
    symbol
        kinds of segment to symbolize (passed to ``SegmentList.symbolize``);
        nothing is symbolized when it is None

    Returns
    -------
    list
        segmented item

    Examples
    --------
    >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$"
    >>> s = seg(test_item)
    >>> s
    ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}]
    >>> s.describe()
    {'t': 3, 'f': 1, 'g': 1, 'm': 1}
    >>> with s.filter("f"):
    ...     s
    ['如图所示,则', '的面积是', '\\SIFBlank', '。', \FigureID{1}]
    >>> with s.filter(keep="t"):
    ...     s
    ['如图所示,则', '的面积是', '。']
    >>> with s.filter():
    ...     s
    ['如图所示,则', '\\bigtriangleup ABC', '的面积是', '\\SIFBlank', '。', \FigureID{1}]
    >>> seg(test_item, symbol="fgm")
    ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]']
    >>> seg(test_item, symbol="tfgm")
    ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']
    >>> seg(r"如图所示,则$\FormFigureID{0}$的面积是$\SIFBlank$。$\FigureID{1}$")
    ['如图所示,则', \FormFigureID{0}, '的面积是', '\\SIFBlank', '。', \FigureID{1}]
    >>> seg(r"如图所示,则$\FormFigureID{0}$的面积是$\SIFBlank$。$\FigureID{1}$", symbol="fgm")
    ['如图所示,则', '[FORMULA]', '的面积是', '[MARK]', '。', '[FIGURE]']
    >>> s.text_segments
    ['如图所示,则', '的面积是', '。']
    >>> s.formula_segments
    ['\\bigtriangleup ABC']
    >>> s.figure_segments
    [\FigureID{1}]
    >>> s.ques_mark_segments
    ['\\SIFBlank']
    >>> test_item_1 = {
    ...     "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$",
    ...     "options": ['0', '1', r'$\sqrt{2}$', '2']
    ... }
    >>> from EduNLP.utils import dict2str4sif
    >>> test_item_1_str = dict2str4sif(test_item_1)
    >>> test_item_1_str
    '$\\SIFTag{stem_begin}$...$\\SIFTag{stem_end}$$\\SIFTag{options_begin}$$\\SIFTag{list_0}$0...$\\SIFTag{options_end}$'
    >>> s1 = seg(test_item_1_str, symbol="tfgm")
    >>> s1
    ['\\SIFTag{stem_begin}'...'\\SIFTag{stem_end}', '\\SIFTag{options_begin}', '\\SIFTag{list_0}', ...]
    >>> with s1.filter(keep="a"):
    ...     s1
    [...'\\SIFTag{list_0}', '\\SIFTag{list_1}', '\\SIFTag{list_2}', '\\SIFTag{list_3}', '\\SIFTag{options_end}']
    >>> s1.tag_segments
    ['\\SIFTag{stem_begin}', '\\SIFTag{stem_end}', '\\SIFTag{options_begin}', ... '\\SIFTag{options_end}']
    >>> test_item_1_str_2 = dict2str4sif(test_item_1, tag_mode="head", add_list_no_tag=False)
    >>> seg(test_item_1_str_2, symbol="tfgmas")
    ['[TAG]', ... '[TAG]', '[TEXT]', '[SEP]', '[TEXT]', '[SEP]', '[FORMULA]', '[SEP]', '[TEXT]']
    >>> s2 = seg(test_item_1_str_2, symbol="fgm")
    >>> s2.tag_segments
    ['\\SIFTag{stem}', '\\SIFTag{options}']
    >>> test_item_2 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是"
    >>> s2 = seg(test_item_2)
    >>> s2.text_segments
    ['已知', ',则以下说法中正确的是']
    """
    segments = SegmentList(item, figures)
    # Symbolizing mutates the list in place, so the same object is returned.
    if symbol is not None:
        segments.symbolize(symbol)
    return segments