SynthText in the Wild 数据集一共包含多少种字符?
我从官方网站下载了 SynthText in the Wild 数据集。
然后,我阅读了官方的readme.txt,但是我找不到数据集有多少个字符。 我用谷歌搜索,但找不到它...
如下面的示例图像所示,数据集中存在一些符号,例如 `.`、`:` 和 `-`。因此,此数据集包含字母(26)+数字(10)+一些符号(?)。
有人知道吗?
解决方法
我实现了自己的代码,可以对符号进行计数。
def get_characters(basedir, imagedirname='SynthText', skip_missing=False):
    """Collect and print every non-alphanumeric character used in SynthText's word annotations.

    Iterates over all entries of ``gt.mat`` through ``_gtmatRecognizer``, gathers
    the characters that are not ASCII letters or digits, writes a progress line
    to stdout for every image, and finally prints the collected characters and
    how many there are.

    :param basedir: str, directory that contains the image directory
    :param imagedirname: (Optional) str, name of the image directory holding ``gt.mat``
    :param skip_missing: (Optional) bool, if True a missing image file is logged
        and skipped; if False a FileNotFoundError is raised for it
    :return: None, the result is printed
    """
    # Compiled once; it is applied to the annotations of every image.
    alnum_run = re.compile(r'[A-Za-z0-9]+')

    class Symbols:
        """Growing set of characters with a stable printable form."""

        def __init__(self):
            self.symbols = set()

        def update(self, data):
            # In-place update; no need to build a new set for every image.
            self.symbols.update(data)

        def __len__(self):
            return len(self.symbols)

        def __str__(self):
            # Sorted so that the printed result is reproducible between runs.
            return ''.join(sorted(self.symbols))

    symbols = Symbols()

    def csvgenerator(annodir, imagedir, cbb, wBB, imname, txts, symbols, *, i, image_num, **kwargs):
        """Per-image callback: validate the annotation and record its symbols."""
        imgpath = os.path.join(imagedir, imname)
        # Check for the file BEFORE touching it. The image itself is not
        # needed for counting characters, so it is no longer decoded.
        if not os.path.exists(imgpath):
            if not skip_missing:
                raise FileNotFoundError('{} was not found'.format(imgpath))
            logging.warning('Missing image: {}'.format(imgpath))
            raise _Skip()

        # txts arrives as e.g. ['Lines:\nI lost\nKevin ', 'will ', ...]: several
        # words per entry with stray blanks, so its length differs from the
        # number of word boxes. Flatten it to one str per word.
        txts = ' '.join(txts.tolist()).split()
        text_num = len(txts)

        # loadmat drops a trailing singleton axis, so a single box arrives as
        # shape (2, 4) instead of (2, 4, 1); restore the box axis.
        if wBB.ndim == 2:
            wBB = np.expand_dims(wBB, 2)
        if cbb.ndim == 2:
            cbb = np.expand_dims(cbb, 2)

        assert text_num == wBB.shape[2], 'The length of text and wordBB must be same,but got {} and {}'.format(
            text_num, wBB.shape[2])

        # One character box is expected per character of the words.
        char_num = sum(len(text) for text in txts)
        assert char_num == cbb.shape[2], 'The length of characters and cbb must be same,but got {} and {}'.format(
            char_num, cbb.shape[2])

        # Whatever remains after removing letters and digits is a symbol.
        symbols.update(alnum_run.sub('', ''.join(txts)))

        sys.stdout.write('\r{},and number is {}...{:0.1f}% ({}/{})'.format(
            symbols, len(symbols), 100 * (float(i + 1) / image_num), i + 1, image_num))
        sys.stdout.flush()

    _gtmatRecognizer(csvgenerator, basedir, imagedirname, customLog=True, symbols=symbols)

    print()
    print('symbols are {},and number is {}'.format(symbols, len(symbols)))
def _gtmatRecognizer(generator, basedir, imagedirname='SynthText', customLog=False, **kwargs):
    """Load SynthText's ``gt.mat`` and invoke ``generator`` once per annotated image.

    The ``Annotations`` directory under ``basedir`` is created when it does not
    exist yet. The function was written to drive converters from ``gt.mat`` to
    per-image annotation files such as
    https://github.com/MhLiao/TextBoxes_plusplus/blob/master/data/example.xml

    ``generator`` is called as::

        generator(annodir, imagedir, cbb, wBB, imname, txts,
                  i=<index>, image_num=<total>, **kwargs)

    and may raise ``_Skip`` to have the current image ignored.

    :param generator: callable, per-image callback (see above)
    :param basedir: str, directory that contains the image directory
    :param imagedirname: (Optional) str, name of the image directory holding ``gt.mat``
    :param customLog: (Optional) bool, if True the built-in progress line is
        suppressed because the generator reports progress itself
    :param kwargs: extra keyword arguments forwarded unchanged to ``generator``
    :return: None
    :raises FileNotFoundError: if ``gt.mat`` does not exist in the image directory
    """
    logging.basicConfig(level=logging.INFO)

    imagedir = os.path.join(basedir, imagedirname)
    gtpath = os.path.join(imagedir, 'gt.mat')
    annodir = os.path.join(basedir, 'Annotations')

    if not os.path.exists(gtpath):
        raise FileNotFoundError('{} was not found'.format(gtpath))

    if not os.path.exists(annodir):
        # create Annotations directory
        os.mkdir(annodir)

    # Layout of gt.mat as described in
    # http://www.robots.ox.ac.uk/~vgg/data/scenetext/readme.txt
    # (NOTE(review): shapes are taken from that readme - confirm against the data):
    #   charBB : object ndarray, shape = (1, image num);
    #            each item holds character boxes, (2=(x,y), 4 corners clockwise, char num)
    #   wordBB : object ndarray, shape = (1, image num);
    #            each item holds word boxes, (2=(x,y), 4 corners clockwise, word num)
    #   imnames: object ndarray, shape = (1, image num); each item wraps the file name
    #   txt    : object ndarray, shape = (1, image num); each item holds the text strings
    logging.info('Loading {} now.\nIt may take a while.'.format(gtpath))
    gts = sio.loadmat(gtpath)
    logging.info('Loaded\n')

    # Every field carries a leading singleton axis; strip it.
    charBB = gts['charBB'][0]
    wordBB = gts['wordBB'][0]
    imnames = gts['imnames'][0]
    texts = gts['txt'][0]

    image_num = imnames.size

    for i, (cbb, wBB, imname, txts) in enumerate(zip(charBB, wordBB, imnames, texts)):
        # Each name is wrapped in a one-element array.
        imname = imname[0]

        try:
            generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs)
        except _Skip:
            # The generator asked to ignore this image (e.g. missing file).
            pass

        if not customLog:
            sys.stdout.write('\rGenerating... {:0.1f}% ({}/{})'.format(
                100 * (float(i + 1) / image_num), i + 1, image_num))
            sys.stdout.flush()

    print()
    logging.info('Finished!!!')
最后,我得到了符号的数量。看起来这些符号就是 ASCII 可打印字符中除空格以外的全部标点符号(共 32 个)。
INFO:root:Loading ~/data/text/SynthText/SynthText/gt.mat now.
It may take a while.
INFO:root:Loaded
}&|%_(],$^{+?#@/-`).<=;~['>:\!"*,and number is 32...100.0% (858750/858750)
INFO:root:Finished!!!
symbols are }&|%_(],$^{+?#@/-`).<=;~['>:\!"*,and number is 32
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。