Commit 65e7ff02 by mahaisong

feat:pdftojson py 修改

parent 544bf78e
......@@ -158,8 +158,10 @@ class poc:
self.file_name = re.sub('.html', '', file)
self.data = {
# =============================================================================
# =============================================================================
# '文件名': re.sub('.html', '', file),
# =============================================================================
# =============================================================================
'发行人': self.issuer(),
# =============================================================================
# '发行人': file.split('20')[0],
......@@ -400,23 +402,27 @@ class poc:
else:
ret = ''
ret = re.sub('[:,,:“”【】 为]', '', ret)
if re.search('[0-9]', ret):
if re.search('[0-9]+', ret):
if '亿' in ret:
ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
try:
ret = str(float(ret) * 100000000)
except:
ret = ''
else:
ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
if len(ret) > 12:
ret = ''
else:
ret = re.findall(r'[一壹二贰三叁四肆五伍六陆七柒八捌九玖十拾百佰千仟万萬亿億]+', ret)
if ret:
ret = ret[0]
else:
ret = ''
if not re.search('[0-9]', ret):
if not re.search('[0-9]+', ret):
try:
ret = str(chinese_to_arabic(ret))
except Exception as err:
except:
pass
if re.match(r'\d+',ret):
return str(round(float(ret),2))
......@@ -500,36 +506,39 @@ class poc:
start, end = s.span()
ret = t[(start+1):end]
try:
if (t[end] == '-') or (t[end] == '至') or (t[end] == '、') or (t[end] == '—') or (t[end] == '及'):
if end < len(t) and ((t[end] == '-') or (t[end] == '至') or (t[end] == '、') or (t[end] == '—') or (t[end] == '及')):
a = re.findall('[^\-至、—]*?日', t[(end+1):])
if not a:
ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
print ('aa')
return ret
ret = ret+'至'+a[0]
# =============================================================================
# if re.match([0-9],ret) is False:
# return ''
# =============================================================================
# =============================================================================
# if re.match([0-9],ret) is False:
# return ''
# =============================================================================
ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
tmp_file_name = self.file_name
tmp_file_name = re.sub(r'\(.{2}\)','',tmp_file_name)
tmp_file_name = re.sub(r'(.{2})','',tmp_file_name)
tmp_file_name = re.sub(r'\.txt','',tmp_file_name)
tmp_file_name = re.sub(r'期','',tmp_file_name)
ret = re.sub(tmp_file_name,'',ret)
if tmp_file_name in ret:
print('########')
ret = ret.replace(tmp_file_name,'')
ret = re.sub('起息日','',ret)
ret= ret.split(':')[-1]
if ret[:0] =='日':
ret =ret[1:]
if re.findall('[0-9]+',ret):
return ret[1:]
ret = ret.split(':')[-1]
if re.search('[0-9]+',ret):
if ret[0] == '日':
return ret[1:]
else:
return ret
else:
return ''
except Exception as err:
print(err)
except:
pass
else:
print('warning:', t)
return ret
def value_date(self):
......@@ -584,8 +593,10 @@ if __name__ == '__main__':
# single instance:
# =============================================================================
# folder= './full'
# =============================================================================
folder= 'D:\\qqFILE\\result\\full'
folder2= 'D:\\data\\full'
filelist = os.listdir(folder)
filelist = [file for file in filelist if re.search('.txt', file)]
# file = filelist[0]
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment