Commit 65e7ff02 by mahaisong

feat:pdftojson py 修改

parent 544bf78e
...@@ -158,8 +158,10 @@ class poc: ...@@ -158,8 +158,10 @@ class poc:
self.file_name = re.sub('.html', '', file) self.file_name = re.sub('.html', '', file)
self.data = { self.data = {
# ============================================================================= # =============================================================================
# =============================================================================
# '文件名': re.sub('.html', '', file), # '文件名': re.sub('.html', '', file),
# ============================================================================= # =============================================================================
# =============================================================================
'发行人': self.issuer(), '发行人': self.issuer(),
# ============================================================================= # =============================================================================
# '发行人': file.split('20')[0], # '发行人': file.split('20')[0],
...@@ -400,7 +402,7 @@ class poc: ...@@ -400,7 +402,7 @@ class poc:
else: else:
ret = '' ret = ''
ret = re.sub('[:,,:“”【】 为]', '', ret) ret = re.sub('[:,,:“”【】 为]', '', ret)
if re.search('[0-9]', ret): if re.search('[0-9]+', ret):
if '亿' in ret: if '亿' in ret:
ret = re.sub(u'[\u4e00-\u9fff]', '', ret) ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
try: try:
...@@ -408,15 +410,19 @@ class poc: ...@@ -408,15 +410,19 @@ class poc:
except: except:
ret = '' ret = ''
else: else:
ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
if len(ret) > 12:
ret = ''
else:
ret = re.findall(r'[一壹二贰三叁四肆五伍六陆七柒八捌九玖十拾百佰千仟万萬亿億]+', ret) ret = re.findall(r'[一壹二贰三叁四肆五伍六陆七柒八捌九玖十拾百佰千仟万萬亿億]+', ret)
if ret: if ret:
ret = ret[0] ret = ret[0]
else: else:
ret = '' ret = ''
if not re.search('[0-9]', ret): if not re.search('[0-9]+', ret):
try: try:
ret = str(chinese_to_arabic(ret)) ret = str(chinese_to_arabic(ret))
except Exception as err: except:
pass pass
if re.match(r'\d+',ret): if re.match(r'\d+',ret):
return str(round(float(ret),2)) return str(round(float(ret),2))
...@@ -500,36 +506,39 @@ class poc: ...@@ -500,36 +506,39 @@ class poc:
start, end = s.span() start, end = s.span()
ret = t[(start+1):end] ret = t[(start+1):end]
try: try:
if (t[end] == '-') or (t[end] == '至') or (t[end] == '、') or (t[end] == '—') or (t[end] == '及'): if end < len(t) and ((t[end] == '-') or (t[end] == '至') or (t[end] == '、') or (t[end] == '—') or (t[end] == '及')):
a = re.findall('[^\-至、—]*?日', t[(end+1):]) a = re.findall('[^\-至、—]*?日', t[(end+1):])
if not a: if not a:
ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret) ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
print ('aa')
return ret return ret
ret = ret+'至'+a[0] ret = ret+'至'+a[0]
# ============================================================================= # =============================================================================
# if re.match([0-9],ret) is False: # if re.match([0-9],ret) is False:
# return '' # return ''
# ============================================================================= # =============================================================================
ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret) ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
tmp_file_name = self.file_name tmp_file_name = self.file_name
tmp_file_name = re.sub(r'\(.{2}\)','',tmp_file_name) tmp_file_name = re.sub(r'\(.{2}\)','',tmp_file_name)
tmp_file_name = re.sub(r'(.{2})','',tmp_file_name) tmp_file_name = re.sub(r'(.{2})','',tmp_file_name)
tmp_file_name = re.sub(r'\.txt','',tmp_file_name) tmp_file_name = re.sub(r'\.txt','',tmp_file_name)
tmp_file_name = re.sub(r'期','',tmp_file_name) tmp_file_name = re.sub(r'期','',tmp_file_name)
ret = re.sub(tmp_file_name,'',ret) if tmp_file_name in ret:
print('########')
ret = ret.replace(tmp_file_name,'')
ret = re.sub('起息日','',ret) ret = re.sub('起息日','',ret)
ret= ret.split(':')[-1] ret = ret.split(':')[-1]
if ret[:0] =='日': if re.search('[0-9]+',ret):
ret =ret[1:] if ret[0] == '日':
if re.findall('[0-9]+',ret):
return ret[1:] return ret[1:]
else: else:
return ret
else:
return '' return ''
except Exception as err: except:
print(err) pass
else: else:
print('warning:', t) print('warning:', t)
return ret return ret
def value_date(self): def value_date(self):
...@@ -584,8 +593,10 @@ if __name__ == '__main__': ...@@ -584,8 +593,10 @@ if __name__ == '__main__':
# single instance: # single instance:
# =============================================================================
# folder= './full'
# =============================================================================
folder= 'D:\\qqFILE\\result\\full' folder= 'D:\\qqFILE\\result\\full'
folder2= 'D:\\data\\full'
filelist = os.listdir(folder) filelist = os.listdir(folder)
filelist = [file for file in filelist if re.search('.txt', file)] filelist = [file for file in filelist if re.search('.txt', file)]
# file = filelist[0] # file = filelist[0]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment