Commit b2cdc14f by mahaisong

feat:合并pdftojson、更新python。 整体提交一次。

parent e45ab078
This source diff could not be displayed because it is too large. You can view the blob instead.
b.ToTable("AspNetUsers");
b.ToTable("AspNetUsers");
b.ToTable("AspNetRoles");
b.ToTable("AspNetRoleClaims");
b.ToTable("AspNetUserClaims");
b.ToTable("AspNetUserLogins");
b.ToTable("AspNetUserRoles");
b.ToTable("AspNetUserTokens");
IdentityResources
IdentityClaims
ApiResources
ApiSecrets
ApiScopes
ApiClaims
ApiScopeClaims
Clients
ClientGrantTypes
ClientRedirectUris
ClientPostLogoutRedirectUris
ClientScopes
ClientSecrets
ClientClaims
ClientIdPRestrictions
ClientCorsOrigins
ClientProperties
PersistedGrants
½һ
-- Turn off foreign-key enforcement for this session so the table below can be
-- dropped and recreated even while other tables reference it.
-- NOTE(review): no matching `SET FOREIGN_KEY_CHECKS=1` is visible in this
-- excerpt; confirm the full script re-enables the checks at the end.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for `apiclaims`
-- ----------------------------
-- Destructive: any existing `ApiClaims` table (and its rows) is removed first.
DROP TABLE IF EXISTS `ApiClaims`;
-- One row per claim type attached to an API resource. `ApiResourceId` is
-- indexed and references `ApiResources`.`Id`; deleting the parent resource
-- cascades to its claim rows (ON DELETE CASCADE).
CREATE TABLE `ApiClaims` (
`Id` int(11) NOT NULL AUTO_INCREMENT,
`ApiResourceId` int(11) NOT NULL,
`Type` varchar(200) NOT NULL,
PRIMARY KEY (`Id`) USING BTREE,
KEY `FK_ApiClaims_ApiResources_ApiResourceId` (`ApiResourceId`) USING BTREE,
CONSTRAINT `FK_ApiClaims_ApiResources_ApiResourceId` FOREIGN KEY (`ApiResourceId`) REFERENCES `ApiResources` (`Id`) ON DELETE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT;
:: Converts IIS W3C log files to CSV with Microsoft Log Parser 2.2.
:: For every *.log found under %sourcePath% (searched recursively by `for /r`),
:: one CSV named after the log file is written to D:\Log\output\119\.
echo off
echo off
:: Source directory (original comment was mis-encoded; presumably "source directory").
set sourcePath=D:\Log\input\119log\
:: Target path (original comment was mis-encoded; presumably "target path"):
:: switch to the Log Parser install directory so `logparser` resolves.
cd /d C:\Program Files (x86)\Log Parser 2.2
:: Each output row holds the URI stem, query string, HTTP status, client IP,
:: time taken, and a TimeStamp built from the log's date and time fields.
:: NOTE(review): the FROM clause uses the relative path input\119log\<name>.log,
:: which is resolved against the current directory (changed by the `cd` above),
:: not against %sourcePath%. It also drops any sub-directory that `for /r`
:: descended into. Confirm this is intended; %%i holds the full path.
for /r %sourcePath% %%i in (*.log) do (
logparser -i:iisw3c -o:csv "SELECT cs-uri-stem,cs-uri-query, sc-status,c-ip,time-taken,TO_TIMESTAMP(date,time) as TimeStamp into 'D:\Log\output\119\%%~ni.csv' FROM input\119log\%%~ni.log"
)
:: Keep the console window open so the operator can read Log Parser's output.
pause
\ No newline at end of file
林老师您好:
林老师您好:
我们想统计一下12月(12月1日0点~~1月1日0点)的接口数据,能不能麻烦您帮助提供。(和上次统计11月份的统计内容相同)
1.复制:HIVE数据,执行夏阳夏老师的SQL语句,导出文件。
提取点击记录.sql
提取推荐文章是哪些,是否深度阅读.sql
2.复制:3台机器下IIS的日志。
3.在ES中查询可推荐文章条数。(按照下面步骤选择即可)
3.1打开TAB页面ES-基本查询。
3.2搜索 palas_v3的文档,查询条件为 must、 items.pubDate、range、 gt、2018-12-01
3.3得到命中数字--即为条数。
...@@ -152,20 +152,11 @@ class poc: ...@@ -152,20 +152,11 @@ class poc:
with open(dir + '/' + file, 'r', encoding='utf-8') as fp: with open(dir + '/' + file, 'r', encoding='utf-8') as fp:
for line in fp.readlines(): for line in fp.readlines():
self.textlist.append(re.sub('\n', '', line)) self.textlist.append(re.sub('\n', '', line))
# ============================================================================= # self.textlist = [i for i in self.textlist if i != '']
# self.textlist = [i for i in self.textlist if i != '']
# =============================================================================
self.file_name = re.sub('.html', '', file) self.file_name = re.sub('.html', '', file)
self.data = { self.data = {
# ============================================================================= '文件名': re.sub('.html', '', file),
# =============================================================================
# '文件名': re.sub('.html', '', file),
# =============================================================================
# =============================================================================
'发行人': self.issuer(), '发行人': self.issuer(),
# =============================================================================
# '发行人': file.split('20')[0],
# =============================================================================
'主承销商': self.lead_underwriter(), '主承销商': self.lead_underwriter(),
'联席主承销商': self.joint_lead_underwriter(), '联席主承销商': self.joint_lead_underwriter(),
'发行金额(元)': self.issue_amount(), '发行金额(元)': self.issue_amount(),
...@@ -174,32 +165,28 @@ class poc: ...@@ -174,32 +165,28 @@ class poc:
'发行日': self.issue_date(), '发行日': self.issue_date(),
'起息日': self.value_date(), '起息日': self.value_date(),
} }
print(self.data['起息日'])
def issuer(self): def issuer(self):
iii=self.textlist[0] iii=self.textlist[0].split(',')[-1].split(':')[-1]
try: try:
companyname=re.findall('.*?公司',iii)[0] companyname=re.findall('.*?公司',iii)[0]
except Exception as e: except Exception:
try: try:
companyname=re.findall('.*?公司',self.textlist[1])[0] companyname=re.findall('.*?公司',self.textlist[1])[0]
except Exception as e: except Exception:
try: try:
companyname=re.findall('.*?公司',self.textlist[2])[0] companyname=re.findall('.*?公司',self.textlist[2])[0]
except Exception as e: except Exception:
if file.split('20')[0]: if file.split('20')[0]:
companyname=file.split('20')[0] companyname=file.split('20')[0]
else: else:
print ('') print ('')
return str(companyname) return str(companyname).split(',')[-1].split(':')[-1]
def lead_underwriter(self): def lead_underwriter(self):
def normalize(text): def normalize(text):
ret = re.findall('.*公司|.*银行', text)[0] ret = re.findall('.*公司|.*银行', text)[0]
# ============================================================================= # ret = ret .split('联席主承销商')[0]
# ret = ret .split('联席主承销商')[0]
# =============================================================================
ret = ret.split('\n')[-1] ret = ret.split('\n')[-1]
ret = ret.split('二')[0] ret = ret.split('二')[0]
ret = ret.split('人')[-1] ret = ret.split('人')[-1]
...@@ -208,8 +195,8 @@ class poc: ...@@ -208,8 +195,8 @@ class poc:
ret = ret. split('即')[-1] ret = ret. split('即')[-1]
ret = ret. split(' ')[-1] ret = ret. split(' ')[-1]
ret = ret. split(':')[-1] ret = ret. split(':')[-1]
ret= ret. split ('主承销商')[-1] ret = ret. split ('主承销商')[-1]
ret= ret. split ('为')[-1] ret = ret. split ('为')[-1]
return ret return ret
mainpos = locate('目', self.textlist) mainpos = locate('目', self.textlist)
...@@ -225,11 +212,13 @@ class poc: ...@@ -225,11 +212,13 @@ class poc:
ret = re.findall('主承销商.*?公司|主承销商.*?银行', text)[0] ret = re.findall('主承销商.*?公司|主承销商.*?银行', text)[0]
if re.search('联席主承销商',ret): if re.search('联席主承销商',ret):
continue continue
if '《' in ret or '》' in ret:
continue
ret = ret.split(':')[-1] ret = ret.split(':')[-1]
ret = normalize(ret) ret = normalize(ret)
if len(ret)<=4: if len(ret)<=4:
return '' return ''
return ret return ret.split(',')[-1].split(':')[-1]
for n, text in enumerate(self.textlist[:mainpos]): for n, text in enumerate(self.textlist[:mainpos]):
if re.search('主承销商', text): if re.search('主承销商', text):
...@@ -237,7 +226,7 @@ class poc: ...@@ -237,7 +226,7 @@ class poc:
if ret: if ret:
if re.search('联席主承销商',ret[0]): if re.search('联席主承销商',ret[0]):
continue continue
if re.sub('\(|\)|(|)','',normalize(ret[0])) ==re.sub('\(|\)|(|)','',str(self.issuer())): if re.sub(r'\(|\)|(|)','',normalize(ret[0])) == re.sub(r'\(|\)|(|)','',str(self.issuer())):
continue continue
if len(normalize(ret[0]))<4: if len(normalize(ret[0]))<4:
return '' return ''
...@@ -247,12 +236,12 @@ class poc: ...@@ -247,12 +236,12 @@ class poc:
if ret: if ret:
if re.search('联席主承销商',ret[0]): if re.search('联席主承销商',ret[0]):
continue continue
if re.sub('\(|\)|(|)','',normalize(ret[0])) ==re.sub('\(|\)|(|)','',str(self.issuer())): if re.sub(r'\(|\)|(|)','',normalize(ret[0])) == re.sub(r'\(|\)|(|)','',str(self.issuer())):
continue continue
if len(normalize(ret[0]))<4: if len(normalize(ret[0]))<4:
return '' return ''
return normalize(ret[0]) return normalize(ret[0])
for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]): for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+5)]) t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+5)])
if re.search('主承销商.*公司|主承销商.*银行|簿记管理人.*公司|簿记管理人.*银行', t): if re.search('主承销商.*公司|主承销商.*银行|簿记管理人.*公司|簿记管理人.*银行', t):
...@@ -261,11 +250,10 @@ class poc: ...@@ -261,11 +250,10 @@ class poc:
ret = normalize(t) ret = normalize(t)
if re.search('年度第',t): if re.search('年度第',t):
continue continue
if re.sub('\(|\)|(|)','',ret) ==re.sub('\(|\)|(|)','',str(self.issuer())): if re.sub(r'\(|\)|(|)','',ret) == re.sub(r'\(|\)|(|)','',str(self.issuer())):
continue continue
return ret return ret.split('•')[-1]
# =============================================================================
# for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]): # for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
# t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+5)]) # t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+5)])
# if re.search('簿记管理人.*公司|簿记管理人.*银行', t): # if re.search('簿记管理人.*公司|簿记管理人.*银行', t):
...@@ -275,10 +263,7 @@ class poc: ...@@ -275,10 +263,7 @@ class poc:
# if ret ==self.issuer(): # if ret ==self.issuer():
# continue # continue
# return ret # return ret
# =============================================================================
# =============================================================================
# for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]): # for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
# if re.search('主承销商', text): # if re.search('主承销商', text):
# ret = re.findall('.*?公司|.*?银行', self.textlist[n+1]) # ret = re.findall('.*?公司|.*?银行', self.textlist[n+1])
...@@ -292,7 +277,6 @@ class poc: ...@@ -292,7 +277,6 @@ class poc:
# if re.search('与主承销商签订',ret): # if re.search('与主承销商签订',ret):
# continue # continue
# return normalize(ret[0]) # return normalize(ret[0])
# =============================================================================
if not ret: if not ret:
ret = '' ret = ''
...@@ -326,64 +310,67 @@ class poc: ...@@ -326,64 +310,67 @@ class poc:
for n, text in enumerate(self.textlist[:mainpos]): for n, text in enumerate(self.textlist[:mainpos]):
if re.search('联席主承销商.*公司|联席主承销商.*银行', text): if re.search('联席主承销商.*公司|联席主承销商.*银行', text):
ret = re.findall('联席主承销商.*?公司|联席主承销商.*?银行', text)[0] ret = re.findall('联席主承销商.*?公司|联席主承销商.*?银行', text)[0]
if '《' in ret or '》' in ret:
continue
ret = ret.split(':')[-1] ret = ret.split(':')[-1]
ret = normalize(ret) ret = normalize(ret)
print (1) if re.sub(r'\(|\)|\(|\)','',ret) == re.sub(r'\(|\)|\(|\)','',str(self.issuer())):
if re.sub('\(|\)|\(|\)','',ret) ==re.sub('\(|\)|\(|\)','',str(self.issuer())):
continue continue
# ============================================================================= # if re.search('联席主承销商:', ret):
# if re.search('联席主承销商:', ret): # continue
# continue return ret.split(':')[-1]
# =============================================================================
return ret
for n, text in enumerate(self.textlist[:mainpos]): for n, text in enumerate(self.textlist[:mainpos]):
if re.search('联席主承销商', text): if re.search('联席主承销商', text):
ret = re.findall('.*?公司|.*?银行', self.textlist[n + 1]) ret = re.findall('.*?公司|.*?银行', self.textlist[n + 1])
if ret: if ret:
ret = normalize(ret[0]) ret = normalize(ret[0])
if re.sub('\(|\)|(|)','',ret) ==re.sub('\(|\)|(|)','',str(self.issuer())): if '《' in ret or '》' in ret:
continue continue
return ret if re.sub(r'\(|\)|(|)','',ret) == re.sub(r'\(|\)|(|)','',str(self.issuer())):
continue
return ret.split(':')[-1].split(',')[-1]
else: else:
ret = re.findall('.*?公司|.*?银行', self.textlist[n + 2]) ret = re.findall('.*?公司|.*?银行', self.textlist[n + 2])
if ret: if ret:
ret = normalize(ret[0]) ret = normalize(ret[0])
if re.sub('\(|\)|(|)','',ret) ==re.sub('\(|\)|(|)','',str(self.issuer())): if '《' in ret or '》' in ret:
continue
if re.sub(r'\(|\)|(|)','',ret) == re.sub(r'\(|\)|(|)','',str(self.issuer())):
continue continue
return ret return ret.split(':')[-1].split(',')[-1]
for n, text in enumerate(self.textlist[:mainpos]): for n, text in enumerate(self.textlist[:mainpos]):
t = ''.join(self.textlist[:mainpos]) t = ''.join(self.textlist[:mainpos])
if not re.search('联席主承销商', t) : if not re.search('联席主承销商', t) :
ret='' return ''
return ret
for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]): for n, text in enumerate(self.textlist[mainpos2:mainpos2+70]):
t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+6)]) t = ''.join(self.textlist[(n+mainpos2):(n+mainpos2+6)])
if len(t)<55: if len(t)<55:
t= ''.join(self.textlist[(n+mainpos2):(n+mainpos2+7)]) t= ''.join(self.textlist[(n+mainpos2):(n+mainpos2+7)])
if re.search('联席主承销商.*公司|联席主承销商.*银行', t): if re.search('联席主承销商.*公司|联席主承销商.*银行', t):
if re.search('《',t): if '《' in t or '》' in t:
continue continue
# ============================================================================= # if re.search(self.issuer(),t):
# if re.search(self.issuer(),t): # continue
# continue
# =============================================================================
ret = normalize(t) ret = normalize(t)
if re.search('银行间市场',ret): if re.search('银行间市场',ret):
continue continue
if re.sub('\(|\)|(|)','',ret) ==re.sub('\(|\)|(|)','',str(self.issuer())): if re.sub(r'\(|\)|(|)','',ret) ==re.sub(r'\(|\)|(|)','',str(self.issuer())):
continue continue
if re.search('年度第',t): if re.search('年度第',t):
continue continue
return ret return ret
if ret:
if re.sub('\(|\)|(|)','',ret) ==re.sub('\(|\)|(|)','',str(self.issuer())):
ret=''
if not ret:
ret = ''
return ret # if ret:
# if re.sub('\(|\)|(|)','',ret) == re.sub('\(|\)|(|)','',str(self.issuer())):
# ret = ''
# if not ret:
# ret = ''
# return ret
return ''
def issue_amount(self): def issue_amount(self):
...@@ -394,37 +381,38 @@ class poc: ...@@ -394,37 +381,38 @@ class poc:
ret = text ret = text
break break
else: else:
ret = ''.join(self.textlist[n:(n+5)]) ret = ''.join(self.textlist[n:(n+1)])
break break
ret = re.findall('本期.*?元|本期.*?亿|发行金额.*?元|发行金额.*?亿', ret) ret = re.findall('本期.*?元|本期.*?亿|发行金额.*?元|发行金额.*?亿', ret)
if ret: if ret:
ret = ret[0] ret = ret[0]
else: else:
ret = '' ret = ''
ret = re.sub('[:,,:“”【】 为]', '', ret) ret = re.sub('[:,,:“”【】 为"]', '', ret)
if re.search('[0-9]+', ret): if re.search(r'\d+', ret):
if '亿' in ret: d_tmp = 1
ret = re.sub(u'[\u4e00-\u9fff]', '', ret) for ri in CN_UNIT:
try: if ri in ret:
ret = str(float(ret) * 100000000) d_tmp *= CN_UNIT[ri]
except: ret = re.sub(u'[\u4e00-\u9fff]', '', ret)
ret = '' try:
else: ret = str(float(ret) * d_tmp)
ret = re.sub(u'[\u4e00-\u9fff]', '', ret) except:
if len(ret) > 12: ret = ''
ret = '' if len(ret) > 12:
ret = ''
else: else:
ret = re.findall(r'[一壹二贰三叁四肆五伍六陆七柒八捌九玖十拾百佰千仟万萬亿億]+', ret) ret = re.findall(r'[一壹二贰三叁四肆五伍六陆七柒八捌九玖十拾百佰千仟万萬亿億]+', ret)
if ret: if ret:
ret = ret[0] ret = ret[0]
else: else:
ret = '' ret = ''
if not re.search('[0-9]+', ret): if not re.search(r'\d+', ret):
try: try:
ret = str(chinese_to_arabic(ret)) ret = str(chinese_to_arabic(ret))
except: except:
pass pass
if re.match(r'\d+',ret): if re.match(r'\d+',ret):
return str(round(float(ret),2)) return str(round(float(ret),2))
else: else:
return ret return ret
...@@ -435,7 +423,7 @@ class poc: ...@@ -435,7 +423,7 @@ class poc:
if re.search('期限|发行期限', text): if re.search('期限|发行期限', text):
if re.search('还本付息', text): if re.search('还本付息', text):
continue continue
ret = ''.join(self.textlist[n:(n+4)]) ret = ''.join(self.textlist[n:(n+1)])
break break
ret = re.findall('期限.*?[天年N]', ret) ret = re.findall('期限.*?[天年N]', ret)
if ret: if ret:
...@@ -444,8 +432,11 @@ class poc: ...@@ -444,8 +432,11 @@ class poc:
ret += '年' ret += '年'
else: else:
ret = '' ret = ''
ret = re.sub('[:,,:“”【】 为]', '', ret) ret = re.sub('[:,,:“”【】 为"]', '', ret)
return ret if len(ret) < 10:
return ret
else:
return ''
def face_value(self): def face_value(self):
_num = r'((\d+\.?\d*)|([十拾]?[一壹二贰三叁四肆五伍六陆七柒八捌九玖][十拾百佰千仟万萬亿億]?))' _num = r'((\d+\.?\d*)|([十拾]?[一壹二贰三叁四肆五伍六陆七柒八捌九玖][十拾百佰千仟万萬亿億]?))'
...@@ -453,10 +444,11 @@ class poc: ...@@ -453,10 +444,11 @@ class poc:
_reg = r'(面值\S{0,10}\s*{})|(票面金额\S{0,10}\s*{})'.replace('{}', _rmb) _reg = r'(面值\S{0,10}\s*{})|(票面金额\S{0,10}\s*{})'.replace('{}', _rmb)
ret = '' ret = ''
for n, txt in enumerate(self.textlist): for n, txt in enumerate(self.textlist):
text = ''.join(self.textlist[n:(n+5)]) text = ''.join(self.textlist[n:(n+1)])
tmp = re.search(_reg, text) tmp = re.search(_reg, text)
if tmp: if tmp:
if '每股面值' not in text and '工具面值' not in text and '应收账款面值' not in text and '应收款面值' not in text and '认购单位' not in text and '按面值' not in text and '面值总' not in text and '面值)' not in text: # print(tmp.group(0), text)
if '每股面值' not in text and '工具面值' not in text and '应收账款面值' not in text and '应收款面值' not in text and '认购单位' not in text and '按面值' not in text and '面值总' not in text and '面值)' not in text and '面值)' not in text:
ret = re.search(_num, tmp.group(0)).group(0) ret = re.search(_num, tmp.group(0)).group(0)
break break
if len(ret) > 0 and ret[0] in ['十','拾']: if len(ret) > 0 and ret[0] in ['十','拾']:
...@@ -489,14 +481,14 @@ class poc: ...@@ -489,14 +481,14 @@ class poc:
if re.search('还本付息', text): if re.search('还本付息', text):
continue continue
if (len(text) > 40) & (('2018' not in text) or ('月' not in text) or ('日' not in text)): if (len(text) > 40) and (('2018' not in text) or ('月' not in text) or ('日' not in text)):
continue continue
if re.search('披露|安排|规定|基本|存续', text): if re.search('披露|安排|规定|基本|存续', text):
continue continue
exam_con=''.join(self.textlist[(n):(n+5)]) exam_con=''.join(self.textlist[(n):(n+5)])
if re.search('起息', exam_con) is False: if re.search('起息', exam_con) is False:
continue continue
t = ''.join(self.textlist[(n):(n+2)]) t = ''.join(self.textlist[n:n+2])
##提取当前行及下面两行 ##提取当前行及下面两行
if re.search('中国货币网', t): if re.search('中国货币网', t):
continue continue
...@@ -507,28 +499,27 @@ class poc: ...@@ -507,28 +499,27 @@ class poc:
ret = t[(start+1):end] ret = t[(start+1):end]
try: try:
if end < len(t) and ((t[end] == '-') or (t[end] == '至') or (t[end] == '、') or (t[end] == '—') or (t[end] == '及')): if end < len(t) and ((t[end] == '-') or (t[end] == '至') or (t[end] == '、') or (t[end] == '—') or (t[end] == '及')):
a = re.findall('[^\-至、—]*?日', t[(end+1):]) a = re.findall(r'[^\-至、—]*?日', t[(end+1):])
if not a: if not a:
ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret) ret = re.sub(r':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
print ('aa') # print ('aa')
return ret return ret
ret = ret+'至'+a[0] ret = ret+'至'+a[0]
# ============================================================================= # if re.match([0-9],ret) is False:
# if re.match([0-9],ret) is False: # return ''
# return '' ret = re.sub(r':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
# =============================================================================
ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
tmp_file_name = self.file_name tmp_file_name = self.file_name
tmp_file_name = re.sub(r'\(.{2}\)','',tmp_file_name) tmp_file_name = re.sub(r'\(.{2}\)','',tmp_file_name)
tmp_file_name = re.sub(r'(.{2})','',tmp_file_name) tmp_file_name = re.sub(r'(.{2})','',tmp_file_name)
tmp_file_name = re.sub(r'\.txt','',tmp_file_name) tmp_file_name = re.sub(r'\.txt','',tmp_file_name)
tmp_file_name = re.sub(r'期','',tmp_file_name) tmp_file_name = re.sub(r'期','',tmp_file_name)
if tmp_file_name in ret: # if tmp_file_name in ret:
print('########') # print('########')
ret = ret.replace(tmp_file_name,'') ret = ret.replace(tmp_file_name,'')
ret = re.sub('起息日','',ret) ret = re.sub('起息日','',ret)
ret = ret.split(':')[-1] ret = ret.split(':')[-1]
if re.search('[0-9]+',ret): if re.search(r'\d+',ret):
ret = ret.split(',')[-1].split(':')[-1]
if ret[0] == '日': if ret[0] == '日':
return ret[1:] return ret[1:]
else: else:
...@@ -540,6 +531,7 @@ class poc: ...@@ -540,6 +531,7 @@ class poc:
else: else:
print('warning:', t) print('warning:', t)
return ret return ret
return ''
def value_date(self): def value_date(self):
...@@ -569,7 +561,7 @@ class poc: ...@@ -569,7 +561,7 @@ class poc:
# ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret) # ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
# return ret # return ret
# ret = ret+'至'+a[0] # ret = ret+'至'+a[0]
ret = re.sub(':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret) ret = re.sub(r':|【|】|期|指|〔|〕| |为|\[|\]|\(|\)|(|)|发行|变更', '', ret)
if re.findall('[0-9]+',ret): if re.findall('[0-9]+',ret):
if re.search('自',ret): if re.search('自',ret):
ret=ret.split('自')[-1] ret=ret.split('自')[-1]
...@@ -581,6 +573,7 @@ class poc: ...@@ -581,6 +573,7 @@ class poc:
print('warning:', t) print('warning:', t)
return ret return ret
return ''
def txt_handle_interface(txt_path): def txt_handle_interface(txt_path):
from os import path from os import path
...@@ -593,10 +586,7 @@ if __name__ == '__main__': ...@@ -593,10 +586,7 @@ if __name__ == '__main__':
# single instance: # single instance:
# ============================================================================= folder='D:\\qqFILE\\result\\full'
# folder= './full'
# =============================================================================
folder= 'D:\\qqFILE\\result\\full'
filelist = os.listdir(folder) filelist = os.listdir(folder)
filelist = [file for file in filelist if re.search('.txt', file)] filelist = [file for file in filelist if re.search('.txt', file)]
# file = filelist[0] # file = filelist[0]
...@@ -607,7 +597,6 @@ if __name__ == '__main__': ...@@ -607,7 +597,6 @@ if __name__ == '__main__':
for n, file in enumerate(filelist[:]): for n, file in enumerate(filelist[:]):
# try: # try:
print(n)
p = poc(folder, file) p = poc(folder, file)
data.append(p.data) data.append(p.data)
# except Exception as err: # except Exception as err:
......
http://poc.seekxun.com/
数据库在mech.gimind.com下面的pdftojson数据库中
________过去的,已经不用这个了__________
test.seekxun.com
222.73.241.5
密码taikor
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment