Commit 1a78ef03 by mahaisong

feat:新增

parents


Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PythonApplication1", "PythonApplication1\PythonApplication1.pyproj", "{DABC086C-2EBA-4335-A981-588FEFF3445F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{DABC086C-2EBA-4335-A981-588FEFF3445F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DABC086C-2EBA-4335-A981-588FEFF3445F}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class AttributeEvaluationResult:
    """Score of one evaluated attribute plus the words that matched it.

    The constructor stores its arguments unchanged; the property setters
    coerce on assignment (``str`` / ``float`` / ``list``).

    NOTE(review): this file targets Python 2, where a class that does not
    derive from ``object`` is old-style and ``@property`` setters are not
    invoked on assignment -- confirm whether ``object`` should be a base.
    """

    def __init__(self, Attribute, Value, MatchedWords):
        # Attribute name (string).
        self.__Attribute = Attribute
        # Score obtained for this attribute (float).
        self.__Value = Value
        # Words that hit this attribute (several strings).
        self.__MatchedWords = MatchedWords

    @property
    def Attribute(self):
        """Attribute name."""
        return self.__Attribute

    @Attribute.setter
    def Attribute(self, Attribute):
        self.__Attribute = str(Attribute)

    @property
    def MatchedWords(self):
        """Words that matched this attribute."""
        return self.__MatchedWords

    @MatchedWords.setter
    def MatchedWords(self, MatchedWords):
        # The original wrote `[](MatchedWords)`, which calls a list literal and
        # raises TypeError; `list(...)` performs the intended copy/coercion.
        self.__MatchedWords = list(MatchedWords)

    @property
    def Value(self):
        """Score of the attribute."""
        return self.__Value

    @Value.setter
    def Value(self, Value):
        self.__Value = float(Value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class ItemCountData:
    """View / reply / forward counters of an item and when they were fetched.

    Constructor arguments are stored as given; assigning through a property
    coerces the value (``int`` for the counters, ``str`` for ``FetchTime``).

    NOTE(review): this file targets Python 2, where a class that does not
    derive from ``object`` is old-style and ``@property`` setters are not
    invoked on assignment -- confirm whether ``object`` should be a base.
    """

    def __init__(self, ViewCount, ReplyCount, ForwardCount, FetchTime):
        self.__ViewCount = ViewCount
        self.__ReplyCount = ReplyCount
        self.__ForwardCount = ForwardCount
        # The original comment labels this value as a DateTime.
        self.__FetchTime = FetchTime

    @property
    def ViewCount(self):
        """Number of views."""
        return self.__ViewCount

    @ViewCount.setter
    def ViewCount(self, value):
        self.__ViewCount = int(value)

    @property
    def ReplyCount(self):
        """Number of replies."""
        return self.__ReplyCount

    @ReplyCount.setter
    def ReplyCount(self, value):
        self.__ReplyCount = int(value)

    @property
    def ForwardCount(self):
        """Number of forwards."""
        return self.__ForwardCount

    @ForwardCount.setter
    def ForwardCount(self, value):
        self.__ForwardCount = int(value)

    @property
    def FetchTime(self):
        """Time the counters were fetched."""
        return self.__FetchTime

    @FetchTime.setter
    def FetchTime(self, value):
        self.__FetchTime = str(value)
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys, urllib, urllib2, json, ssl
from datetime import datetime,timedelta
class ItemLabelData:
    """Label data of an item; currently only carries the issue ID.

    NOTE(review): this file targets Python 2, where a class that does not
    derive from ``object`` is old-style and ``@property`` setters are not
    invoked on assignment -- confirm whether ``object`` should be a base.
    """

    def __init__(self, IssueID):
        # Issue ID, stored exactly as passed in.
        self.__IssueID = IssueID

    @property
    def IssueID(self):
        """Issue ID."""
        return self.__IssueID

    @IssueID.setter
    def IssueID(self, value):
        # Assignment through the property stores the string form.
        self.__IssueID = str(value)
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class ItemReply:
    """A single reply attached to an item.

    Constructor arguments are stored as given; assigning through a property
    coerces the value (``int`` for ``Mark``, ``str`` for everything else).

    NOTE(review): this file targets Python 2, where a class that does not
    derive from ``object`` is old-style and ``@property`` setters are not
    invoked on assignment -- confirm whether ``object`` should be a base.
    """

    def __init__(self, ItemID, CleanText, FetchTime, PubDate, AuthorName,
                 AuthorID, Location, Source, Mark):
        self.__ItemID = ItemID          # item being replied to
        self.__CleanText = CleanText    # content
        self.__FetchTime = FetchTime    # crawl time (DateTime)
        self.__PubDate = PubDate        # publish time shown on the page (DateTime)
        self.__AuthorName = AuthorName  # author name
        self.__AuthorID = AuthorID      # author ID
        self.__Location = Location      # location of the author or the content
        self.__Source = Source          # displayed source (e.g. client device)
        self.__Mark = Mark              # star mark, multi-level (SByte)

    @property
    def ItemID(self):
        """ID of the item this reply belongs to."""
        return self.__ItemID

    @ItemID.setter
    def ItemID(self, value):
        self.__ItemID = str(value)

    @property
    def CleanText(self):
        """Reply content."""
        return self.__CleanText

    @CleanText.setter
    def CleanText(self, value):
        self.__CleanText = str(value)

    @property
    def FetchTime(self):
        """Crawl time."""
        return self.__FetchTime

    @FetchTime.setter
    def FetchTime(self, value):
        self.__FetchTime = str(value)

    @property
    def PubDate(self):
        """Publish time shown on the page."""
        return self.__PubDate

    @PubDate.setter
    def PubDate(self, value):
        self.__PubDate = str(value)

    @property
    def AuthorName(self):
        """Author name."""
        return self.__AuthorName

    @AuthorName.setter
    def AuthorName(self, value):
        self.__AuthorName = str(value)

    @property
    def AuthorID(self):
        """Author ID."""
        return self.__AuthorID

    @AuthorID.setter
    def AuthorID(self, value):
        self.__AuthorID = str(value)

    @property
    def Location(self):
        """Location of the author or the content."""
        return self.__Location

    @Location.setter
    def Location(self, value):
        self.__Location = str(value)

    @property
    def Source(self):
        """Displayed source."""
        return self.__Source

    @Source.setter
    def Source(self, value):
        self.__Source = str(value)

    @property
    def Mark(self):
        """Star mark level."""
        return self.__Mark

    @Mark.setter
    def Mark(self, value):
        self.__Mark = int(value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys, urllib, urllib2, json, ssl
from datetime import datetime,timedelta
class ItemTracking:
    """Follow-up (tracking) settings and state of an item.

    Constructor arguments are stored as given; assigning through a property
    coerces the value (``int`` for statuses, page numbers and counters,
    ``str`` for everything else).

    NOTE(review): this file targets Python 2, where a class that does not
    derive from ``object`` is old-style and ``@property`` setters are not
    invoked on assignment -- confirm whether ``object`` should be a base.
    """

    def __init__(self, FollowStatus, FollowNextTime, FollowPriority,
                 LastReplyUrl, LastReplyPageNo, LastReply_SinceID,
                 LastReplyDate, LastReplyAuthorID, LastReplyAuthorName,
                 ReplyCount, FollowCount, FollowErrorCount):
        # Status of the crawl task:
        #   0 normal, waiting for NextCrawlTime to run the crawl
        #   1 stopped (manually)
        #   2 in progress (may only be reset after LockTimeout)
        #   3 automatically flagged as error and stopped after exceeding the
        #     consecutive-error limit
        #   4 waiting in the queue
        self.__FollowStatus = FollowStatus
        # Next follow time (DateTime).
        self.__FollowNextTime = FollowNextTime
        # Priority of the follow task (sbyte).
        self.__FollowPriority = FollowPriority
        # Last tracked comment-page URL (used by PageByPage / PageByPageTimeASCE).
        self.__LastReplyUrl = LastReplyUrl
        # Last tracked page number (used by FormatStringTimeASCE).
        self.__LastReplyPageNo = LastReplyPageNo
        # ID of the comment synchronised last.
        self.__LastReply_SinceID = LastReply_SinceID
        # Date of the last reply (DateTime).
        self.__LastReplyDate = LastReplyDate
        # Author ID of the last reply.
        self.__LastReplyAuthorID = LastReplyAuthorID
        # Author name of the last reply.
        self.__LastReplyAuthorName = LastReplyAuthorName
        # Number of comments already stored.
        self.__ReplyCount = ReplyCount
        # Number of follow runs (the initial crawl is not counted).
        self.__FollowCount = FollowCount
        # Number of follow errors.
        self.__FollowErrorCount = FollowErrorCount

    @property
    def FollowStatus(self):
        """Status code of the crawl task."""
        return self.__FollowStatus

    @FollowStatus.setter
    def FollowStatus(self, value):
        self.__FollowStatus = int(value)

    @property
    def FollowNextTime(self):
        """Next follow time."""
        return self.__FollowNextTime

    @FollowNextTime.setter
    def FollowNextTime(self, value):
        self.__FollowNextTime = str(value)

    @property
    def FollowPriority(self):
        """Priority of the follow task."""
        return self.__FollowPriority

    @FollowPriority.setter
    def FollowPriority(self, value):
        self.__FollowPriority = int(value)

    @property
    def LastReplyUrl(self):
        """Last tracked comment-page URL."""
        return self.__LastReplyUrl

    @LastReplyUrl.setter
    def LastReplyUrl(self, value):
        self.__LastReplyUrl = str(value)

    @property
    def LastReplyPageNo(self):
        """Last tracked page number."""
        return self.__LastReplyPageNo

    @LastReplyPageNo.setter
    def LastReplyPageNo(self, value):
        self.__LastReplyPageNo = int(value)

    @property
    def LastReply_SinceID(self):
        """ID of the comment synchronised last."""
        return self.__LastReply_SinceID

    @LastReply_SinceID.setter
    def LastReply_SinceID(self, value):
        self.__LastReply_SinceID = str(value)

    @property
    def LastReplyDate(self):
        """Date of the last reply."""
        return self.__LastReplyDate

    @LastReplyDate.setter
    def LastReplyDate(self, value):
        self.__LastReplyDate = str(value)

    @property
    def LastReplyAuthorID(self):
        """Author ID of the last reply."""
        return self.__LastReplyAuthorID

    @LastReplyAuthorID.setter
    def LastReplyAuthorID(self, value):
        self.__LastReplyAuthorID = str(value)

    @property
    def LastReplyAuthorName(self):
        """Author name of the last reply."""
        return self.__LastReplyAuthorName

    @LastReplyAuthorName.setter
    def LastReplyAuthorName(self, value):
        self.__LastReplyAuthorName = str(value)

    @property
    def ReplyCount(self):
        """Number of comments already stored."""
        return self.__ReplyCount

    @ReplyCount.setter
    def ReplyCount(self, value):
        self.__ReplyCount = int(value)

    @property
    def FollowCount(self):
        """Number of follow runs."""
        return self.__FollowCount

    @FollowCount.setter
    def FollowCount(self, value):
        self.__FollowCount = int(value)

    @property
    def FollowErrorCount(self):
        """Number of follow errors."""
        return self.__FollowErrorCount

    @FollowErrorCount.setter
    def FollowErrorCount(self, value):
        self.__FollowErrorCount = int(value)
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys, urllib, urllib2, json, ssl
from datetime import datetime,timedelta
class ResearchReport:
    """Research-report metadata: report type and issuing agency.

    NOTE(review): this file targets Python 2, where a class that does not
    derive from ``object`` is old-style and ``@property`` setters are not
    invoked on assignment -- confirm whether ``object`` should be a base.
    """

    def __init__(self, ReportType, ReportAgency):
        # Both values are stored exactly as passed in.
        self.__ReportType = ReportType      # research report type
        self.__ReportAgency = ReportAgency  # research report agency

    @property
    def ReportType(self):
        """Research report type."""
        return self.__ReportType

    @ReportType.setter
    def ReportType(self, value):
        self.__ReportType = str(value)

    @property
    def ReportAgency(self):
        """Research report agency."""
        return self.__ReportAgency

    @ReportAgency.setter
    def ReportAgency(self, value):
        self.__ReportAgency = str(value)
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class Structured:
    """Structured information extracted from a text.

    Constructor arguments are stored as given; assigning through a property
    stores ``str(value)``.

    NOTE(review): this file targets Python 2, where a class that does not
    derive from ``object`` is old-style and ``@property`` setters are not
    invoked on assignment -- confirm whether ``object`` should be a base.
    """

    def __init__(self, StructId, Category, Subject, Keypoint, MarketImpact):
        self.__StructId = StructId          # structure ID, related to the category
        self.__Category = Category          # structure category
        self.__Subject = Subject            # article subject (only one for now)
        self.__Keypoint = Keypoint          # key point of the content
        self.__MarketImpact = MarketImpact  # market impact

    @property
    def StructId(self):
        """Structure ID."""
        return self.__StructId

    @StructId.setter
    def StructId(self, value):
        self.__StructId = str(value)

    @property
    def Category(self):
        """Structure category."""
        return self.__Category

    @Category.setter
    def Category(self, value):
        self.__Category = str(value)

    @property
    def Subject(self):
        """Article subject."""
        return self.__Subject

    @Subject.setter
    def Subject(self, value):
        self.__Subject = str(value)

    @property
    def Keypoint(self):
        """Key point of the content."""
        return self.__Keypoint

    @Keypoint.setter
    def Keypoint(self, value):
        self.__Keypoint = str(value)

    @property
    def MarketImpact(self):
        """Market impact."""
        return self.__MarketImpact

    @MarketImpact.setter
    def MarketImpact(self, value):
        self.__MarketImpact = str(value)
\ No newline at end of file
++ "a/4.\346\216\245\345\217\243\351\205\215\345\220\210\345\255\220\345\245\207\345\267\245\344\275\234/PythonApplication1/PythonApplication1/Model/__init__.py"
基本数据结构定义说明:
基本数据结构定义说明:
1.在PythonApplication1中每次访问获得的100条都会放在ItemList中。
2.ItemList内部会有100个Item类对象。
3.Item类对象中含有基本属性字段、1个ItemAnalyzeDataList列表(集合), ItemAnalyzeDataList中包含多条AnalyzeData
4.AnalyzeData类对象中含有基本属性字段
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
class Person():
    """Holder of five class-level string fields, all defaulting to ``""``.

    The double-underscore names are name-mangled by Python (for example
    ``_Person__StructId``), so they are not reachable under their plain names
    from outside the class.
    """

    # Structure ID, related to the category.
    __StructId = ""

    # Structure category.
    __Category = ""

    # Article subject (only one is chosen for now).
    __Subject = ""

    # Key point of the content.
    __Keypoint = ""

    # Market impact.
    __MarketImpact = ""
#!/usr/bin/python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys, time, json
from datetime import datetime,timedelta
sys.path.append(r'C:\\Users\\admin\\Documents\\Visual Studio 2015\\Projects\\PythonApplication1\\PythonApplication1')
from taikoropensdk import TaikorOauthClient
from Model.Item import *
from Model.AnalyzeData import *
from Person import Person
# This script is meant to end up wrapped in a function that is invoked
# periodically at a fixed interval.

# 1. Basic configuration.
# SECURITY NOTE(review): the application secret below is committed in plain
# text -- consider loading it from the environment or an untracked config file
# and rotating the exposed value.
giUserId = "TU20161027161331711608"
giAppSecert = "46af9da375b007aa20e49b0391bff871b607f2d8e068fac0e58ba9f33ecd749f9e3205761e5e5167cf38170c6c94b7730a026d3d33d40b77a80ef6f2fcd3e84a526f8d38de0c82678d5bfdb2407e089ee17715491e746adfbc6d33998e7fafa658537331366c46e11c8c46b91446bc6d38ff59fa0cbe65b8486fafe14a7f5ab1"
giSyncIssueID = "GuoHai"
giSource = "Q38APDrtEfU="
giBaseUrl = "http://180.153.146.60:7978/"
giOnlyHasCopyright = "false"
skiphour = 2  # width, in hours, of the [stime, etime] query window

# 2. Create the client (original comment: passes the permission check and
#    obtains a token).
try:
    client = TaikorOauthClient(giUserId, giAppSecert)
except Exception as err:
    # Same output as the former `print 1,err`, in a form that also parses on
    # Python 3.
    print("%s %s" % (1, err))
else:
    # 3. No error: build the request parameters.
    #
    # A timedelta is instantiated below; mind its positional parameter order:
    # timedelta([days[, seconds[, microseconds[, milliseconds[, minutes[,
    #            hours[, weeks]]]]]]])
    i = 0        # index of the page being requested
    step = 1000  # number of records requested per call

    # Query window as Unix timestamps: from `skiphour` hours ago until now.
    etime = int(time.time())
    print(etime)
    stime = int(time.mktime((datetime.now() - timedelta(hours=skiphour)).timetuple()))
    print(stime)

    parameters = {'Source': giSource, 'Stime': stime, 'Etime': etime, 'Count': 0, 'Skip': 0}
    if giOnlyHasCopyright.upper() == "TRUE":
        # Was `true`, an undefined name that raised NameError as soon as the
        # copyright filter was switched on.
        parameters.setdefault("HasCopyright", True)
    parameters["Count"] = step

    while True:
        # Request the next page.
        parameters["Skip"] = i * step
        resultJson = client.httpGet("Search/FullContents", parameters)
        if not resultJson:
            break

        # Turn the records of this response into objects appended to ItemList.
        ItemList = []
        client.handleItemList(resultJson, ItemList)
        if not ItemList:
            break

        i = i + 1
        testkankan = ItemList[0]  # first item, kept around for inspection
        print(i)

        # Analysis code goes here.
        # Pseudo-code: for item in ItemList: analyse each item ...
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>dabc086c-2eba-4335-a981-588feff3445f</ProjectGuid>
<ProjectHome>.</ProjectHome>
<StartupFile>PythonApplication1.py</StartupFile>
<SearchPath>
</SearchPath>
<WorkingDirectory>.</WorkingDirectory>
<OutputPath>.</OutputPath>
<Name>PythonApplication1</Name>
<RootNamespace>PythonApplication1</RootNamespace>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
<DebugSymbols>true</DebugSymbols>
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)' == 'Release' ">
<DebugSymbols>true</DebugSymbols>
<EnableUnmanagedDebugging>false</EnableUnmanagedDebugging>
</PropertyGroup>
<ItemGroup>
<Compile Include="Model\AnalyzeData.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\AttributeEvaluationResult.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ItemCountData.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ItemLabelData.py" />
<Compile Include="Model\ItemReply.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ItemTracking.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\Structured.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\ResearchReport.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Model\__init__.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="Person.py">
<SubType>Code</SubType>
</Compile>
<Compile Include="PythonApplication1.py" />
<Compile Include="Model\Item.py" />
<Compile Include="taikoropensdk.py" />
<Compile Include="Tool.py">
<SubType>Code</SubType>
</Compile>
</ItemGroup>
<ItemGroup>
<Folder Include="Model\" />
</ItemGroup>
<ItemGroup>
<Content Include="Model\说明.txt" />
</ItemGroup>
<PropertyGroup>
<VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">10.0</VisualStudioVersion>
<PtvsTargetsFile>$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets</PtvsTargetsFile>
</PropertyGroup>
<Import Condition="Exists($(PtvsTargetsFile))" Project="$(PtvsTargetsFile)" />
<Import Condition="!Exists($(PtvsTargetsFile))" Project="$(MSBuildToolsPath)\Microsoft.Common.targets" />
<!-- Uncomment the CoreCompile target to enable the Build command in
Visual Studio and specify your pre- and post-build commands in
the BeforeBuild and AfterBuild targets below. -->
<!--<Target Name="CoreCompile" />-->
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
</Project>
\ No newline at end of file
#/usr/bin/python2
#/usr/bin/python2
#encoding=utf-8
import sys
import urllib
import urllib2
import json
import ssl
from datetime import datetime,timedelta
sys.path.append(r'C:\Users\admin\Documents\Visual Studio 2015\Projects\PythonApplication1\PythonApplication1')
from ObjectModel import ObjectModel
class Tool:
    """Helpers for pulling values out of nested dict / list structures."""

    def get_target_value(self, key, dic, tmp_list):
        """Collect the values stored under ``key`` in a nested structure.

        The search walks ``dic`` depth-first.  When a dict contains ``key``
        its value is appended and that dict is not searched any deeper;
        otherwise every dict, list or tuple among its values is searched in
        turn.

        :param key: key to look for
        :param dic: dict to search (e.g. decoded JSON data)
        :param tmp_list: list that receives the values found; mutated in place
        :return: ``tmp_list``; for backward compatibility an error message
                 string is returned instead when ``dic`` is not a dict or
                 ``tmp_list`` is not a list
        """
        # Validate the argument types.
        if not isinstance(dic, dict) or not isinstance(tmp_list, list):
            return 'argv[1] not an dict or argv[-1] not an list '

        # Membership is tested on the dict itself: `dic.keys()` builds a list
        # on Python 2 and turns the lookup into a linear scan.
        if key in dic:
            tmp_list.append(dic[key])
        else:
            for value in dic.values():
                if isinstance(value, dict):
                    self.get_target_value(key, value, tmp_list)
                elif isinstance(value, (list, tuple)):
                    self._get_value(key, value, tmp_list)
        return tmp_list

    # Leading underscore: private helper, not meant to be called from outside.
    def _get_value(self, key, val, tmp_list):
        """Search every element of the list/tuple ``val`` for ``key``.

        Dict elements are handed to :meth:`get_target_value`; nested lists and
        tuples are walked recursively; other elements are ignored.
        """
        for element in val:
            if isinstance(element, dict):
                self.get_target_value(key, element, tmp_list)
            elif isinstance(element, (list, tuple)):
                self._get_value(key, element, tmp_list)
\ No newline at end of file
[
[
{
"UpdateTime": "0001-01-01T00:00:00",
"IndexTime": "0001-01-01T00:00:00",
"ContentDetailLevel": "内容详细程度",
"ItemID": "文章ID",
"ClientItemID": "客户系统或对方网站的ItemID",
"ParentItemID": "父级ItemID",
"Url": "文章Url",
"CleanTitle": "标题",
"ShortTitle": "短标题",
"CleanText": "正文",
"Summary": "高达81%的受访企业表示在未来三年有明确增员计划,该比例较上一年大幅上涨24个百分点,其中预计人员增长10%以上企业比例超过1/3\r\n扩张中的企业出于战略布局的考虑,往往会在不同城市设立分部甚至变更总部所在地",
"PubDate": "页面显示发布时间",
"AuthorName": "作者名(分词索引)",
"AuthorID": "作者ID",
"AuthorTag": "Author.Tag字段,用于区分人",
"Source": "显示来源(比如使用终端等)",
"RetweetID": "转载的ID,如果有的话",
"MediaID": "媒体编号ID",
"MediaName": "21世纪经济报道",
"MediaChannel": "政经",
"Rank": " 在媒体中的排序,用以表示顺序或者重要性(例如头版、首页之类)",
"MediaType":"媒体类型",
"MediaRegionType":"媒体区域类型",
"MediaWeight": " 媒体权重",
"MediaOrganType": "媒体性质",
"MediaStyle": "媒体样式",
"MediaTendency": "媒体倾向性",
"MediaIndustryIDs": "媒体行业ID",
"MediaTag": "媒体标签",
"ProxyZone": "代理分区 >0表示要翻墙",
"ReproducedMediaID": "页面显示原始媒体名称ID",
"ReproducedMediaName": "页面显示原始媒体名称",
"ReproducedUrl": "页面显示原始媒体的Url",
"MediaCopyright": "该媒体版权;0 未知,11 黑名单,12 无版权,13 不允许转载,14 拒绝抓取,20 白名单,21 采购可转授,22 自编,23 开放可商业转载,31 采购需单独授权,32 用户自有版权,33 用户自行承担版权,41 非商业使用",
"Tracking": "跟踪设置和信息(评论)",
"Tracking_Forward": "跟踪设置和信息(转发)",
"CurrentCount": "最新的点击数、评论数、转发数",
"CountHistory": " 定时获取的点击数、评论数、转发数(存ES)",
"Reply": "回复数据",
"DuplicationID": "相似转载ID(早先入库的相似文章ID,第一篇该字段为空,其他均为第一篇的ID)",
"SpliteTitle": "分词结果",
"SpliteText": " 分词结果",
"ProsdDuplication": "是否进行过转载分析",
"Tag": "通用标签,根据各实体库中的Tag字段设置",
"ResearchReport": "研报",
"AnalyzeData语义分析结果字段": [
{
"IssueID": "议题编号",
"MergeMethod": "归并入Issue的方式",
"CategoryIDs": " 用户分类ID(多个)",
"DepartmentIDs": " 用户部门ID(多个)",
"IndustryIDs": "行业ID(多个)",
"Product": " 产品名称(多个,<品牌-系列-品名-型号-类型>路径,用空格或,;隔开,分词索引)",
"TopicIDs": "用户话题ID(多个)",
"OrganizationIDs": "组织机构ID(多个)",
"People": "用户人物名称(多个,分词索引)",
"Region": "地区名称(多个,<省-市-区县-街道村镇>路径,用空格或,;隔开,分词索引)",
"Keywords": "关键词提取",
"Evaluation": "属性评分的结果",
"MarketIDs": " 品类/市场编号",
"BlockIDs": "主题/板块编号",
"StockIDs": "股票编号",
"RelatedEventID": "相关事件编号",
"DuplicationID": "转载编号",
"以下几个标志":"已分析过的标志",
"ProsdMedia": false,
"ProsdSplite": false,
"ProsdTopic": true,
"ProsdCategory": true,
"ProsdDepartment": false,
"ProsdIndustry": true,
"ProsdProduct": true,
"ProsdPeople": false,
"ProsdOrganization": true,
"ProsdRegion": true,
"ProsdSentiment": true,
"ProsdSentiment8": false,
"ProsdSentiment2": false,
"ProsdKeyword": false,
"prosdIG": false,
"Sentiment": "情感分析:正负情感(1 ~ 3)1:正面;2:中性;3:负面",
"EmotionHappy": "高兴",
"EmotionAccept": 0,
"EmotionSurprise": 0,
"EmotionFear": 0,
"EmotionSad": 0,
"EmotionHate": 0,
"EmotionExpect": 0,
"EmotionAngry": 0,
"EmotionPositive": 0,
"EmotionNegative": 0,
"IG": null
}
],
"LabelData": " 语料标注字段",
"IsOriginalDeleted": "原文是否已被删除",
"IsEPRPublished": "是否是epr公司发布的软广",
"Struct": "文本结构化信息"
}
]
This source diff could not be displayed because it is too large. You can view the blob instead.
/// <summary>
/// Semantic analysis result of an item (may differ per Issue).
/// </summary>
[Serializable]
public class ItemAnalyzeData
{
/// <summary>
/// Issue ID.
/// </summary>
public string IssueID { get; set; }
/// <summary>
/// How the item was merged into the Issue.
/// </summary>
public Enums.MergeMethod MergeMethod { get; set; }
/// <summary>
/// User category IDs (multiple).
/// </summary>
public string[] CategoryIDs { get; set; }
/// <summary>
/// User department IDs (multiple).
/// </summary>
public string[] DepartmentIDs { get; set; }
/// <summary>
/// Industry IDs (multiple).
/// </summary>
public string[] IndustryIDs { get; set; }
/// <summary>
/// Product names (multiple; "brand-series-product name-model-type" path, separated by spaces or ,; and indexed by word segmentation).
/// "type" is a newly added special field and has no inheritance relation with the first four.
/// </summary>
public string[] Product { get; set; }
/// <summary>
/// User topic IDs (multiple).
/// </summary>
public string[] TopicIDs { get; set; }
/// <summary>
/// Organization IDs (multiple).
/// </summary>
public string[] OrganizationIDs { get; set; }
/// <summary>
/// User person names (multiple, indexed by word segmentation).
/// </summary>
public string[] People { get; set; }
/// <summary>
/// Region names (multiple; "province-city-district/county-street/village/town" path, separated by spaces or ,; and indexed by word segmentation).
/// </summary>
public string[] Region { get; set; }
/// <summary>
/// Extracted keywords.
/// </summary>
public string[] Keywords { get; set; }
/// <summary>
/// Results of attribute scoring.
/// </summary>
public AttributeEvaluationResult[] Evaluation { get; set; }
/// <summary>
/// Category / market IDs.
/// </summary>
public string[] MarketIDs { set; get; }
/// <summary>
/// Theme / block (sector) IDs.
/// </summary>
public string[] BlockIDs { set; get; }
/// <summary>
/// Stock IDs.
/// </summary>
public string[] StockIDs { set; get; }
/// <summary>
/// Related event ID.
/// </summary>
public string RelatedEventID { set; get; }
/// <summary>
/// Reprint (duplication) ID.
/// </summary>
public string DuplicationID { set; get; }
/// <summary>
/// "Already analyzed" flag.
/// NOTE(review): the original comment sits only on this property; presumably it describes every Prosd* flag below as well - confirm.
/// </summary>
public bool ProsdMedia { get; set; }
public bool ProsdSplite { get; set; }
public bool ProsdTopic { get; set; }
public bool ProsdCategory { get; set; }
public bool ProsdDepartment { get; set; }
public bool ProsdIndustry { get; set; }
public bool ProsdProduct { get; set; }
public bool ProsdPeople { get; set; }
public bool ProsdOrganization { get; set; }
public bool ProsdRegion { get; set; }
public bool ProsdSentiment { get; set; }
public bool ProsdSentiment8 { get; set; }
public bool ProsdSentiment2 { get; set; }
public bool ProsdKeyword { get; set; }
public bool prosdIG { get; set; }
#region 情感分析
/// <summary>
/// Positive/negative sentiment (1 ~ 3): 1 positive; 2 neutral; 3 negative.
/// </summary>
public decimal Sentiment { get; set; }
/// <summary>
/// Happy.
/// </summary>
public decimal EmotionHappy { get; set; }
/// <summary>
/// EmotionAccept
/// </summary>
public decimal EmotionAccept { get; set; }
/// <summary>
/// EmotionSurprise
/// </summary>
public decimal EmotionSurprise { get; set; }
/// <summary>
/// EmotionFear
/// </summary>
public decimal EmotionFear { get; set; }
/// <summary>
/// EmotionSad
/// </summary>
public decimal EmotionSad { get; set; }
/// <summary>
/// EmotionHate
/// </summary>
public decimal EmotionHate { get; set; }
/// <summary>
/// EmotionExpect
/// </summary>
public decimal EmotionExpect { get; set; }
/// <summary>
/// EmotionAngry
/// </summary>
public decimal EmotionAngry { get; set; }
/// <summary>
/// EmotionPositive
/// </summary>
public decimal EmotionPositive { get; set; }
/// <summary>
/// EmotionNegative
/// </summary>
public decimal EmotionNegative { get; set; }
/// <summary>
/// IG Tree
/// </summary>
public string IG { set; get; }
#endregion 情感分析
/// <summary>
/// Creates an instance with every property left at its default value.
/// </summary>
public ItemAnalyzeData()
{
}
/// <summary>
/// Creates an instance with the given issue ID and merge method.
/// </summary>
/// <param name="IssueID">Value assigned to <see cref="IssueID"/>.</param>
/// <param name="MergeMethod">Value assigned to <see cref="MergeMethod"/>.</param>
public ItemAnalyzeData(string IssueID, Enums.MergeMethod MergeMethod)
{
this.IssueID = IssueID;
this.MergeMethod = MergeMethod;
}
}
/// <summary>
/// Data packet of an ordinary crawler task.
/// </summary>
[Serializable]
[DataContract]
public class CrawlRecode
{
/// <summary>
/// Crawler name.
/// </summary>
[DataMember]
public string CrawlerName { get; set; }
#region Crawl包含字段
/// <summary>
/// Crawl ID
/// </summary>
[DataMember]
public string CrawlID { get; set; }
/// <summary>
/// ItemID of the item stored last.
/// </summary>
[DataMember]
public string LastItemID { get; set; }
/// <summary>
/// Publish time of the last item.
/// </summary>
[DataMember]
public DateTime? LastItemPubDate { get; set; }
/// <summary>
/// Regular expression used to match URLs.
/// </summary>
[DataMember]
public string PatternFetchUrl { get; set; }
/// <summary>
/// IssueID the crawl belongs to.
/// </summary>
[DataMember]
public string IssueID { get; set; }
/// <summary>
/// URL to crawl.
/// </summary>
[DataMember]
public string Url { get; set; }
/// <summary>
/// Crawler group, mainly used to tell crawler types apart.
/// </summary>
[DataMember]
public string CrawlerGroup { get; set; }
/// <summary>
/// Analysis flags marking which analyses are required.
/// </summary>
[DataMember]
public int AnalyzeFlag { get; set; }
/// <summary>
/// IssueIDs the crawled data has to be added to.
/// </summary>
[DataMember]
public string JoinIssueIDs { get; set; }
/// <summary>
/// Amount to crawl; defaults to 20 pages when not set.
/// </summary>
[DataMember]
public int RequiredCount { get; set; }
/// <summary>
/// Regions.
/// </summary>
[DataMember]
public string RegionIDs { get; set; }
/// <summary>
/// Categories.
/// </summary>
[DataMember]
public string CategoryIDs { get; set; }
/// <summary>
/// Departments.
/// </summary>
[DataMember]
public string DepartmentIDs { get; set; }
/// <summary>
/// Industries the crawler relates to.
/// </summary>
[DataMember]
public string CIndustryIDs { get; set; }
/// <summary>
/// Products.
/// </summary>
[DataMember]
public string ProductIDs { get; set; }
/// <summary>
/// People.
/// </summary>
[DataMember]
public string PeopleIDs { get; set; }
/// <summary>
/// Organizations.
/// </summary>
[DataMember]
public string OrganizationIDs { get; set; }
/// <summary>
/// Tags; multiple values are separated by ASCII commas.
/// </summary>
[DataMember]
public string Tags { get; set; }
#endregion
#region Site包含字段
/// <summary>
/// Site ID.
/// </summary>
[DataMember]
public string SiteID { get; set; }
/// <summary>
/// Browser type.
/// </summary>
[DataMember]
public sbyte BrowserType { get; set; }
/// <summary>
/// Safe interval between requests, in milliseconds.
/// </summary>
[DataMember]
public int IntervalMSBtwReqs { get; set; }
/// <summary>
/// Character encoding of the web page.
/// </summary>
[DataMember]
public string Encoding { get; set; }
/// <summary>
/// Number of retries after a failure.
/// </summary>
[DataMember]
public sbyte FailRetry { get; set; }
/// <summary>
/// Extraction mode.
/// </summary>
[DataMember]
public sbyte ParseMethod { get; set; }
/// <summary>
/// Pattern used to extract the list page.
/// </summary>
[DataMember]
public string ListPattern { get; set; }
/// <summary>
/// Fetch related content.
/// </summary>
[DataMember]
public string ListPatternNest { get; set; }
/// <summary>
/// Related-content mode.
/// </summary>
[DataMember]
public sbyte ListPatternNestMethod { get; set; }
/// <summary>
/// Multi-threading limit.
/// </summary>
[DataMember]
public int ParallelLimit { get; set; }
/// <summary>
/// URL pattern of the next page.
/// </summary>
[DataMember]
public string ListPatternNextPageUrl { get; set; }
/// <summary>
/// Whether matching is case-sensitive.
/// </summary>
[DataMember]
public bool UrlCaseSensitive { get; set; }
// NOTE(review): no summary in the original; presumably the content detail level - confirm.
[DataMember]
public sbyte ContentDetailLevel { get; set; }
#endregion
#region Media包含字段
/// <summary>
/// Media ID.
/// </summary>
[DataMember]
public string MediaID
{ get; set; }
/// <summary>
/// Media name.
/// </summary>
[DataMember]
public string MediaName { get; set; }
/// <summary>
/// Channel name.
/// </summary>
[DataMember]
public string Channel { get; set; }
/// <summary>
/// Media type.
/// </summary>
[DataMember]
public sbyte MediaType { get; set; }
/// <summary>
/// Media region type.
/// </summary>
[DataMember]
public sbyte RegionType { get; set; }
/// <summary>
/// Media weight.
/// </summary>
[DataMember]
public sbyte MediaWeight { get; set; }
/// <summary>
/// Nature of the media organization.
/// </summary>
[DataMember]
public sbyte MediaOrganType { get; set; }
/// <summary>
/// Media style.
/// </summary>
[DataMember]
public sbyte MediaStyle { get; set; }
/// <summary>
/// Media tendency.
/// </summary>
[DataMember]
public sbyte MediaTendency { get; set; }
/// <summary>
/// Media industry IDs.
/// </summary>
[DataMember]
public string IndustryIDs { get; set; }
/// <summary>
/// Media tag.
/// </summary>
[DataMember]
public string MediaTag { get; set; }
/// <summary>
/// Media copyright.
/// </summary>
// NOTE(review): unlike every other property of this class, this one carries no
// [DataMember] attribute - confirm whether it is deliberately excluded from the
// data contract.
public int MediaCopyright { get; set; }
#endregion
}
[
[
{
"CrawlID": "抓取任务ID",
"Crawler": "爬虫",
"FetchTime": "抓取时间",
"UpdateTime": "0001-01-01T00:00:00",
"IndexTime": "0001-01-01T00:00:00",
"ContentDetailLevel": "内容详细程度",
"ItemID": "文章ID",
"ClientItemID": "客户系统或对方网站的ItemID",
"ParentItemID": "父级ItemID",
"Url": "文章Url",
"CleanTitle": "标题",
"ShortTitle": "短标题",
"CleanText": "正文",
"HTMLText": " HTML正文",
"Summary": "高达81%的受访企业表示在未来三年有明确增员计划,该比例较上一年大幅上涨24个百分点,其中预计人员增长10%以上企业比例超过1/3\r\n扩张中的企业出于战略布局的考虑,往往会在不同城市设立分部甚至变更总部所在地",
"PubDate": "页面显示发布时间",
"IP": 0,
"Location": "作者或内容的地点属性",
"PoID": "Location集合的ID,外部的(如点评)设为Url",
"PoIDSource": " LocationID的来源,_前缀表示url未解开,Location集合中没有",
"NearCBD_PoID": "接近某CBD的PoID",
"Lat": "纬度",
"Lon": "经度",
"AuthorName": "作者名(分词索引)",
"AuthorID": "作者ID",
"AuthorImg": "作者图片",
"AuthorCertificated": "实名认证状态",
"AuthorTag": "Author.Tag字段,用于区分人",
"Source": "显示来源(比如使用终端等)",
"AttachImg": "附带图片",
"AttachUrl": "附带的外部链接",
"AttachFile": "附件",
"RetweetID": "转载的ID,如果有的话",
"MediaID": "媒体编号ID",
"MediaName": "21世纪经济报道",
"MediaChannel": "政经",
"Rank": " 在媒体中的排序,用以表示顺序或者重要性(例如头版、首页之类)",
"MediaType":"媒体类型",
"MediaRegionType":"媒体区域类型",
"MediaWeight": " 媒体权重",
"MediaOrganType": "媒体性质",
"MediaStyle": "媒体样式",
"MediaTendency": "媒体倾向性",
"MediaIndustryIDs": "媒体行业ID",
"MediaTag": "媒体标签",
"ProxyZone": "代理分区 >0表示要翻墙",
"ReproducedMediaID": "页面显示原始媒体名称ID",
"ReproducedMediaName": "页面显示原始媒体名称",
"ReproducedUrl": "页面显示原始媒体的Url",
"MediaCopyright": "该媒体版权;0 未知,11 黑名单,12 无版权,13 不允许转载,14 拒绝抓取,20 白名单,21 采购可转授,22 自编,23 开放可商业转载,31 采购需单独授权,32 用户自有版权,33 用户自行承担版权,41 非商业使用",
"Tracking": "跟踪设置和信息(评论)",
"Tracking_Forward": "跟踪设置和信息(转发)",
"CurrentCount": "最新的点击数、评论数、转发数",
"CountHistory": " 定时获取的点击数、评论数、转发数(存ES)",
"Reply": "回复数据",
"DuplicationID": "相似转载ID(早先入库的相似文章ID,第一篇该字段为空,其他均为第一篇的ID)",
"SpliteTitle": "分词结果",
"SpliteText": " 分词结果",
"ProsdDuplication": "是否进行过转载分析",
"Tag": "通用标签,根据各实体库中的Tag字段设置",
"ResearchReport": "研报",
"AnalyzeData语义分析结果字段": [
{
"IssueID": "议题编号",
"MergeMethod": "归并入Issue的方式",
"CategoryIDs": " 用户分类ID(多个)",
"DepartmentIDs": " 用户部门ID(多个)",
"IndustryIDs": "行业ID(多个)",
"Product": " 产品名称(多个,<品牌-系列-品名-型号-类型>路径,用空格或,;隔开,分词索引)",
"TopicIDs": "用户话题ID(多个)",
"OrganizationIDs": "组织机构ID(多个)",
"People": "用户人物名称(多个,分词索引)",
"Region": "地区名称(多个,<省-市-区县-街道村镇>路径,用空格或,;隔开,分词索引)",
"Keywords": "关键词提取",
"Evaluation": "属性评分的结果",
"MarketIDs": " 品类/市场编号",
"BlockIDs": "主题/板块编号",
"StockIDs": "股票编号",
"RelatedEventID": "相关事件编号",
"DuplicationID": "转载编号",
"以下几个标志":"已分析过的标志",
"ProsdMedia": false,
"ProsdSplite": false,
"ProsdTopic": true,
"ProsdCategory": true,
"ProsdDepartment": false,
"ProsdIndustry": true,
"ProsdProduct": true,
"ProsdPeople": false,
"ProsdOrganization": true,
"ProsdRegion": true,
"ProsdSentiment": true,
"ProsdSentiment8": false,
"ProsdSentiment2": false,
"ProsdKeyword": false,
"prosdIG": false,
"Sentiment": "情感分析:正负情感(1 ~ 3)1:正面;2:中性;3:负面",
"EmotionHappy": "高兴",
"EmotionAccept": 0,
"EmotionSurprise": 0,
"EmotionFear": 0,
"EmotionSad": 0,
"EmotionHate": 0,
"EmotionExpect": 0,
"EmotionAngry": 0,
"EmotionPositive": 0,
"EmotionNegative": 0,
"IG": null
}
],
"ManageData管理字段": [
{
"EditorTitle": null,
"EditorText": null,
"Summary": null,
"CategoryID": null,
"IssueID": "greedc",
"IsRead": false,
"IsDeleted": false,
"IsDeletedAnalyzeData": null,
"IsReleasePrimaryUrl": false,
"AlertLevel": 1,
"MessageBox": false,
"Mark": 0,
"Tag": null,
"IsTrack": false,
"Order": "2018-04-24T17:53:26.9511106+08:00",
"Sort": 0,
"HandleStatus": 0,
"ReviewedByUserID": null,
"LastReviewedTime": "2018-04-24T17:53:26.9511106+08:00",
"ReviewedAlert": false,
"ReviewedCategory": false,
"ReviewedDepartment": false,
"ReviewedRegion": false,
"ReviewedIndustry": false,
"ReviewedProduct": false,
"ReviewedPeople": false,
"ZyyicItemData": null
}
],
"PublishData发布字段": [
{
"IssueID": "Reader",
"HTML": null,
"ToRelease": false,
"IsReleased": false,
"ReleaseTime": "0001-01-01T00:00:00",
"Level": 0,
"HasImgAtt": false,
"IsIndex": false,
"IsPic": false,
"FileName": null,
"IsDeleted": false,
"SinglePic": null,
"BigPic": "b/ca9166dbd4c3af6879dc829357933b19_big.jpg",
"YieldPic": null,
"TopPic": null,
"ManyPic": null,
"OperationName": null,
"OperationTime": "2018-04-24T17:55:04.5467767+08:00",
"DisplayTag": [
"城市",
"二线",
"写字楼",
"产业",
"房地产",
"服务商",
"高新",
"科技",
"产值"
]
}
],
"LabelData": " 语料标注字段",
"ReferredAuthorIDs": "提及(@)的用户(不含转发的@)",
"IsOriginalDeleted": "原文是否已被删除",
"IsEPRPublished": "是否是epr公司发布的软广",
"Struct": "文本结构化信息"
}
]
\ No newline at end of file
/// <summary>
/// Public-opinion management fields of an item (may differ per Issue).
/// </summary>
[Serializable]
public class ItemManageData
{
/// <summary>
/// Edited title.
/// </summary>
public string EditorTitle { get; set; }
/// <summary>
/// Edited body text.
/// </summary>
public string EditorText { get; set; }
/// <summary>
/// Summary.
/// </summary>
public string Summary { get; set; }
/// <summary>
/// Original channel.
/// </summary>
public string CategoryID { get; set; }
/// <summary>
/// Issue ID.
/// </summary>
public string IssueID { get; set; }
/// <summary>
/// Already read.
/// </summary>
public Boolean IsRead { get; set; }
/// <summary>
/// Deleted by the customer.
/// </summary>
public Boolean IsDeleted { get; set; }
/// <summary>
/// The deleted analysis content (currently used as a stand-in for the buggy delete flag).
/// </summary>
public ItemAnalyzeData IsDeletedAnalyzeData { get; set; }
/// <summary>
/// Whether to publish the link to the original article.
/// </summary>
public bool IsReleasePrimaryUrl { get; set; }
/// <summary>
/// Alert level (0 means no alert).
/// </summary>
public SByte AlertLevel { get; set; }
/// <summary>
/// Whether to pop up a notification window.
/// </summary>
public bool MessageBox { get; set; }
/// <summary>
/// Star mark (multi-level).
/// </summary>
public SByte Mark { get; set; }
/// <summary>
/// User-defined tag.
/// </summary>
public string Tag { get; set; }
/// <summary>
/// Whether the item is set to be tracked.
/// </summary>
public bool IsTrack { get; set; }
/// <summary>
/// Ordering.
/// </summary>
public DateTime Order { get; set; }
/// <summary>
/// Ordering.
/// </summary>
public int Sort { get; set; }
/// <summary>
/// Status in the handling workflow.
/// </summary>
public SByte HandleStatus { get; set; }
/// <summary>
/// UserID of the person who made the modification.
/// </summary>
public string ReviewedByUserID { get; set; }
/// <summary>
/// Time of the last update.
/// </summary>
public DateTime LastReviewedTime { get; set; }
/// <summary>
/// Whether the alert has been reviewed.
/// </summary>
public Boolean ReviewedAlert { get; set; }
/// <summary>
/// Manual-review flag for the category.
/// </summary>
public Boolean ReviewedCategory { get; set; }
/// <summary>
/// Manual-review flag for the department.
/// </summary>
public Boolean ReviewedDepartment { get; set; }
/// <summary>
/// Manual-review flag for the region.
/// </summary>
public Boolean ReviewedRegion { get; set; }
/// <summary>
/// Manual-review flag for the industry.
/// </summary>
public Boolean ReviewedIndustry { get; set; }
/// <summary>
/// Manual-review flag for the product.
/// </summary>
public Boolean ReviewedProduct { get; set; }
/// <summary>
/// Manual-review flag for person names.
/// </summary>
public Boolean ReviewedPeople { get; set; }
/// <summary>
/// Custom field for 智又盈 (Zyyic)!
/// </summary>
public ZyyicItemData ZyyicItemData { get; set; }
}
/// <summary>Publishing management attributes for an item (may differ per Issue).</summary>
[Serializable]
public class ItemPublishData
{
    /// <summary>Creates an instance; the constructor body is empty.</summary>
    public ItemPublishData() { }

    /// <summary>Issue identifier.</summary>
    public string IssueID { get; set; }

    /// <summary>HTML used for publishing.</summary>
    public string HTML { get; set; }

    /// <summary>Pending review / available for release.</summary>
    public bool ToRelease { get; set; }

    /// <summary>Marked as already released.</summary>
    public bool IsReleased { get; set; }

    /// <summary>Release time to display.</summary>
    [BsonDateTimeOptions(Kind = DateTimeKind.Local)]
    public DateTime ReleaseTime { get; set; }

    /// <summary>Article level (indicates e.g. "hot").</summary>
    public sbyte Level { get; set; }

    /// <summary>Contains image attachments.</summary>
    public bool HasImgAtt { get; set; }

    /// <summary>Whether it is set as the home page.</summary>
    public bool IsIndex { get; set; }

    /// <summary>Whether it contains images.</summary>
    public bool IsPic { get; set; }

    /// <summary>File name.</summary>
    public string FileName { get; set; }

    /// <summary>Whether it is deleted.</summary>
    public bool IsDeleted { get; set; }

    /// <summary>Single-image mode.</summary>
    public string SinglePic { get; set; }

    /// <summary>Large-image mode, 552*396.</summary>
    public string BigPic { get; set; }

    /// <summary>Special-topic image, 395*168.</summary>
    public string YieldPic { get; set; }

    /// <summary>Headline image, 600*320.</summary>
    public string TopPic { get; set; }

    /// <summary>Multi-image mode.</summary>
    public string[] ManyPic { get; set; }

    /// <summary>Operator.</summary>
    public string[] OperationName { get; set; }

    /// <summary>Operation time.</summary>
    public DateTime OperationTime { get; set; }

    /// <summary>Tags displayed in the reader.</summary>
    public string[] DisplayTag { get; set; }
}
@property
def MediaName(self):
    """Return the stored media name."""
    return self.__MediaName

@MediaName.setter
def MediaName(self, MediaName):
    """Store ``MediaName`` after converting it with ``str()``."""
    as_text = str(MediaName)
    self.__MediaName = as_text
@property
def ItemAnalyzeDataList(self):
    """Return the stored list of analyze-data entries."""
    return self.__ItemAnalyzeDataList

@ItemAnalyzeDataList.setter
def ItemAnalyzeDataList(self, ItemAnalyzeDataList):
    """Store a new list built from the given iterable.

    ``list()`` copies the input, so later changes to the caller's
    sequence do not affect the stored value. A non-iterable argument
    raises ``TypeError``.
    """
    # ``[](...)`` tried to call a list literal and always raised
    # TypeError; ``list(...)`` is the intended conversion.
    self.__ItemAnalyzeDataList = list(ItemAnalyzeDataList)
@property
def MediaOrganType(self):
    """Return the stored media organ type."""
    return self.__MediaOrganType

@MediaOrganType.setter
def MediaOrganType(self, MediaOrganType):
    """Store ``MediaOrganType`` after converting it with ``int()``."""
    as_number = int(MediaOrganType)
    self.__MediaOrganType = as_number
\ No newline at end of file
# A single @property is required here: stacking it twice wraps a property
# object as the getter, and every read then raises TypeError.
@property
def MediaName(self):
    """Return the stored media name."""
    return self.__MediaName

@MediaName.setter
def MediaName(self, MediaName):
    """Store ``MediaName`` after converting it with ``str()``."""
    self.__MediaName = str(MediaName)
\ No newline at end of file
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment