HWCloudAI 发表于 2022/12/28 12:30:35 2022/12/28
利用图数据库研究 COVID-19 论文数据集

COVID-19 大流行的形势依然很严峻,为应对 COVID-19 的传播及其对我们的影响,AI2等提供了一份 COVID-19 开放研究数据集(CORD-19)。CORD-19 数据集是关于冠状病毒的文献集,提供了超过50万篇学术论文的相关信息。我们研究了这份数据集,并用图数据库去组织和挖掘这份数据集蕴含的信息。

针对 CORD-19,我们设计了以下的图模型。

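图模型中包含了两类点,分别是 paper 和 author,也包含了两类边,分别是 write 和 reference,它们的详细情况如下:write 边由 author 指向其撰写的 paper,reference 边由 paper 指向它所引用的 paper;paper 点带有 title、journal、publish_time 等属性,author 点以作者姓名作为点 ID。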


原始数据集位于COVID-19 开放研究数据集,根据我们设计的图模型重新组织的数据可从此处下载,这份图数据包含70万+ paper 点,173万+ author 点,67万+ reference 边,443万+ write 边。


利用华为云图数据库 GES 对以上数据集进行探索,需要先在 GES 上创图,GES 创图的详细流程见《华为云图引擎服务 GES 实战——创图》。

使用 GES 查询的预置条件

下面封装了使用 GES 查询的预置条件,包括配置相关参数和对所调用 API 接口的封装,如果你对这些不感兴趣,可直接运行而不需要了解细节,这对理解后续具体查询没有影响。


import requests
import json
import time
import pandas as pd
# Relax pandas' display limits so that query results are not truncated when printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# Use None (not -1) for an unlimited column width: -1 was deprecated in
# pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

class config(object):
    """Connection settings for a GES graph, plus an IAM token fetched on construction.

    Args:
        iam_url: Host of the IAM endpoint used to request a token.
        user_name: IAM user name.
        password: IAM user password.
        domain_name: Name of the account (domain) the IAM user belongs to.
        project_name: Project name the token is scoped to.
        eip: Public address of the GES graph instance.
        project_id: Project ID used in GES request URLs.
        graph_name: Name of the graph to query.

    Attributes:
        token: Value of the ``x-subject-token`` response header returned by
            ``get_token()``; requested once, when the object is created.
    """

    def __init__(self, iam_url, user_name, password, domain_name, project_name, eip, project_id, graph_name):
        self.iam_url = iam_url
        self.user_name = user_name
        self.password = password
        self.domain_name = domain_name
        self.project_name = project_name
        self.eip = eip
        self.project_id = project_id
        self.graph_name = graph_name
        # Performs a network request; construction fails if authentication fails.
        self.token = self.get_token()

    def _auth_body(self):
        """Return the JSON body for a password login scoped to ``project_name``."""
        return json.dumps({
            "auth": {
                "identity": {
                    "methods": ["password"],
                    "password": {
                        "user": {
                            "name": self.user_name,
                            "password": self.password,
                            "domain": {
                                "name": self.domain_name
                            }
                        }
                    }
                },
                "scope": {
                    "project": {
                        "name": self.project_name
                    }
                }
            }
        })

    def get_token(self):
        """Request a token from ``https://<iam_url>/v3/auth/tokens`` and return it.

        Returns:
            The value of the ``x-subject-token`` response header.

        Raises:
            KeyError: If the response carries no ``x-subject-token`` header
                (for example because the credentials were rejected). The
                message includes the HTTP status code and the response text.
        """
        url = 'https://{}/v3/auth/tokens'.format(self.iam_url)
        headers = {'Content-Type': 'application/json;charset=utf8'}
        # NOTE(review): verify=False disables TLS certificate verification while
        # the password is being sent. Kept for backward compatibility; consider
        # removing it if the IAM endpoint presents a valid certificate.
        r = requests.post(url=url, data=self._auth_body(), verify=False, headers=headers)
        try:
            return r.headers['x-subject-token']
        except KeyError:
            raise KeyError(
                'IAM response has no x-subject-token header (HTTP {}): {}'.format(r.status_code, r.text)
            ) from None

config 类的构造参数都是与调用 GES 服务有关的参数,依次为“终端节点”、“IAM 用户名”、“IAM 用户密码”、“IAM 用户所属账户名”、“项目名称”、“公网访问地址”、“项目ID”、“图名称”;token 值无需手动填写,由 get_token 方法在初始化时自动获取。各参数的获取方式可参考《调用 GES 服务业务面 API 相关参数的获取》。


class GESFunc(object):
    """Small client for the GES REST endpoints used in this article.

    Args:
        eip: Public address of the GES graph instance.
        project_id: Project ID placed in every request URL.
        graph_name: Name of the graph placed in every request URL.
        token: IAM token sent in the ``X-Auth-Token`` header.
    """

    def __init__(self, eip, project_id, graph_name, token):
        self.eip = eip
        self.project_id = project_id
        self.graph_name = graph_name
        self.headers = {'X-Auth-Token': token, 'Content-Type': 'application/json'}

    def _graph_url(self, suffix):
        """Return the URL of ``suffix`` under this graph's REST root."""
        return 'http://{}/ges/v1.0/{}/graphs/{}/{}'.format(self.eip, self.project_id, self.graph_name, suffix)

    @staticmethod
    def _field(response, key):
        """Return ``key`` from the JSON body of ``response``.

        Raises:
            KeyError: If the field is absent; the message carries the whole
                payload so that a service-side error is visible to the caller.
        """
        payload = response.json()
        if key not in payload:
            raise KeyError('GES response has no {!r} field: {}'.format(key, payload))
        return payload[key]

    def _create_index(self, index_name, index_type):
        """Submit an index-creation request and return the resulting job ID."""
        body = json.dumps({
            "indexName": index_name,
            "indexType": index_type,
            "hasLabel": "true",
            "indexProperty": []
        })
        r = requests.post(url=self._graph_url('indices'), data=body, headers=self.headers)
        return self._field(r, 'jobId')

    @staticmethod
    def _cypher_body(statement, parameters=None):
        """Return the JSON body that asks GES to run one Cypher statement."""
        return json.dumps({
            "statements": [
                {
                    "statement": statement,
                    "parameters": parameters or {},
                    # Rows are what format_cypher_result() and the article's
                    # queries read back from the response.
                    "resultDataContents": ["row"],
                    "includeStats": False
                }
            ]
        })

    def build_vertex_index(self):
        """Create the vertex index ``cypher_vertex_index`` and return its job ID.

        Raises:
            KeyError: If the response has no ``jobId`` field.
        """
        return self._create_index("cypher_vertex_index", "GlobalCompositeVertexIndex")

    def build_edge_index(self):
        """Create the edge index ``cypher_edge_index`` and return its job ID.

        Raises:
            KeyError: If the response has no ``jobId`` field.
        """
        return self._create_index("cypher_edge_index", "GlobalCompositeEdgeIndex")

    def get_job(self, job_id):
        """Return the decoded JSON status document of job ``job_id``.

        The payload is returned unchanged (including any ``errorCode`` field)
        so that callers can inspect it themselves.
        """
        r = requests.get(url=self._graph_url('jobs/{}/status'.format(job_id)), headers=self.headers)
        return r.json()

    def summary(self):
        """Return the ``data`` field of the graph summary, with per-label details.

        Raises:
            KeyError: If the response has no ``data`` field.
        """
        r = requests.get(url=self._graph_url('summary?label_details=true'), headers=self.headers)
        return self._field(r, 'data')

    def cypher_query(self, statement, parameters=None):
        """Run one Cypher statement and return the ``results`` list of the response.

        Args:
            statement: Cypher text to execute.
            parameters: Optional dict sent as the request's ``parameters``
                field; defaults to an empty dict, as before.
                NOTE(review): presumably usable for parameterized queries;
                confirm against the GES API reference.

        Raises:
            KeyError: If the response has no ``results`` field.
        """
        url = self._graph_url('action?action_id=execute-cypher-query')
        r = requests.post(url=url, data=self._cypher_body(statement, parameters), headers=self.headers)
        return self._field(r, 'results')

    def format_cypher_result(self, json_obj):
        """Convert the first entry of a ``cypher_query`` result into a DataFrame.

        Args:
            json_obj: List of result entries, each with ``columns`` (column
                names) and ``data`` (a list of items holding a ``row`` list).

        Returns:
            A DataFrame with one line per ``row``, labelled by ``columns``.
            Only the first entry is converted. An empty ``json_obj`` yields an
            empty DataFrame instead of ``None``.
        """
        for entry in json_obj:
            rows = [item["row"] for item in entry["data"]]
            return pd.DataFrame(rows, columns=entry["columns"])
        return pd.DataFrame()


# Fill in the connection parameters below before running.
iam_url = ''
user_name = ''
password = ''
domain_name = ''
project_name = ''
eip = ''
project_id = ''
graph_name = ''
# Bind the instance to its own name: rebinding `config` would shadow the class
# and make this cell fail with "object is not callable" when run a second time.
ges_config = config(iam_url, user_name, password, domain_name, project_name, eip, project_id, graph_name)
ges_util = GESFunc(ges_config.eip, ges_config.project_id, ges_config.graph_name, ges_config.token)
GES 支持 cypher 查询语言,后续的查询示例使用的是 cypher 查询语言。在使用 cypher 查询之前,我们先创建点索引和边索引。

def wait_for_job(job_id, max_polls=100, interval=1):
    """Poll a GES job until it finishes and return the last status document.

    Args:
        job_id: Job ID returned by an index-creation request.
        max_polls: Maximum number of status checks after the first one.
        interval: Seconds to sleep between two checks.

    Returns:
        The last document returned by ``ges_util.get_job``. Polling stops as
        soon as the document has an ``errorCode`` field or reports a finished
        status.
    """
    job_result = ges_util.get_job(job_id)
    for _ in range(max_polls):
        if 'errorCode' in job_result:
            break
        # NOTE(review): 'failed' is assumed to be the other terminal status;
        # confirm against the GES job-status API reference.
        if job_result.get('status') in ('success', 'failed'):
            break
        # Pause between checks instead of spinning.
        time.sleep(interval)
        job_result = ges_util.get_job(job_id)
    return job_result


# Create the vertex index first and wait for the job to finish.
job_id = ges_util.build_vertex_index()
job_result = wait_for_job(job_id)

# Then do the same for the edge index.
job_id = ges_util.build_edge_index()
job_result = wait_for_job(job_id)



下面,我们可以书写并运行标准的 cypher 语言,对这份图数据进行探索了。以下是部分图数据的可视化展示。


# Fetch the graph summary (vertex/edge counts per label) and pretty-print it.
result = ges_util.summary()
res_str = json.dumps(result, indent=4)
# Without this print the formatted summary is built but never shown.
print(res_str)


{
    "vertexNum": 2449792,
    "labelDetails": {
        "labelInVertex": {
            "paper": 712464,
            "author": 1737328
        },
        "labelInEdge": {
            "reference": 670627,
            "write": 4434198
        }
    },
    "edgeNum": 5104825
}


列举若干 paper 的信息:

print('查询 papers:')
# Fetch five paper titles; paper_result is kept because later queries pick
# their example titles from it.
statement = "match (n:paper) return n.title limit 5"
paper_result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(paper_result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询 papers:
0 Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia
1 Nitric oxide: a pro-inflammatory mediator in lung disease?
2 Surfactant protein-D and pulmonary host defense
3 Role of endothelin-1 in lung disease
4 Gene expression in epithelial cells in response to pneumovirus infection

查询某 paper 的 authors:

print('查询某 paper 的 authors:')
paper = paper_result[0]['data'][1]['row'][0]
# Escape backslashes and single quotes so that a title such as
# "Publisher's Note" cannot end the Cypher string literal early.
escaped_paper = paper.replace('\\', '\\\\').replace("'", "\\'")
# Authors are the author vertices with an edge pointing at the paper.
statement = "match (n:paper)<--(m:author) WHERE n.title = '" + escaped_paper + "' return id(m)"
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询某 paper 的 authors:
0 Vliet Albert van der
1 Eiserich Jason P
2 Cross Carroll E

查询某 paper 被引用的次数:

print('查询某 paper 被引用次数:')
paper = paper_result[0]['data'][2]['row'][0]
# Escape backslashes and single quotes so that a title containing an
# apostrophe cannot end the Cypher string literal early.
escaped_paper = paper.replace('\\', '\\\\').replace("'", "\\'")
# Count the paper vertices that have an edge pointing at this paper.
statement = "match (n:paper)<--(m:paper) WHERE n.title = '" + escaped_paper + "' return count(m)"
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询某 paper 被引用次数:
0 5

查询某 paper 与哪些 paper 被联合引用及其次数:

print('查询某 paper 与哪些 paper 被联合引用及其次数:')
paper = paper_result[0]['data'][3]['row'][0]
# Escape backslashes and single quotes so that a title containing an
# apostrophe cannot end the Cypher string literal early.
escaped_paper = paper.replace('\\', '\\\\').replace("'", "\\'")
# p points at both n and m, i.e. n and m appear together in p's references.
statement = ("match (n:paper)<--(p:paper)-->(m:paper) WHERE n.title = '" + escaped_paper +
             "' return m.title, count(*) as p order by p desc limit 5")
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询某 paper 与哪些 paper 被联合引用及其次数:
m.title p
0 Clinical course and risk factors for mortality of adult inpatients with COVID-19 in Wuhan, China: a retrospective cohort study 2
1 The use of corticosteroid as treatment in SARS was associated with adverse outcomes: a retrospective cohort study 1
2 Covid-19: ibuprofen should not be used for managing symptoms, say doctors and scientists 1
3 SARS-CoV2: should inhibitors of the renin-angiotensin system be withdrawn in patients with COVID-19? 1
4 Network-based drug repurposing for novel coronavirus 2019-nCoV/SARS-CoV-2 1

查询标题带关键字"Virus"的 paper 数量:

print('查询标题带关键字"Virus"的 paper 数量:')
# Count the papers whose (non-null) title contains the keyword.
statement = "MATCH (p:paper) WHERE p.title IS NOT NULL AND p.title CONTAINS('Virus') RETURN count(p)"
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询标题带关键字"Virus"的 paper 数量:
0 8354

查询标题带关键字"Virus"的 paper,并按发表日期排序:

print('查询标题带关键字"Virus"的 paper,并按发表日期排序:')
# Papers without a publish_time are excluded so that the ordering is well defined.
statement = (
    "MATCH (p:paper) WHERE p.publish_time IS NOT NULL AND p.title IS NOT NULL AND p.title "
    "CONTAINS('Virus') RETURN p.title, p.publish_time ORDER BY p.publish_time DESC LIMIT 5"
)
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询标题带关键字"Virus"的 paper,并按发表日期排序:
p.title p.publish_time
0 Rates of Coinfection Between SARS-CoV-2 and Other Respiratory Viruses in Korea 2022
1 Predict Mortality in Patients Infected with COVID-19 Virus Based on Observed Characteristics of the Patient using Logistic Regression 2021-12-31
2 P140 TeCC (TeleMedicine, Cystic Fibrosis, Corona-Virus) study in a previous telemedicine-naive centre: clinical challenges, outcomes, and user experience in the first six months of a global pandemic 2021-12-31
3 Virus structure and structure-based antivirals 2021-12-31
4 Virus-associated ribozymes and nano carriers against COVID-19. 2021-12-01

查询 paper 被引用的次数并以此排序:

print('查询 paper 被引用的次数并以此排序:')
# For each paper, count the papers pointing at it, then keep the five largest counts.
statement = "match (m:paper)<--(p:paper) with m, count(p) as citedNum return m.title, citedNum order by citedNum desc limit 5"
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询 paper 被引用的次数并以此排序:
m.title citedNum
0 Publisher's Note 5436
1 Clinical course and risk factors for mortality of adult inpatients with COVID-19 in Wuhan, China: a retrospective cohort study 3824
2 A pneumonia outbreak associated with a new coronavirus of probable bat origin 3677
3 Epidemiological and clinical characteristics of 99 cases of 2019 novel coronavirus pneumonia in Wuhan, China: a descriptive study 3091
4 A new coronavirus associated with human respiratory disease in China 1975

列举若干 author 的信息:

print('查询 authors:')
# Fetch five author ids; author_result is kept because later queries pick
# their example author from it.
statement = "match (n:author) return id(n) limit 5"
author_result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(author_result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询 authors:
0 Madani Tariq A
1 Al-Ghamdi Aisha A
2 Vliet Albert van der
3 Eiserich Jason P
4 Cross Carroll E

查询某 author 的合作者及合作次数:

print('查询某 author 的合作者及合作次数:')
author = author_result[0]['data'][0]['row'][0]
# Escape backslashes and single quotes so that an id such as "O'Brien"
# cannot end the Cypher string literal early.
escaped_author = author.replace('\\', '\\\\').replace("'", "\\'")
# Co-authors are the other author vertices pointing at the same paper.
statement = ("match (n:author)-->(p:paper)<--(m:author) WHERE id(n) = '" + escaped_author +
             "' return id(m), count(*) as p order by p desc limit 5")
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询某 author 的合作者及合作次数:
id(m) p
0 Petersen Eskild 4
1 Zumla Alimuddin 4
2 Drosten Christian 4
3 Hui David S 4
4 I Azhar Esam 4

查询某 author 论文被引用次数:

print('查询某 author 论文被引用次数:')
author = author_result[0]['data'][0]['row'][0]
# Escape backslashes and single quotes so that an id containing an
# apostrophe cannot end the Cypher string literal early.
escaped_author = author.replace('\\', '\\\\').replace("'", "\\'")
# Count the paper vertices pointing at any paper this author points at.
statement = "match (m:author)-->(p:paper)<--(n:paper) where id(m) = '" + escaped_author + "' return count(n)"
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
查询某 author 论文被引用次数:
0 9


# Heading added for consistency with the other queries in this file.
print('查询互相引用的 paper:')
# m -> p -> n with n being m itself (and p a different vertex): m and p point at each other.
statement = "match (m:paper)-->(p:paper)-->(n:paper) where id(m) <> id(p) and id(m) = id(n) return m,p limit 10"
result = ges_util.cypher_query(statement)
format_result = ges_util.format_cypher_result(result)
# Display the table; assigning it alone shows nothing when run as a script.
print(format_result)
m p
0 {'journal': 'Rev Mal Respir', 'publish_time': '2008-01-03', 'title': 'Chronique pour une pandémie grippale annoncée'} {'journal': 'Revue des Maladies Respiratoires', 'publish_time': '2004-12-31', 'title': 'La communication sur le SRAS : un outil essentiel de santé publique'}
1 {'journal': 'Intensive Care Med', 'publish_time': '2020-03-23', 'title': 'Acute respiratory distress syndrome-attributable mortality in critically ill patients with sepsis'} {'journal': 'J Biomed Sci', 'publish_time': '2003-10-18', 'title': 'Acute respiratory distress syndrome'}
2 {'journal': None, 'publish_time': '2020-11-19', 'title': 'Curvature domains in V4 of macaque monkey'} {'journal': None, 'publish_time': '2020-11-19', 'title': 'Curvature-processing domains in primate V4'}
3 {'journal': None, 'publish_time': '2020-11-19', 'title': 'Curvature-processing domains in primate V4'} {'journal': None, 'publish_time': '2020-11-19', 'title': 'Curvature domains in V4 of macaque monkey'}
4 {'journal': 'Theor Med Bioeth', 'publish_time': '2020-12-17', 'title': 'Philosophical investigations into the essence of pediatric suffering'} {'journal': 'Theor Med Bioeth', 'publish_time': '2021-01-05', 'title': 'What we talk about when we talk about pediatric suffering'}
5 {'journal': 'Theor Med Bioeth', 'publish_time': '2020-12-17', 'title': 'Philosophical investigations into the essence of pediatric suffering'} {'journal': 'Theor Med Bioeth', 'publish_time': '2020-12-07', 'title': 'Relational suffering and the moral authority of love and care'}
6 {'journal': 'Theor Med Bioeth', 'publish_time': '2020-12-17', 'title': 'Philosophical investigations into the essence of pediatric suffering'} {'journal': 'Theor Med Bioeth', 'publish_time': '2020-12-07', 'title': 'Our suffering and the suffering of our time'}
7 {'journal': 'PLoS One', 'publish_time': '2021-04-01', 'title': 'Prevalence of depression, anxiety and associated factors among school going adolescents in Bangladesh: Findings from a cross-sectional study'} {'journal': 'BMC Psychiatry', 'publish_time': '2021-05-25', 'title': 'Prevalence and correlates of anxiety and depression in frontline healthcare workers treating people with COVID-19 in Bangladesh'}
8 {'journal': 'PLoS Comput Biol', 'publish_time': '2020-09-21', 'title': 'Fast estimation of time-varying infectious disease transmission rates'} {'journal': 'PLoS Biol', 'publish_time': '2020-12-21', 'title': 'Patterns of smallpox mortality in London, England, over three centuries'}
9 {'journal': 'Commun Biol', 'publish_time': '2021-04-22', 'title': 'Structure and assembly of double-headed Sendai virus nucleocapsids'} {'journal': 'PLoS Pathog', 'publish_time': '2021-07-16', 'title': 'CryoEM structure of the Nipah virus nucleocapsid assembly'}
