Finding Occurrences of Chemicals and Drugs in PubMed Abstracts

Searches the Elasticsearch index created during the CaseOLAP pipeline run for a curated list of drugs and chemicals related to oxidative stress.

Output:

Chemical_PMID_occurances.csv: CSV table where each row is the occurrence of a chemical in a PubMed article
Drug_PMID_occurances.csv: CSV table where each row is the occurrence of a drug in a PubMed article

import ast
import json
import time
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

Load Drug and Chemical lists, initialize Elastic Search

Requires the Elasticsearch engine to be running on the cluster. The PubMed (PMID) index, queried below as `pubmed`, must already exist.

# Read the curated oxidative-stress chemical list and normalise the name
# column (lower-cased, surrounding whitespace removed) in one chained step.
chemical_list_df = pd.read_csv(
    'input/oxidative_stress_chemicals_SA_10222019.csv'
).assign(**{
    'Molecule/Enzyme/Protein': lambda df: (
        df['Molecule/Enzyme/Protein'].str.lower().str.strip()
    )
})

# Preview the first rows
chemical_list_df.head()

	Biological Events of Oxidative Stress	Molecular and Functional Categories	Molecule/Enzyme/Protein	MeSH Heading	MeSH Supplementary	MeSH tree numbers	Chemical Formula	Examples	Pharm Actions	Tree Numbers	References
0	Initiation of Oxidative	Reactive Oxygen Species (ROS)	superoxide (anion radical)	Superoxides	NaN	D01.248.497.158.685.750.850; D01.339.431.374.8...	O2-	Superoxide, Hydrogen Peroxide	Oxidants	D27.720.642,\nD27.888.569.540	PMID: 25547488
1	Initiation of Oxidative	Reactive Oxygen Species (ROS)	hydrogen peroxide	Hydrogen Peroxide	NaN	D01.248.497.158.685.750.424; D01.339.431.374.4...	H2O2	NaN	Anti-Infective Agents, Local	D27.505.954.122.187	NaN
2	Initiation of Oxidative	Reactive Oxygen Species (ROS)	NaN	NaN	NaN	NaN	NaN	NaN	Oxidants	D27.720.642,\nD27.888.569.540	NaN
3	Initiation of Oxidative	Reactive Oxygen Species (ROS)	hydroxyl (radical)	Hydroxyl Radical	NaN	D01.339.431.249; D01.248.497.158.459.300; D01....	HO	NaN	Oxidants	D27.720.642,\nD27.888.569.540	NaN
4	Initiation of Oxidative	Reactive Oxygen Species (ROS)	alpha oxygen	None listed	NaN	NaN	NaN	NaN	NaN	NaN	NaN

# Read the curated drug list and normalise the drug names (lower-cased,
# surrounding whitespace removed) in one chained step.
drug_list_df = pd.read_csv('input/drug_list_SA_10222019.csv').assign(
    Name=lambda df: df['Name'].str.lower().str.strip()
)
# Preview the first rows
drug_list_df.head()

	Drug Category	#	Name	Synonyms	MeSH Descriptor	MeSH tree(s)	Common adverse effects	Dosage (freq/amount/time/delivery)	Duration (time)	Pharm Action	...	Unnamed: 1015	Unnamed: 1016	Unnamed: 1017	Unnamed: 1018	Unnamed: 1019	Unnamed: 1020	Unnamed: 1021	Unnamed: 1022	Unnamed: 1023	Unnamed: 1024
0	Anticoagulants	1.0	heparin	['Calciparine', 'Eparina', 'heparina', 'Hepari...	heparin	D09.698.373.400	Thrombocytopenia, Cerebral haemorrhage, Haemog...	1/18U/kg/iv	2 days	Anticoagulants, \nFibrinolytic Agents	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Anticoagulants	2.0	warfarin	['4-Hydroxy-3-(3-oxo-1-phenylbutyl)coumarin', ...	warfarin	D03.383.663.283.446.520.914\nD03.633.100.150.4...	Haemorrhage, Haematoma, anaemia, Epistaxis, hy...	1/2-10mg/day/po	As needed	Anticoagulants, \nRodenticides	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	Thrombolytics	3.0	streptokinase	['Streptokinase C precursor']	streptokinase	D08.811.277.656.300.775\nD12.776.124.125.662.537	blurred vision, confusion, dizziness, fever, s...	1/1,500,000 IU/iv	60min	Fibrinolytic Agents	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	Thrombolytics	4.0	urokinase	['U-plasminogen activator', 'uPA', 'Urokinase-...	Urokinase-Type Plasminogen Activator	D08.811.277.656.300.760.910\nD08.811.277.656.9...	bleeding gums, coughing up blood, dizziness, h...	1/4,000,000U/iv	10min	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	Thrombolytics	5.0	tpa	['Alteplasa', 'Alteplase (genetical recombinat...	Tissue Plasminogen Activator	D08.811.277.656.300.760.875\nD08.811.277.656.9...	NaN	1/0.9mg/kg/iv	60min	Fibrinolytic Agents	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 1025 columns

# Elasticsearch client shared by every search below. No host is given, so the
# client library's default connection settings are used. `timeout=300` is
# passed to the client constructor (presumably seconds — confirm against the
# installed elasticsearch-py version).
es = Elasticsearch(timeout=300)

Find PMIDs associated with every combination of drugs and chemicals via elastic search

output file saved to output/chem_drug_pair_matches.csv

# All combinations of drugs and chemicals. The pairs are materialised in a
# list (instead of being left as a lazy iterator) so the progress bar can
# report the total number of pairs.
drug_chemicals = list(product(
    drug_list_df['Name'].dropna().unique(),
    chemical_list_df['Molecule/Enzyme/Protein'].dropna().unique()
))

# One result frame per (drug, chemical) pair, concatenated once after the
# loop. DataFrame.append re-copied the whole accumulated table on every
# iteration and was removed in pandas 2.0.
pair_frames = []
for (drug, chemical) in progressbar.progressbar(drug_chemicals):
    # Same normalisation for the query phrases and the stored labels
    drug = drug.lower().strip()
    chemical = chemical.lower().strip()

    mol_matches = {
        'PMID': [],
        'title': [],
        'Year': [],
        'Month': []
    }

    # An abstract must contain BOTH the drug phrase and the chemical phrase
    q = Q("match_phrase", abstract=drug) & Q("match_phrase", abstract=chemical)

    # Search
    hits = Search(
        using=es,
        index="pubmed"
    ).params(
        request_timeout=300
    ).query(q)

    # scan() iterates over every matching document
    for h in hits.scan():
        # The date field is text with single-quoted keys/values; the quotes
        # are swapped so it can be parsed as JSON.
        date_dict = json.loads(h.date.replace("'", '"'))
        mol_matches['PMID'].append(h.pmid)
        mol_matches['title'].append(h.title)
        mol_matches['Year'].append(date_dict['Year'])
        mol_matches['Month'].append(date_dict['Month'])

    match_df = pd.DataFrame.from_dict(mol_matches)
    match_df['drug'] = drug
    match_df['chemical'] = chemical
    pair_frames.append(match_df)

# pd.concat raises on an empty list, so fall back to an empty frame when there
# were no pairs at all.
matches = pd.concat(pair_frames) if pair_frames else pd.DataFrame()

matches.head()

| |                   #                           | 21059 Elapsed Time: 0:13:02

	PMID	title	Year	Month	drug	chemical
0	8376590	Homocysteine, a thrombogenic agent, suppresses...	1993	Sep	heparin	hydrogen peroxide
1	25037421	Degradation of fucoidans from Sargassum fulvel...	2014	Oct	heparin	hydrogen peroxide
2	14561655	Role of hydrogen peroxide in sperm capacitatio...	2004	Feb	heparin	hydrogen peroxide
3	9040037	Protective effect of dextran sulfate and hepar...	1997	Jan	heparin	hydrogen peroxide
4	10547607	Heparin-binding EGF-like growth factor is expr...	1999	Nov	heparin	hydrogen peroxide

# Persist the drug/chemical pair matches, without the index column.
matches.to_csv('output/chem_drug_pair_matches.csv', index=False)

# Reload the table from disk — presumably so the analysis below can start from
# the saved file without repeating the search. Column dtypes are re-inferred
# by read_csv on reload.
matches = pd.read_csv('output/chem_drug_pair_matches.csv')

Plotting the drug-chemical co-occurrence heatmap

Row and column colors show the oxidative stress category of each chemical and the category of each drug.

# Lookup table: chemical name -> oxidative-stress event category
# (duplicate rows removed, rows with a missing name or category dropped).
chem_name_cats = (
    chemical_list_df[['Molecule/Enzyme/Protein', 'Biological Events of Oxidative Stress']]
    .drop_duplicates()
    .rename(columns={
        'Molecule/Enzyme/Protein': 'chemical',
        'Biological Events of Oxidative Stress': 'chem_cat',
    })
    .dropna()
)

# Lookup table: drug name -> drug category (same clean-up as above).
drug_name_cats = (
    drug_list_df[['Name', 'Drug Category']]
    .drop_duplicates()
    .rename(columns={'Name': 'drug', 'Drug Category': 'drug_cat'})
    .dropna()
)

# Attach both categories to every match. Each merge joins on the columns the
# two frames have in common; left joins keep matches with no category.
with_cats_matches = (
    matches
    .merge(chem_name_cats, how='left', validate='m:m')
    .merge(drug_name_cats, how='left', validate='m:m')
)

# Compare the number of complete rows before and after adding categories
print(matches.dropna().shape, with_cats_matches.dropna().shape)
with_cats_matches.head()

(199479, 6) (204265, 8)

	PMID	title	Year	Month	drug	chemical	chem_cat	drug_cat
0	8376590	Homocysteine, a thrombogenic agent, suppresses...	1993.0	Sep	heparin	hydrogen peroxide	Initiation of Oxidative	Anticoagulants
1	25037421	Degradation of fucoidans from Sargassum fulvel...	2014.0	Oct	heparin	hydrogen peroxide	Initiation of Oxidative	Anticoagulants
2	14561655	Role of hydrogen peroxide in sperm capacitatio...	2004.0	Feb	heparin	hydrogen peroxide	Initiation of Oxidative	Anticoagulants
3	9040037	Protective effect of dextran sulfate and hepar...	1997.0	Jan	heparin	hydrogen peroxide	Initiation of Oxidative	Anticoagulants
4	10547607	Heparin-binding EGF-like growth factor is expr...	1999.0	Nov	heparin	hydrogen peroxide	Initiation of Oxidative	Anticoagulants

# List the distinct chemical categories present after the merge
with_cats_matches.chem_cat.unique()

array(['Initiation of Oxidative', 'Outcomes of Oxidative Stress',
       'Regulation of Oxidative Stress'], dtype=object)

# Creating color palettes to label drug and chemical categories.
# Categories are collected with NaN removed: Series.unique() keeps NaN while
# Series.nunique() ignores it, so without dropna() the category array could be
# one entry longer than the palette. zip() would then silently leave a real
# category without a color and the tick labels below would not line up.
chems = with_cats_matches.chem_cat.dropna().unique()
chem_pal = sns.color_palette("hls", n_colors=len(chems))
chem_pal_dict = dict(zip(chems, chem_pal))

drugs = with_cats_matches.drug_cat.dropna().unique()
drug_pal = sns.color_palette("tab20c", n_colors=len(drugs))
# The last five entries are taken from "tab20b" instead, since "tab20c" only
# provides 20 distinct colors.
drug_pal[-5:] = sns.color_palette("tab20b", n_colors=5)
drug_pal_dict = dict(zip(drugs, drug_pal))

# Per-row colors; rows whose category is missing get NaN here
with_cats_matches['chem_color'] = with_cats_matches.chem_cat.map(chem_pal_dict)
with_cats_matches['drug_color'] = with_cats_matches.drug_cat.map(drug_pal_dict)

# Show each palette as a strip of swatches labelled with its categories
sns.palplot(chem_pal)
plt.gca().set_xticklabels(chems)
plt.xticks(rotation=60)
sns.palplot(drug_pal)
plt.gca().set_xticklabels(drugs)
plt.xticks(rotation=90);

png

## Save Color Palettes
# Each category -> color mapping is written to its own JSON file in the
# current working directory (drug palette first, then chemical palette).
for palette_path, palette in (
    ('drug_cat_palette.json', drug_pal_dict),
    ('chem_cat_palette.json', chem_pal_dict),
):
    with open(palette_path, 'w') as fp:
        json.dump(palette, fp)

# Set NaN category color to white
with_cats_matches.loc[with_cats_matches.drug_color.isna(), 'drug_color'] = "white"

# Count distinct articles per drug-chemical co-occurrence (and category pair)
article_count = (
    with_cats_matches
    .groupby(['drug', 'chemical', 'drug_cat', 'chem_cat'])
    .PMID.nunique()
    .reset_index()
    .rename(columns={'PMID': 'Article Count'})
)
# NOTE(review): a pair with exactly one article has log10(1) == 0, the same
# value used below to fill pairs with no articles — confirm this is acceptable.
article_count['log_count'] = np.log10(article_count['Article Count'])

# Chemical x drug matrix of log10 article counts. It is built BEFORE the color
# lists because they are ordered by this table's index and columns; previously
# piv_count was referenced before it was defined, raising NameError on a clean
# top-to-bottom run.
piv_count = article_count.pivot_table(
    index='chemical',
    columns='drug',
    values='log_count',
    fill_value=0
)

# One color per heatmap row (chemical), in row order; the first color found
# for a chemical is used.
chem_colors_df = with_cats_matches[['chemical', 'chem_color']].drop_duplicates()
chem_colors = [chem_colors_df[chem_colors_df.chemical == chem].chem_color.unique()[0] for chem in piv_count.index]

# One color per heatmap column (drug), in column order
drug_colors_df = with_cats_matches[['drug', 'drug_color']].drop_duplicates()
drug_colors = [drug_colors_df[drug_colors_df.drug == drug].drug_color.unique()[0] for drug in piv_count.columns]

# Clustered heatmap with category colors along both axes
sns.clustermap(
    piv_count,
    figsize=(22,13),
    cmap='viridis',
    row_colors=chem_colors,
    col_colors=drug_colors
)

<seaborn.matrix.ClusterGrid at 0x7ff7afcb4898>

png

Find PMIDs associated with drugs via Elasticsearch

Searches abstracts for drug names or synonyms of drug names
Counts the occurrences of the drug name or its synonyms in each abstract

Saves to output/Drug_PMID_occurances.csv

# One result frame per drug, concatenated once after the loop. DataFrame.append
# re-copied the growing table on every call and was removed in pandas 2.0.
drug_frames = []
tot = drug_list_df.Name.nunique()

# NOTE(review): groupby drops rows with a missing key by default, so drugs
# without a Synonyms or Drug Category entry are never searched — confirm this
# is intended.
for (drug, synonyms, category), m_df in progressbar.progressbar(drug_list_df.groupby(['Name', 'Synonyms', 'Drug Category'])):
    drug_match = {
        'PMID': [],
        'title': [],
        'MeSH': [],
        'count': [],
        'Year': [],
        'Month': []
    }

    # The Synonyms column holds the text of a Python list literal
    # ("['a', 'b']"). Parsing it keeps brackets and quotes out of the search
    # terms and keeps synonyms that themselves contain ", " intact. Values that
    # are not a list literal fall back to the plain comma split.
    try:
        parsed_synonyms = ast.literal_eval(synonyms)
    except (ValueError, SyntaxError):
        parsed_synonyms = None
    if not isinstance(parsed_synonyms, (list, tuple)):
        parsed_synonyms = synonyms.split(', ')
    synonyms = [str(s).lower().strip() for s in parsed_synonyms]
    # Drop empty terms: str.count('') would otherwise add len(abstract) + 1
    synonyms = [s for s in synonyms if s]

    drug = drug.lower()

    # An abstract matches if it contains the drug name OR any synonym
    q = Q('match_phrase', abstract=drug)
    for s in synonyms:
        q = q | Q('match_phrase', abstract=s)

    hits = Search(
        using=es,
        index="pubmed"
    ).params(
        request_timeout=300
    ).query(q)

    # Phrases to count, with hyphens replaced by spaces (the abstract text is
    # treated the same way below). Hoisted out of the per-hit loop.
    count_phrases = [phrase.replace("-", " ") for phrase in [drug] + synonyms]

    for h in hits.scan():
        # The date field is text with single-quoted keys/values; the quotes
        # are swapped so it can be parsed as JSON.
        date_dict = json.loads(h.date.replace("'", '"'))
        drug_match['PMID'].append(h.pmid)
        drug_match['title'].append(h.title)
        drug_match['MeSH'].append(h.MeSH)
        drug_match['Year'].append(date_dict['Year'])
        drug_match['Month'].append(date_dict['Month'])

        # abs_lower was previously never defined (NameError). Every hit
        # matched on the `abstract` field, so the document is assumed to
        # expose it — confirm the index returns `abstract` in its source.
        abs_lower = h.abstract.lower().replace("-", " ")
        entity_count = sum(abs_lower.count(phrase) for phrase in count_phrases)

        drug_match['count'].append(entity_count)

    drug_match_df = pd.DataFrame.from_dict(drug_match)
    drug_match_df['drug'] = drug
    drug_match_df['category'] = category
    drug_frames.append(drug_match_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# drug group was processed.
drug_matches = pd.concat(drug_frames) if drug_frames else pd.DataFrame()

drug_matches.head()

100% (161 of 161) |######################| Elapsed Time: 0:37:59 Time:  0:37:59

	PMID	title	MeSH	Year	Month	drug	category
0	24853116	Acarbose monotherapy and weight loss in Easter...	[Acarbose, therapeutic use, Asian Continental ...	2014	Nov	acarbose	Alpha-glucosidase Inhibitors
1	24863354	Comparative evaluation of polysaccharides isol...	[Asteraceae, chemistry, Astragalus Plant, chem...	2014	Apr	acarbose	Alpha-glucosidase Inhibitors
2	24866329	Effects of sitagliptin or mitiglinide as an ad...	[Acarbose, therapeutic use, Aged, Asian Contin...	2014	Jul	acarbose	Alpha-glucosidase Inhibitors
3	12918894	Nateglinide (Starlix): update on a new antidia...	[Blood Glucose, physiology, Cyclohexanes, phar...			acarbose	Alpha-glucosidase Inhibitors
4	20568489	Digoxin: serious drug interactions.	[Digoxin, adverse effects, blood, Drug Interac...	2010	Apr	acarbose	Alpha-glucosidase Inhibitors

# Write every drug/PMID hit to disk (without the index column), then show the
# table's (rows, columns).
drug_matches.to_csv('output/Drug_PMID_occurances.csv', index=False)
drug_matches.shape

(2702853, 8)

# Number of distinct PMIDs matched within each drug category
drug_category_PMID_count = (
    drug_matches
    .groupby('category')['PMID']
    .nunique()
    .reset_index()
)
drug_category_PMID_count

	category	PMID
0	ACE Inhibitors	70299
1	Alpha-glucosidase Inhibitors	2008
2	Angiotensin II Antagonists	16230
3	Anticoagulants	80628
4	Antiplatelets	82006
5	Beta Blockers	949488
6	Bile Acid Resins	2459
7	Calcium Antagonist	47735
8	Calcium Channel Blockers	25446
9	Cholesteron Absorption Blocker	2554
10	Diuretics	28829
11	Fibrates	12214
12	Glucagon-like peptide-1 blockers	5180
13	HMG-CoA Reductase inhibitors (Statins)	22166
14	Inotropes	246382
15	Insulin	7386
16	Metformin	16564
17	Na Channel Blockers	42512
18	Other Anti Arrhythmics	130224
19	Phosphodiesterase Inhbitors	21894
20	Potassium Channel Blockers	10292
21	Sulfonylureas	12331
22	Thiazolidinediones	5089
23	Thrombolytics	309306
24	Vasodilators	408803
25	Vasopressin Antagonists	850

Searching for PMIDs associated with each chemical

If a MeSH heading is listed, searches the pubmed index for abstracts containing the chemical name OR articles whose MeSH terms contain the heading
If no MeSH heading is listed ("None listed"), only searches for the chemical name in the abstract

saves to output/Chemical_PMID_occurances.csv

# Keep rows that have at least a chemical name or a MeSH heading.
# NOTE(review): the groupby below drops rows with a missing key by default, so
# only rows with a name AND a heading AND a category are actually searched —
# confirm this is intended.
has_data_df = chemical_list_df[
    (~chemical_list_df['Molecule/Enzyme/Protein'].isnull()) |
    (~chemical_list_df['MeSH Heading'].isnull())
]

# One result frame per chemical, concatenated once after the loop.
# DataFrame.append re-copied the growing table on every call and was removed
# in pandas 2.0.
chem_frames = []

for (name, mesh, category), m_df in progressbar.progressbar(has_data_df.groupby(['Molecule/Enzyme/Protein', 'MeSH Heading', 'Biological Events of Oxidative Stress'])):
    hit_dict = {
        'PMID': [],
        'Article MeSH': [],
        'Year': [],
        'Month': [],
    }

    # Always search the abstract for the chemical name; additionally match on
    # the article's MeSH terms unless the heading is the "None listed"
    # placeholder.
    q = Q('match_phrase', abstract=name.lower())
    if mesh.lower() != 'none listed':
        q = q | Q('match_phrase', MeSH=mesh)

    hits = Search(
        using=es,
        index="pubmed"
    ).params(
        request_timeout=300
    ).query(q)

    for h in hits.scan():
        # The date field is text with single-quoted keys/values; the quotes
        # are swapped so it can be parsed as JSON.
        date_dict = json.loads(h.date.replace("'", '"'))
        hit_dict['PMID'].append(h.pmid)
        hit_dict['Article MeSH'].append(h.MeSH)
        hit_dict['Year'].append(date_dict['Year'])
        hit_dict['Month'].append(date_dict['Month'])

    hit_df = pd.DataFrame.from_dict(hit_dict)
    hit_df['category'] = category
    hit_df['chemical'] = name
    hit_df['MeSH'] = mesh
    chem_frames.append(hit_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# chemical group was processed.
chem_matches_df = pd.concat(chem_frames) if chem_frames else pd.DataFrame()

chem_matches_df.head()

100% (157 of 157) |######################| Elapsed Time: 0:44:52 Time:  0:44:52

	PMID	Article MeSH	Year	Month	category	chemical	MeSH
0	24852702	[Alcohols, metabolism, toxicity, Aldehydes, me...	2014	Sep	Initiation of Oxidative	4-hydroxy-2-nonenal (4-HNE)	Aldehydes
1	24854020	[Adult, Aldehydes, metabolism, Case-Control St...	2015	Apr	Initiation of Oxidative	4-hydroxy-2-nonenal (4-HNE)	Aldehydes
2	24854122	[Acetylcysteine, pharmacology, Aldehydes, phar...	2014	Nov	Initiation of Oxidative	4-hydroxy-2-nonenal (4-HNE)	Aldehydes
3	24877583	[4-Butyrolactone, chemistry, Aldehydes, chemis...	2014	Jun	Initiation of Oxidative	4-hydroxy-2-nonenal (4-HNE)	Aldehydes
4	24878441	[Absorption, Physicochemical, Acetonitriles, c...	2014	Nov	Initiation of Oxidative	4-hydroxy-2-nonenal (4-HNE)	Aldehydes

# Write every chemical/PMID hit to disk (without the index column), then show
# the table's (rows, columns).
chem_matches_df.to_csv('output/Chemical_PMID_occurances.csv', index=False)
chem_matches_df.shape

(3291433, 7)