Finding Occurrences of Chemicals and Drugs in PubMed Abstracts

Searches the Elasticsearch index created during the CaseOLAP pipeline run for a curated list of drugs and chemicals related to oxidative stress.

Output:

  • Chemical_PMID_occurances.csv: CSV table where each row is the occurrence of a chemical in PubMed
  • Drug_PMID_occurances.csv: CSV table where each row is the occurrence of a drug in PubMed
import ast
import json
import time
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar
import seaborn as sns
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
  • Requires an Elasticsearch instance to be running on the cluster. It must already hold the PMID index (the searches below query the index named "pubmed").
# Load the curated oxidative-stress chemical list and normalise the name
# column (lower-case, surrounding whitespace stripped) that is used as the
# search phrase later on.
chemical_list_df = pd.read_csv('input/oxidative_stress_chemicals_SA_10222019.csv')
chemical_list_df['Molecule/Enzyme/Protein'] = (
    chemical_list_df['Molecule/Enzyme/Protein']
    .str.lower()
    .str.strip()
)
chemical_list_df.head()
Biological Events of Oxidative Stress Molecular and Functional Categories Molecule/Enzyme/Protein MeSH Heading MeSH Supplementary MeSH tree numbers Chemical Formula Examples Pharm Actions Tree Numbers References
0 Initiation of Oxidative Reactive Oxygen Species (ROS) superoxide (anion radical) Superoxides NaN D01.248.497.158.685.750.850; D01.339.431.374.8... O2- Superoxide, Hydrogen Peroxide Oxidants D27.720.642,\nD27.888.569.540 PMID: 25547488
1 Initiation of Oxidative Reactive Oxygen Species (ROS) hydrogen peroxide Hydrogen Peroxide NaN D01.248.497.158.685.750.424; D01.339.431.374.4... H2O2 NaN Anti-Infective Agents, Local D27.505.954.122.187 NaN
2 Initiation of Oxidative Reactive Oxygen Species (ROS) NaN NaN NaN NaN NaN NaN Oxidants D27.720.642,\nD27.888.569.540 NaN
3 Initiation of Oxidative Reactive Oxygen Species (ROS) hydroxyl (radical) Hydroxyl Radical NaN D01.339.431.249; D01.248.497.158.459.300; D01.... HO NaN Oxidants D27.720.642,\nD27.888.569.540 NaN
4 Initiation of Oxidative Reactive Oxygen Species (ROS) alpha oxygen None listed NaN NaN NaN NaN NaN NaN NaN
# Load the curated drug list and normalise the drug names (lower-case,
# surrounding whitespace stripped) that are used as search phrases later on.
drug_list_df = pd.read_csv('input/drug_list_SA_10222019.csv')
drug_list_df['Name'] = (
    drug_list_df['Name']
    .str.lower()
    .str.strip()
)
drug_list_df.head()
Drug Category # Name Synonyms MeSH Descriptor MeSH tree(s) Common adverse effects Dosage (freq/amount/time/delivery) Duration (time) Pharm Action ... Unnamed: 1015 Unnamed: 1016 Unnamed: 1017 Unnamed: 1018 Unnamed: 1019 Unnamed: 1020 Unnamed: 1021 Unnamed: 1022 Unnamed: 1023 Unnamed: 1024
0 Anticoagulants 1.0 heparin ['Calciparine', 'Eparina', 'heparina', 'Hepari... heparin D09.698.373.400 Thrombocytopenia, Cerebral haemorrhage, Haemog... 1/18U/kg/iv 2 days Anticoagulants, \nFibrinolytic Agents ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Anticoagulants 2.0 warfarin ['4-Hydroxy-3-(3-oxo-1-phenylbutyl)coumarin', ... warfarin D03.383.663.283.446.520.914\nD03.633.100.150.4... Haemorrhage, Haematoma, anaemia, Epistaxis, hy... 1/2-10mg/day/po As needed Anticoagulants, \nRodenticides ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Thrombolytics 3.0 streptokinase ['Streptokinase C precursor'] streptokinase D08.811.277.656.300.775\nD12.776.124.125.662.537 blurred vision, confusion, dizziness, fever, s... 1/1,500,000 IU/iv 60min Fibrinolytic Agents ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 Thrombolytics 4.0 urokinase ['U-plasminogen activator', 'uPA', 'Urokinase-... Urokinase-Type Plasminogen Activator D08.811.277.656.300.760.910\nD08.811.277.656.9... bleeding gums, coughing up blood, dizziness, h... 1/4,000,000U/iv 10min NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 Thrombolytics 5.0 tpa ['Alteplasa', 'Alteplase (genetical recombinat... Tissue Plasminogen Activator D08.811.277.656.300.760.875\nD08.811.277.656.9... NaN 1/0.9mg/kg/iv 60min Fibrinolytic Agents ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 1025 columns

# Elasticsearch client shared by every search below. It is created without
# host arguments and with timeout=300.
# NOTE(review): presumably this targets the library's default local node and
# the timeout is in seconds - confirm against the cluster setup.
es = Elasticsearch(timeout=300)
  • output file saved to output/chem_drug_pair_matches.csv
# Search PubMed abstracts for every (drug, chemical) pair and record each
# article whose abstract contains both phrases.

# All combinations of drugs and chemicals
drug_chemicals = product(
    drug_list_df['Name'].dropna().unique(),
    chemical_list_df['Molecule/Enzyme/Protein'].dropna().unique()
)

# One DataFrame per pair, concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated table on every call, so it is not used here.
pair_frames = []
for (drug, chemical) in progressbar.progressbar(drug_chemicals):
    # Normalise once; the same values are used for the query and the labels.
    drug = drug.lower().strip()
    chemical = chemical.lower().strip()

    mol_matches = {
        'PMID': [],
        'title': [],
        'Year': [],
        'Month': []
    }

    # Match drug and chemical: both phrases must occur in the abstract
    q = Q("match_phrase", abstract=drug) & Q("match_phrase", abstract=chemical)

    # Search
    hits = Search(
        using=es,
        index="pubmed"
    ).params(
        request_timeout=300
    ).query(q)

    # scan() iterates over every matching document, not only the first page
    for h in hits.scan():
        # The date field is read as text with single-quoted keys; the quotes
        # are swapped so that it can be parsed as JSON.
        date_dict = json.loads(h.date.replace("'", '"'))
        mol_matches['PMID'].append(h.pmid)
        mol_matches['title'].append(h.title)
        mol_matches['Year'].append(date_dict['Year'])
        mol_matches['Month'].append(date_dict['Month'])

    match_df = pd.DataFrame.from_dict(mol_matches)
    match_df['drug'] = drug
    match_df['chemical'] = chemical
    pair_frames.append(match_df)

# pd.concat raises on an empty list, so fall back to an empty table
matches = pd.concat(pair_frames) if pair_frames else pd.DataFrame()

matches.head()
| |                   #                           | 21059 Elapsed Time: 0:13:02
PMID title Year Month drug chemical
0 8376590 Homocysteine, a thrombogenic agent, suppresses... 1993 Sep heparin hydrogen peroxide
1 25037421 Degradation of fucoidans from Sargassum fulvel... 2014 Oct heparin hydrogen peroxide
2 14561655 Role of hydrogen peroxide in sperm capacitatio... 2004 Feb heparin hydrogen peroxide
3 9040037 Protective effect of dextran sulfate and hepar... 1997 Jan heparin hydrogen peroxide
4 10547607 Heparin-binding EGF-like growth factor is expr... 1999 Nov heparin hydrogen peroxide
# Persist the drug-chemical pair matches, then load them back from the saved
# file so the following cells work from the on-disk copy.
pair_matches_path = 'output/chem_drug_pair_matches.csv'
matches.to_csv(pair_matches_path, index=False)
matches = pd.read_csv(pair_matches_path)

Plotting the drug-chemical co-occurrence heatmap

  • Colors show the category of each drug and the oxidative-stress category associated with each chemical
# Lookup tables: chemical -> oxidative-stress category, drug -> drug category.
# Duplicate rows and rows with a missing name or category are removed.
chem_cat_columns = {
    'Molecule/Enzyme/Protein': 'chemical',
    'Biological Events of Oxidative Stress': 'chem_cat',
}
chem_name_cats = (
    chemical_list_df[list(chem_cat_columns)]
    .drop_duplicates()
    .rename(columns=chem_cat_columns)
    .dropna()
)

drug_cat_columns = {
    'Name': 'drug',
    'Drug Category': 'drug_cat',
}
drug_name_cats = (
    drug_list_df[list(drug_cat_columns)]
    .drop_duplicates()
    .rename(columns=drug_cat_columns)
    .dropna()
)

# Attach both categories to every match. The joins use the columns the tables
# have in common; validate='m:m' performs no uniqueness check, so a name that
# is listed under several categories yields one output row per category.
with_cats_matches = (
    matches
    .merge(chem_name_cats, how='left', validate='m:m')
    .merge(drug_name_cats, how='left', validate='m:m')
)

# Compare the number of complete rows before and after the joins
print(matches.dropna().shape, with_cats_matches.dropna().shape)
with_cats_matches.head()
(199479, 6) (204265, 8)
PMID title Year Month drug chemical chem_cat drug_cat
0 8376590 Homocysteine, a thrombogenic agent, suppresses... 1993.0 Sep heparin hydrogen peroxide Initiation of Oxidative Anticoagulants
1 25037421 Degradation of fucoidans from Sargassum fulvel... 2014.0 Oct heparin hydrogen peroxide Initiation of Oxidative Anticoagulants
2 14561655 Role of hydrogen peroxide in sperm capacitatio... 2004.0 Feb heparin hydrogen peroxide Initiation of Oxidative Anticoagulants
3 9040037 Protective effect of dextran sulfate and hepar... 1997.0 Jan heparin hydrogen peroxide Initiation of Oxidative Anticoagulants
4 10547607 Heparin-binding EGF-like growth factor is expr... 1999.0 Nov heparin hydrogen peroxide Initiation of Oxidative Anticoagulants
# Distinct oxidative-stress categories present among the matched chemicals
with_cats_matches['chem_cat'].unique()
array(['Initiation of Oxidative', 'Outcomes of Oxidative Stress',
       'Regulation of Oxidative Stress'], dtype=object)
# Creating color palettes to label drug and chemical categories.
#
# NaN is dropped from the category lists before colors are assigned:
# unique() keeps NaN whereas nunique() ignores it, so without dropna() the
# category list could be one entry longer than the palette and zip() would
# then pair NaN with a color while leaving the last real category without
# one (and shift the tick labels of the palette preview).
chems = with_cats_matches.chem_cat.dropna().unique()
chem_pal = sns.color_palette("hls", n_colors=len(chems))
chem_pal_dict = dict(zip(chems, chem_pal))

drugs = with_cats_matches.drug_cat.dropna().unique()
drug_pal = sns.color_palette("tab20c", n_colors=len(drugs))
# Replace the last five colors with entries from the "tab20b" palette
drug_pal[-5:] = sns.color_palette("tab20b", n_colors=5)
drug_pal_dict = dict(zip(drugs, drug_pal))

# Per-row colors; rows without a category get NaN here
with_cats_matches['chem_color'] = with_cats_matches.chem_cat.map(chem_pal_dict)
with_cats_matches['drug_color'] = with_cats_matches.drug_cat.map(drug_pal_dict)

# Preview both palettes with their category names as tick labels
sns.palplot(chem_pal)
plt.gca().set_xticklabels(chems)
plt.xticks(rotation=60)
sns.palplot(drug_pal)
plt.gca().set_xticklabels(drugs)
plt.xticks(rotation=90);

png

png

## Save Color Palettes
# Each category -> color mapping is written to its own JSON file.
for palette_path, palette in (
    ('drug_cat_palette.json', drug_pal_dict),
    ('chem_cat_palette.json', chem_pal_dict),
):
    with open(palette_path, 'w') as fp:
        json.dump(palette, fp)

# Set NaN category color to white
missing_drug_color = with_cats_matches.drug_color.isna()
with_cats_matches.loc[missing_drug_color, 'drug_color'] = "white"
# Count articles per drug-chemical co-occurrence.
# groupby drops rows whose key columns are NaN (pandas default), so pairs
# without a drug or chemical category do not appear in the heatmap.
article_count = (
    with_cats_matches
    .groupby(['drug', 'chemical', 'drug_cat', 'chem_cat'])
    .PMID.nunique()
    .reset_index()
    .rename(columns={'PMID': 'Article Count'})
)
# NOTE(review): log10(1) == 0, which is also the fill value used for pairs
# with no articles - single-article pairs look identical to empty ones.
article_count['log_count'] = np.log10(article_count['Article Count'])

# The pivot table has to exist before the color lists are built, because they
# follow its row/column order (it was previously referenced before being
# defined, which fails on a fresh top-to-bottom run).
piv_count = article_count.pivot_table(
    index='chemical',
    columns='drug',
    values='log_count',
    fill_value=0
)

# One color per heatmap row (chemical): the first color seen for that name
chem_colors_df = with_cats_matches[['chemical', 'chem_color']].drop_duplicates()
first_chem_color = chem_colors_df.drop_duplicates(subset='chemical')
chem_color_by_name = dict(zip(first_chem_color.chemical, first_chem_color.chem_color))
chem_colors = [chem_color_by_name[chem] for chem in piv_count.index]

# One color per heatmap column (drug): the first color seen for that name
drug_colors_df = with_cats_matches[['drug', 'drug_color']].drop_duplicates()
first_drug_color = drug_colors_df.drop_duplicates(subset='drug')
drug_color_by_name = dict(zip(first_drug_color.drug, first_drug_color.drug_color))
drug_colors = [drug_color_by_name[drug] for drug in piv_count.columns]

sns.clustermap(
    piv_count,
    figsize=(22,13),
    cmap='viridis',
    row_colors=chem_colors,
    col_colors=drug_colors
)
<seaborn.matrix.ClusterGrid at 0x7ff7afcb4898>

png

  • Searches abstracts for drug names or synonyms of drug names
  • Finds the number of occurrences of the drug name or its synonyms in each abstract

Saves to output/Drug_PMID_occurances.csv

# For every drug, search PubMed abstracts for the drug name or any of its
# synonyms, and count how often those phrases occur in each matching abstract.
#
# Per-drug tables are collected and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# table on every call.
drug_frames = []
tot = drug_list_df.Name.nunique()

# groupby drops rows with NaN in any key column (pandas default), so drugs
# without a synonym list or category are not searched here.
for (drug, synonyms, category), m_df in progressbar.progressbar(drug_list_df.groupby(['Name', 'Synonyms', 'Drug Category'])):
    drug_match = {
            'PMID': [],
            'title': [],
            'MeSH': [],
            'count': [],
            'Year': [],
            'Month': []
        }
    drug = drug.lower()

    # The Synonyms column holds a list literal stored as text, e.g.
    # "['Calciparine', 'Eparina']". Parse it properly so the phrases do not
    # carry brackets/quotes (which made the substring count below miss them).
    try:
        parsed_synonyms = ast.literal_eval(synonyms)
    except (ValueError, TypeError, SyntaxError):
        parsed_synonyms = None
    if isinstance(parsed_synonyms, (list, tuple)):
        raw_synonyms = [str(s) for s in parsed_synonyms]
    else:
        # Not a list literal: fall back to a plain comma-separated split
        raw_synonyms = [s.strip(" []'\"") for s in synonyms.split(', ')]
    # Lower-case and drop blanks ("".count-style matches would inflate counts)
    synonyms = [s.strip().lower() for s in raw_synonyms if s.strip()]

    # Abstract must contain the drug name OR any synonym
    q = Q('match_phrase', abstract=drug)
    for s in synonyms:
        q = q | Q('match_phrase', abstract=s)

    # request_timeout matches the other PubMed searches in this notebook
    hits = Search(
        using=es,
        index="pubmed"
    ).params(
        request_timeout=300
    ).query(q)

    # Phrases to count, normalised once per drug; identical phrases are kept
    # only once so a synonym equal to the drug name is not counted twice.
    count_phrases = list(dict.fromkeys(
        phrase.replace("-", " ") for phrase in [drug] + synonyms
    ))

    for h in hits.scan():
        # The date field is read as text with single-quoted keys; the quotes
        # are swapped so that it can be parsed as JSON.
        date_dict = json.loads(h.date.replace("'", '"'))
        drug_match['PMID'].append(h.pmid)
        drug_match['title'].append(h.title)
        drug_match['MeSH'].append(h.MeSH)
        drug_match['Year'].append(date_dict['Year'])
        drug_match['Month'].append(date_dict['Month'])

        # abs_lower was never assigned in this cell.
        # NOTE(review): assumed to be the hit's abstract, lower-cased with
        # hyphens replaced by spaces to mirror the phrase normalisation -
        # confirm against the index mapping.
        abs_lower = (getattr(h, 'abstract', None) or '').lower().replace("-", " ")

        entity_count = 0
        for entity_lower in count_phrases:
            entity_count += abs_lower.count(entity_lower)

        drug_match['count'].append(entity_count)

    drug_match_df = pd.DataFrame.from_dict(drug_match)
    drug_match_df['drug'] = drug
    drug_match_df['category'] = category
    drug_frames.append(drug_match_df)

# pd.concat raises on an empty list, so fall back to an empty table
drug_matches = pd.concat(drug_frames) if drug_frames else pd.DataFrame()

drug_matches.head()
100% (161 of 161) |######################| Elapsed Time: 0:37:59 Time:  0:37:59
PMID title MeSH count Year Month drug category
0 24853116 Acarbose monotherapy and weight loss in Easter... [Acarbose, therapeutic use, Asian Continental ... 0 2014 Nov acarbose Alpha-glucosidase Inhibitors
1 24863354 Comparative evaluation of polysaccharides isol... [Asteraceae, chemistry, Astragalus Plant, chem... 0 2014 Apr acarbose Alpha-glucosidase Inhibitors
2 24866329 Effects of sitagliptin or mitiglinide as an ad... [Acarbose, therapeutic use, Aged, Asian Contin... 0 2014 Jul acarbose Alpha-glucosidase Inhibitors
3 12918894 Nateglinide (Starlix): update on a new antidia... [Blood Glucose, physiology, Cyclohexanes, phar... 0 acarbose Alpha-glucosidase Inhibitors
4 20568489 Digoxin: serious drug interactions. [Digoxin, adverse effects, blood, Drug Interac... 0 2010 Apr acarbose Alpha-glucosidase Inhibitors
# Write the per-drug PubMed occurrences to disk, then show the table size.
drug_occurrence_path = 'output/Drug_PMID_occurances.csv'
drug_matches.to_csv(drug_occurrence_path, index=False)
drug_matches.shape
(2702853, 8)
# Number of distinct PMIDs matched per drug category
drug_category_PMID_count = (
    drug_matches
    .groupby('category')['PMID']
    .nunique()
    .reset_index()
)
drug_category_PMID_count
category PMID
0 ACE Inhibitors 70299
1 Alpha-glucosidase Inhibitors 2008
2 Angiotensin II Antagonists 16230
3 Anticoagulants 80628
4 Antiplatelets 82006
5 Beta Blockers 949488
6 Bile Acid Resins 2459
7 Calcium Antagonist 47735
8 Calcium Channel Blockers 25446
9 Cholesteron Absorption Blocker 2554
10 Diuretics 28829
11 Fibrates 12214
12 Glucagon-like peptide-1 blockers 5180
13 HMG-CoA Reductase inhibitors (Statins) 22166
14 Inotropes 246382
15 Insulin 7386
16 Metformin 16564
17 Na Channel Blockers 42512
18 Other Anti Arrhythmics 130224
19 Phosphodiesterase Inhbitors 21894
20 Potassium Channel Blockers 10292
21 Sulfonylureas 12331
22 Thiazolidinediones 5089
23 Thrombolytics 309306
24 Vasodilators 408803
25 Vasopressin Antagonists 850

Searching for PMIDs associated with each chemical

  • If a MeSH heading is listed, searches the pubmed index for abstracts containing the chemical name OR articles whose MeSH terms contain that heading
  • If no MeSH heading is listed ("None listed"), only searches for the chemical name in the abstract

saves to output/Chemical_PMID_occurances.csv

# For every chemical, find the PubMed articles that mention it, either by
# name in the abstract or (when a MeSH heading is listed) by MeSH term.

# Keep rows that have a chemical name or a MeSH heading. The groupby below
# additionally drops rows with NaN in any of its key columns (pandas default).
has_data_df = chemical_list_df[
    (~chemical_list_df['Molecule/Enzyme/Protein'].isnull()) |
    (~chemical_list_df['MeSH Heading'].isnull())
]

# Per-chemical tables are collected and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# table on every call.
chem_frames = []

for (name, mesh, category), m_df in progressbar.progressbar(has_data_df.groupby(['Molecule/Enzyme/Protein', 'MeSH Heading', 'Biological Events of Oxidative Stress'])):
    hit_dict = {
        'PMID': [],
        'Article MeSH': [],
        'Year': [],
        'Month': [],
    }
    # "None listed" marks chemicals without a MeSH heading: search by name only
    if mesh.lower() == 'none listed':
        q = Q('match_phrase', abstract=name.lower())
    else:
        q = Q('match_phrase', abstract=name.lower()) | Q('match_phrase', MeSH=mesh)

    hits = Search(
        using=es,
        index="pubmed"
    ).params(
        request_timeout=300
    ).query(q)

    # scan() iterates over every matching document, not only the first page
    for h in hits.scan():
        # The date field is read as text with single-quoted keys; the quotes
        # are swapped so that it can be parsed as JSON.
        date_dict = json.loads(h.date.replace("'", '"'))
        hit_dict['PMID'].append(h.pmid)
        hit_dict['Article MeSH'].append(h.MeSH)
        hit_dict['Year'].append(date_dict['Year'])
        hit_dict['Month'].append(date_dict['Month'])

    hit_df = pd.DataFrame.from_dict(hit_dict)
    hit_df['category'] = category
    hit_df['chemical'] = name
    hit_df['MeSH'] = mesh
    chem_frames.append(hit_df)

# pd.concat raises on an empty list, so fall back to an empty table
chem_matches_df = pd.concat(chem_frames) if chem_frames else pd.DataFrame()

chem_matches_df.head()
100% (157 of 157) |######################| Elapsed Time: 0:44:52 Time:  0:44:52
PMID Article MeSH Year Month category chemical MeSH
0 24852702 [Alcohols, metabolism, toxicity, Aldehydes, me... 2014 Sep Initiation of Oxidative 4-hydroxy-2-nonenal (4-HNE) Aldehydes
1 24854020 [Adult, Aldehydes, metabolism, Case-Control St... 2015 Apr Initiation of Oxidative 4-hydroxy-2-nonenal (4-HNE) Aldehydes
2 24854122 [Acetylcysteine, pharmacology, Aldehydes, phar... 2014 Nov Initiation of Oxidative 4-hydroxy-2-nonenal (4-HNE) Aldehydes
3 24877583 [4-Butyrolactone, chemistry, Aldehydes, chemis... 2014 Jun Initiation of Oxidative 4-hydroxy-2-nonenal (4-HNE) Aldehydes
4 24878441 [Absorption, Physicochemical, Acetonitriles, c... 2014 Nov Initiation of Oxidative 4-hydroxy-2-nonenal (4-HNE) Aldehydes
# Write the per-chemical PubMed occurrences to disk, then show the table size.
chemical_occurrence_path = 'output/Chemical_PMID_occurances.csv'
chem_matches_df.to_csv(chemical_occurrence_path, index=False)
chem_matches_df.shape
(3291433, 7)