In this work we describe the temporal variability analysis of the Mexican Government dataset, carried out as part of the initial data quality assessment of the COVID-19 Metaclustering project of the Biomedical Data Science Lab of the Universitat Politècnica de València, Spain.
An initial assessment has shown unstable periods at the beginning and end of the repository acquisition period. This is usually related to transient states due to, e.g., few cases. In this report, to improve the visualization of results, we have focused on the period between 2020-03-15 and 2020-10-15.
This study can also be fully explored in our ehrtemporalvariability.upv.es tool. Further information on the EHRtemporalVariability package can be found in our GitHub repository EHRtemporalVariability.
If you use this code please cite:
Lexin Zhou, Nekane Romero, Juan Martínez-Miranda, J Alberto Conejero, Juan M García-Gómez, Carlos Sáez. Heterogeneity in COVID-19 severity patterns among age-gender groups: an analysis of 778 692 Mexican patients through a meta-clustering technique. Preprint submitted to medRxiv.
If you are interested in collaborating in this work please contact us.
Install and load the required packages.
# install.packages('EHRtemporalVariability')
# install.packages('glue')
library(EHRtemporalVariability)
library(glue)### LOAD AND FORMAT DATASET
# Unpack the compressed dataset into ./data and read the extracted CSV.
# Empty fields are read as NA. Column types are supplied positionally: one
# entry per CSV column, in file order (the trailing comment names the column).
untar("./data/COVID19_MEXICO_LATEST_DATA.tar.gz", exdir = "data")
data <- read.csv(
  file = "./data/COVID19_MEXICO_LATEST_DATA.csv",
  header = TRUE,
  sep = ",",
  dec = ".",
  na.strings = "",
  stringsAsFactors = FALSE,
  encoding = "UTF-8",
  colClasses = c(
    "numeric",   # Index of row
    "Date",      # LAST_UPDATE
    "character", # ID_REGISTRATION
    "factor",    # ORIGIN
    "factor",    # SECTOR
    "character", # ENTITY_UM
    "factor",    # SEX
    "character", # ENTITY_NAT
    "character", # ENTITY_RES
    "character", # MUNICIPALITY_RES
    "factor",    # PATIENT_TYPE
    "Date",      # ADMISSION_DATE
    "Date",      # SYMPTOMS_DATE
    "Date",      # DEATH_DATE
    "factor",    # INTUBATED
    "factor",    # PNEUMONIA
    "integer",   # AGE
    "character", # NATIONALITY
    "factor",    # PREGNANT
    "factor",    # SPEAK_INDIGENOUS_LANGUAGE
    "factor",    # Indigenous
    "factor",    # DIABETES
    "factor",    # COPD
    "factor",    # ASTHMA
    "factor",    # INMUSUPR
    "factor",    # HYPERTENSION
    "factor",    # OTHER_DISEASE
    "factor",    # CARDIOVASCULAR
    "factor",    # OBESITY
    "factor",    # CHRONIC_KIDNEY_DISEASE
    "factor",    # SMOKE
    "factor",    # OTHER_CASE_CONTACT
    "factor",    # TESTED
    "factor",    # RESULT_LAB
    "factor",    # FINAL_CLASIFICATION
    "factor",    # MIGRANT
    "character", # COUNTRY_NATIONALITY
    "character", # COUNTRY_ORIGIN
    "factor",    # UCI
    "factor",    # OUTCOME ('See the Python notebook')
    "numeric",   # Survival_days ('See the Python notebook')
    "numeric"    # From_symptoms_to_hospital_days ('See the Python notebook')
  )
)
#########################################
# SELECT ONE OF THE FOLLOWING LINES     #
#########################################
# Keep only the cases with the chosen laboratory result. which() discards
# rows whose RESULT_LAB is NA; a bare logical index would instead turn each
# of those rows into an all-NA row.
data <- data[which(data$RESULT_LAB == "Positive SARS-CoV-2"), ]
# data <- data[which(data$RESULT_LAB == "Non-Positive SARS-CoV-2"), ]

# Select the variables that we want to study, in this order. Base R indexing
# is used because select() is not a base R function and no package providing
# it is attached in the visible part of this script.
study_columns <- c(
  "LAST_UPDATE", "ID_REGISTRATION", "ORIGIN", "SECTOR", "ENTITY_UM",
  "ENTITY_NAT", "ENTITY_RES", "MUNICIPALITY_RES", "NATIONALITY", "MIGRANT",
  "COUNTRY_NATIONALITY", "COUNTRY_ORIGIN", "SEX", "AGE", "OBESITY", "SMOKE",
  "PREGNANT", "SPEAK_INDIGENOUS_LANGUAGE", "PATIENT_TYPE", "ADMISSION_DATE",
  "SYMPTOMS_DATE", "DEATH_DATE", "PNEUMONIA", "DIABETES", "COPD", "ASTHMA",
  "INMUSUPR", "HYPERTENSION", "CARDIOVASCULAR", "CHRONIC_KIDNEY_DISEASE",
  "OTHER_DISEASE", "UCI", "INTUBATED", "OUTCOME", "Survival_days",
  "FromSymptomToHospital_days", "OTHER_CASE_CONTACT"
)
# Errors with "undefined columns selected" if any listed column is missing.
data <- data[, study_columns, drop = FALSE]
# Convert some variable types.
# NOTE(review): ageCat is filled with empty strings and is not referenced
# again in the visible code; confirm whether it is still needed.
data$ageCat <- vector(mode = "character", length = nrow(data))
# Translate UCI (Spanish) to ICU (Intensive Care Unit).
names(data)[names(data) == "UCI"] <- "ICU"
data$AGE <- as.integer(data$AGE)
data$Survival_days <- as.integer(data$Survival_days)
data$FromSymptomToHospital_days <- as.integer(data$FromSymptomToHospital_days)

# Manually fix variable types.
data$ADMISSION_DATE <- as.Date(data$ADMISSION_DATE)
data$SYMPTOMS_DATE <- as.Date(data$SYMPTOMS_DATE)
data$DEATH_DATE <- as.Date(data$DEATH_DATE)

# Remove rows with a missing admission date, and remove the LAST_UPDATE and
# ID_REGISTRATION columns due to their irrelevance. The columns are dropped
# by name rather than by position so the step does not depend on their order.
rows_with_admission <- !is.na(data$ADMISSION_DATE)
analysis_columns <- setdiff(names(data), c("LAST_UPDATE", "ID_REGISTRATION"))
data <- data[rows_with_admission, analysis_columns, drop = FALSE]

# Estimation of the data temporal maps and information geometric temporal
# plot objects. We set admission date as the reference date, select a weekly
# analysis, and focus on the period between 2020-03-15 and 2020-10-15. To
# expand the period to the full timespan, remove the startDate and endDate
# parameters.
# Estimate the data temporal maps from `data`, using ADMISSION_DATE as the
# reference date, a weekly period, and the 2020-03-15 to 2020-10-15 window.
# Numeric smoothing is switched off.
probMaps <- estimateDataTemporalMap(
  data = data,
  dateColumnName = "ADMISSION_DATE",
  period = "week",
  startDate = as.Date("2020-03-15"),
  endDate = as.Date("2020-10-15"),
  numericSmoothing = FALSE
)
# Estimate a 3-dimensional IGT projection for every data temporal map.
# lapply() always returns a list carrying the names of probMaps, so the `$`
# look-ups below do not rely on sapply()'s result simplification.
igtProjs <- lapply(probMaps, estimateIGTProjection, dimensions = 3)

# For each variable: the data temporal map (relative frequencies) followed by
# the 2-D IGT projection with its trajectory. Every call is a separate
# top-level statement so each plot is auto-printed.

# AGE (relative and absolute frequencies)
plotDataTemporalMap(probMaps$AGE, absolute = FALSE)
plotDataTemporalMap(probMaps$AGE, absolute = TRUE)
plotIGTProjection(igtProjs$AGE, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# SEX
plotDataTemporalMap(probMaps$SEX, absolute = FALSE)
plotIGTProjection(igtProjs$SEX, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# PATIENT_TYPE
plotDataTemporalMap(probMaps$PATIENT_TYPE, absolute = FALSE)
plotIGTProjection(igtProjs$PATIENT_TYPE, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# SECTOR
plotDataTemporalMap(probMaps$SECTOR, absolute = FALSE)
plotIGTProjection(igtProjs$SECTOR, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# ORIGIN
plotDataTemporalMap(probMaps$ORIGIN, absolute = FALSE)
plotIGTProjection(igtProjs$ORIGIN, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# ENTITY_UM
plotDataTemporalMap(probMaps$ENTITY_UM, absolute = FALSE)
plotIGTProjection(igtProjs$ENTITY_UM, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# ENTITY_RES
plotDataTemporalMap(probMaps$ENTITY_RES, absolute = FALSE)
plotIGTProjection(igtProjs$ENTITY_RES, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# MUNICIPALITY_RES (map called with endValue = 30)
plotDataTemporalMap(probMaps$MUNICIPALITY_RES, absolute = FALSE, endValue = 30)
plotIGTProjection(igtProjs$MUNICIPALITY_RES, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# COUNTRY_NATIONALITY
plotDataTemporalMap(probMaps$COUNTRY_NATIONALITY, absolute = FALSE)
plotIGTProjection(igtProjs$COUNTRY_NATIONALITY, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# MIGRANT
plotDataTemporalMap(probMaps$MIGRANT, absolute = FALSE)
plotIGTProjection(igtProjs$MIGRANT, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# OBESITY
plotDataTemporalMap(probMaps$OBESITY, absolute = FALSE)
plotIGTProjection(igtProjs$OBESITY, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# SMOKE
plotDataTemporalMap(probMaps$SMOKE, absolute = FALSE)
plotIGTProjection(igtProjs$SMOKE, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# PREGNANT
plotDataTemporalMap(probMaps$PREGNANT, absolute = FALSE)
plotIGTProjection(igtProjs$PREGNANT, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# SPEAK_INDIGENOUS_LANGUAGE
plotDataTemporalMap(probMaps$SPEAK_INDIGENOUS_LANGUAGE, absolute = FALSE)
plotIGTProjection(igtProjs$SPEAK_INDIGENOUS_LANGUAGE, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# PNEUMONIA
plotDataTemporalMap(probMaps$PNEUMONIA, absolute = FALSE)
plotIGTProjection(igtProjs$PNEUMONIA, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# DIABETES
plotDataTemporalMap(probMaps$DIABETES, absolute = FALSE)
plotIGTProjection(igtProjs$DIABETES, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# COPD
plotDataTemporalMap(probMaps$COPD, absolute = FALSE)
plotIGTProjection(igtProjs$COPD, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# ASTHMA
plotDataTemporalMap(probMaps$ASTHMA, absolute = FALSE)
plotIGTProjection(igtProjs$ASTHMA, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# INMUSUPR
plotDataTemporalMap(probMaps$INMUSUPR, absolute = FALSE)
plotIGTProjection(igtProjs$INMUSUPR, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# HYPERTENSION
plotDataTemporalMap(probMaps$HYPERTENSION, absolute = FALSE)
plotIGTProjection(igtProjs$HYPERTENSION, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# OTHER_DISEASE
plotDataTemporalMap(probMaps$OTHER_DISEASE, absolute = FALSE)
plotIGTProjection(igtProjs$OTHER_DISEASE, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# CARDIOVASCULAR
plotDataTemporalMap(probMaps$CARDIOVASCULAR, absolute = FALSE)
plotIGTProjection(igtProjs$CARDIOVASCULAR, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# CHRONIC_KIDNEY_DISEASE
plotDataTemporalMap(probMaps$CHRONIC_KIDNEY_DISEASE, absolute = FALSE)
plotIGTProjection(igtProjs$CHRONIC_KIDNEY_DISEASE, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# ICU
plotDataTemporalMap(probMaps$ICU, absolute = FALSE)
plotIGTProjection(igtProjs$ICU, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# INTUBATED
plotDataTemporalMap(probMaps$INTUBATED, absolute = FALSE)
plotIGTProjection(igtProjs$INTUBATED, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# OTHER_CASE_CONTACT
plotDataTemporalMap(probMaps$OTHER_CASE_CONTACT, absolute = FALSE)
plotIGTProjection(igtProjs$OTHER_CASE_CONTACT, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# OUTCOME
plotDataTemporalMap(probMaps$OUTCOME, absolute = FALSE)
plotIGTProjection(igtProjs$OUTCOME, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# SYMPTOMS_DATE
plotDataTemporalMap(probMaps$SYMPTOMS_DATE, absolute = FALSE)
plotIGTProjection(igtProjs$SYMPTOMS_DATE, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# DEATH_DATE
plotDataTemporalMap(probMaps$DEATH_DATE, absolute = FALSE)
plotIGTProjection(igtProjs$DEATH_DATE, dimensions = 2, colorPalette = "Magma",
                  trajectory = TRUE)

# Survival_days
plotDataTemporalMap(probMaps$Survival_days, absolute = FALSE)
plotIGTProjection(igtProjs$Survival_days, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# FromSymptomToHospital_days
plotDataTemporalMap(probMaps$FromSymptomToHospital_days, absolute = FALSE)
plotIGTProjection(igtProjs$FromSymptomToHospital_days, dimensions = 2,
                  colorPalette = "Magma", trajectory = TRUE)

# You can fully explore these results in our EHRtemporalVariability Shiny App
# by loading the .RData file.
# Write the temporal maps and IGT projections to a single .RData file.
save(
  list = c("probMaps", "igtProjs"),
  file = "variability_MexCov_Positive_weekly.RData"
)