Skip to content

ChIP-seq data analysis using geneXplainR

The R script below implements the tutorial workflow. The described steps are indicated in code comments for reference.

The R code is provided with the tutorial material as genexplain_tutorial_chipseq_analysis.R. Please note that some parts require editing before running the script, including username, password, project name.

R code for the tutorial

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
library(geneXplainR)

# Please note that for successful login, username and password have to
# belong to a valid account on that server.
server   <- "https://platform.genexplain.com"
user     <- "someuser@email.io"
password <- "12345"

gx.login(server, user, password)

apiProjectName <- "api2022_tutorial"
apiProjectPath <- paste0("data/Projects/", apiProjectName)

# Create a new platform project
# This step should be conducted only once for a new project.
# The gx.createProject function returns an error if the project
# already exists.
gx.createProject(apiProjectName, description = "API 2022 tutorial project")

folderPath <- paste0(apiProjectPath, "/Data/chipseq_analysis_workflow")

# Within a project, folders for data elements need to be created within the
# "Data" folder or its subfolders.
gx.createFolder("data/Projects/api2022_tutorial/Data", "chipseq_analysis_workflow")


#
# Step 1. Import and mapping of TAL1-bound genomic regions
#

# Import the BED file with TAL1-bound regions into the destination folder
gx.import("../data/GSM614003_jurkat.tal1.bed", folderPath, "BED format (*.bed)", list(dbSelector = "Ensembl 52.36n Human (hg18)"))

# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)

gx.analysis.parameters("liftOver1")
#
# Output
#                                                                       description
# input                                                                            
# mapping                                                                          
# minMatch        Recommended values: same species = 0.95, different species = 0.10
# multiple|choice    Recommended values: same species = No, different species = Yes
# out_file1                                                                        
# out_file2
#

bedPath      <- paste0(folderPath, "/GSM614003_jurkat.tal1")
mappedPath   <- paste0(folderPath, "/jurkat_chipseq_hg38")
unmappedPath <- paste0(folderPath, "/jurkat_chipseq_hg38_unmapped")
mapping      <- "hg18->hg38"

# Run Liftover to map hg18 coordinates to hg38
gx.analysis("liftOver1", list(input = bedPath, 
                              mapping = mapping, 
                              minMatch = 0.95, 
                              out_file1 = mappedPath, 
                              out_file2 = unmappedPath),
            TRUE, TRUE)


#
# Step 2. Mapping TAL1-bound genomic regions to nearby genes
#

species <- "Human (Homo sapiens)"
mappedGenePath <- paste0(mappedPath, " Genes")

# Use gx.trackToGeneSet function to map genomic coordinates to genes
gx.trackToGeneSet(mappedPath, species, -5000, 2000, destPath = mappedGenePath)


#
# Step 3. Functional enrichment analysis of genes near TAL1-bound regions
#

# Enrichment of genes associated with Gene Ontology terms using
# "Functional classification"
funclassResultPath <- paste0(mappedGenePath," GO")
gx.analysis("Functional classification", list(sourcePath = mappedGenePath,
                                              species    = species,
                                              bioHub     = "Full gene ontology classification",
                                              minHits    = 1,
                                              pvalueThreshold = 1,
                                              outputTable = funclassResultPath),
           TRUE, TRUE)

# Export analysis result to local file
# The default exporter is for data tables, so that it is sufficient
# to specify the data element path and the local output file
gx.export(funclassResultPath, target.file="functional_classification_result_GO.tsv")

# Enrichment of genes associated with Reactome pathways
funclassResultPath <- paste0(mappedGenePath," Reactome")
gx.analysis("Functional classification", list(sourcePath = mappedGenePath,
                                              species    = species,
                                              bioHub     = "Reactome pathways (74)",
                                              minHits    = 1,
                                              pvalueThreshold = 1,
                                              outputTable = funclassResultPath),
            TRUE, TRUE)

# Export analysis result to local file
gx.export(funclassResultPath, target.file="functional_classification_result_Reactome.tsv")


# Enrichment of genes associated with human diseases based on
# HumanPSD disease biomarker annotation
funclassResultPath <- paste0(mappedGenePath," Human disease biomarkers")
gx.analysis("Functional classification", list(sourcePath = mappedGenePath,
                                              species    = species,
                                              bioHub     = "HumanPSD(TM) disease (2022.1)",
                                              minHits    = 1,
                                              pvalueThreshold = 1,
                                              outputTable = funclassResultPath),
            TRUE, TRUE)

# Export analysis result to local file
gx.export(funclassResultPath, target.file="functional_classification_result_HumanPSD.tsv")


#
# Step 4. Sampling genomic regions not bound by TAL1
#

mealrBackgroundTrack <- paste0(mappedPath, " random 1000")

# Create random track not overlapping with TAL1-bound regions
gx.analysis("Create random track", list(inputTrackPath      = mappedPath,
                                        dbSelector          = "Ensembl 104.38 Human (hg38)",
                                        species             = species,
                                        standardChromosomes = TRUE,
                                        seqNumber           = 1000,
                                        seqLength           = 0,
                                        from                = 0,
                                        to                  = 0,
                                        withOverlap         = FALSE,
                                        randomShift         = FALSE,
                                        outputTrackPath     = mealrBackgroundTrack,
                                        randSeed            = 123),
            TRUE, TRUE)


#
# Step 5. Import and mapping of TAL1 binding site subset
#

#
# Data upload and lifting as in Step 1 for 1000 TAL1 sites sampled
# from the original BED file
gx.import("../data/GSM614003_jurkat.tal1_1000.bed", folderPath, "BED format (*.bed)", list(dbSelector = "Ensembl 52.36n Human (hg18)"))

# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)

bedPath      <- paste0(folderPath, "/GSM614003_jurkat.tal1_1000")
mappedPath   <- paste0(folderPath, "/jurkat_chipseq_hg38_1000")
unmappedPath <- paste0(folderPath, "/jurkat_chipseq_hg38_1000_unmapped")

# Map hg18 coordinates to hg38
gx.analysis("liftOver1", list(input = bedPath, 
                              mapping = mapping, 
                              minMatch = 0.95, 
                              out_file1 = mappedPath, 
                              out_file2 = unmappedPath),
            TRUE, TRUE)


#
# Step 6. Selection of important PWM models using MEALR
#

gx.analysis.parameters("MEALR (tracks)")
#
# Output
#                                                                                              description
# yesSetPath                                                Study track / Track with intervals of interest
# noSetPath                                                Background track / Track of non-bound intervals
# dbSelector                                                   Select a deployed or custom sequence source
# profilePath                                                                   Profile of weight matrices
# maxPosCoef      Maximum number of positive coefficients (motifs of factors with positive binding effect)
# maxComplexity                                                    Maximal complexity paramater to explore
# complexityInc                                         Step-size of incrementing the complexity parameter
# maxUnimproved                                     Maximal number of iterations without improved accuracy
# scoresWithNoSet                                   Include scores for No sequences in score table outputs
# output                                                                                Output folder path
#

mealrOutputPath <- paste0(mappedPath, " MEALR")
transfacProfile <- "databases/TRANSFAC(R) 2022.1/Data/profiles/vertebrate_human_p0.05_non3d"

# Analyze target and background genomic regions using MEALR
gx.analysis("MEALR (tracks)", list(yesSetPath      = mappedPath,
                                   noSetPath       = mealrBackgroundTrack,
                                   dbSelector      = "Ensembl 104.38 Human (hg38)",
                                   profilePath     = transfacProfile,
                                   maxPosCoef      = 150,
                                   maxComplexity   = 0.5,
                                   complexityInc   = 0.02,
                                   maxUnimproved   = 20,
                                   scoresWithNoSet = FALSE,
                                   output          = mealrOutputPath),
            TRUE, TRUE)


#
# Step 7. Extraction of binding transcription factors
#

gx.analysis.parameters("Select top rows")
#
# Output
#                        description
# inputTable             Input table
# column                      Column
# types                         Type
# topPercent             Top percent
# topCount             Top max count
# topMinCount          Top min count
# topTable          Top table output
# middlePercent       Middle percent
# middleCount       Middle max count
# middleMinCount    Middle min count
# middleTable    Middle table output
# bottomPercent       Bottom percent
# bottomCount       Bottom max count
# bottomMinCount    Bottom min count
# bottomTable    Bottom table output
#

mealrMotifPath <- paste0(mealrOutputPath, "/MEALR_positive_coefficients")
mealrTopPath <- paste0(mealrMotifPath, " Top 50")

# Extract top 50 PWMs ranked by logistic regression coefficient
gx.analysis("Select top rows", list(inputTable  = mealrMotifPath,
                                    column      = "Coefficient",
                                    topPercent  = 100.0,
                                    topCount    = 50,
                                    topMinCount = 50,
                                    topTable    = mealrTopPath),
            TRUE, TRUE)

gx.analysis.parameters("Matrices to molecules")
#
# Output
#                                                                                                                                         description
# sitesCollection                          Select table with the results of "Site search on gene set". Such table contains site model ID in each row.
# siteModelsCollection                     Select the profile that was used for site search. In most of the cases, profile is selected automatically.
# species                               Select arabidopsis, nematoda, zebrafish, fruit fly, human, mouse, rat, baker s yeast or fission yeast species
# targetType                                                                                       Select type of identifiers for the resulting table
# ignoreNaNInAggregator                                                                                    Ignore empty values during aggregator work
# aggregator            Select one of the rules to treat values in the numerical columns of the table when several rows are merged into a single one.
# columnName                                                        Select the column with numerical values to apply one of the rules described above
# outputTable                                                                                           Path to store the resulting table in the tree
#

mealrTopGenePath <- paste0(mealrTopPath, " Genes")

# Convert PWMs to factor genes
gx.analysis("Matrices to molecules", list(sitesCollection       = mealrTopPath,
                                          siteModelsCollection  = transfacProfile,
                                          species               = species,
                                          targetType            = "Genes: Ensembl",
                                          outputTable           = mealrTopGenePath),
            TRUE, TRUE)


#
# Step 8. Intersection of potentially TAL1-regulated genes and MEALR TFs
#

gx.analysis.parameters("Venn diagrams")
#
# Output
#                                                                           description
# table1Path                         Table which will be represented as left-top circle
# table1Name     Name for the left table on the diagram (leave empty to use table name)
# circle1Color                             Color for the left-top circle on the diagram
# table2Path                        Table which will be represented as right-top circle
# table2Name    Name for the right table on the diagram (leave empty to use table name)
# circle2Color                            Color for the right-top circle on the diagram
# table3Path                    Table which will be represented as center-bottom circle
# table3Name   Name for the center table on the diagram (leave empty to use table name)
# circle3Color                        Color for the center-bottom circle on the diagram
# simple                                                   All circles has equal radius
# output                                               Folder name to store the results
#

mappedNearbyGenePath   <- paste0(folderPath, "/jurkat_chipseq_hg38 Genes")
mealrTopVennPath <- paste0(mealrTopPath, " Venn")

# Intersect factors identified by MEALR and genes with nearby TAL1 ChIP-seq
# sites
gx.analysis("Venn diagrams", list(table1Path   = mappedNearbyGenePath,
                                  table1Name   = "Genes near TAL1 sites",
                                  table2Path   = mealrTopGenePath,
                                  table2Name   = "MEALR transcription factors",
                                  simple       = TRUE,
                                  output       = mealrTopVennPath),
            TRUE, TRUE)


#
# Step 9. Prediction of binding sites of identified TFs in TAL1-bound genomic regions
#

grnFactorPath <- paste0(mealrTopVennPath, "/Rows present in both tables")

# Load potential GRN factors into R data frame
tfs <- gx.get(grnFactorPath)
# tfs
#                 Gene symbol jurkat_chipseq_hg38: Count
# ENSG00000081059        TCF7                          1
# ENSG00000118513         MYB                          1
# ENSG00000124813       RUNX2                          4
# ENSG00000125952         MAX                          2
# ENSG00000138795        LEF1                          2
# ENSG00000157554         ERG                          1
# ENSG00000159216       RUNX1                          4
# ENSG00000179348       GATA2                          1
#                                            Site model ID Coefficient
# ENSG00000081059                                V$TCF1_09  0.05875131
# ENSG00000118513                                V$CMYB_01  0.02749087
# ENSG00000124813 V$AML_Q6,V$OSF2_Q6,V$RUNX2_03,V$RUNX2_04  0.08663064
# ENSG00000125952                              V$MYCMAX_01  0.02998472
# ENSG00000138795                      V$LEF1_10,V$LEF1_17  0.03285110
# ENSG00000157554                                 V$ERG_07  0.04488260
# ENSG00000159216                       V$AML1_02,V$AML_Q6  0.08663064
# ENSG00000179348                               V$GATA2_09  0.04911769
#

# Extract PWM ids
grnPwms <- unlist(strsplit(tfs[,3], ",", fixed = TRUE))

# Create PWM table for import
write.table(cbind(PWM = grnPwms, Num = 1:length(grnPwms)), file = "grn_pwms.tsv", quote = FALSE, sep = "\t", row.names = FALSE)

# Import PWM table
gx.importTable("grn_pwms.tsv", mealrOutputPath, 
               "MEALR_positive_coefficients Top 50 GRN PWMs", columnForID = "PWM", 
               tableType = "Matrices: TRANSFAC", species = species)

# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)

grnPwmPath <- paste0(mealrOutputPath, "/MEALR_positive_coefficients Top 50 GRN PWMs")

gx.analysis.parameters("Create profile from site model table")
#
# Output
#                                                                                    description
# table                                                Table containing site models as row names
# profile                                                        Profile to copy the values from
# thresholdsColumn Column containing cutoff values (use 'none' to copy cutoffs from the profile)
# outputProfile                                 Specify the path whether to store output profile
#

transfacProfile <- "databases/TRANSFAC(R) 2022.1/Data/profiles/vertebrate_human_p0.001_non3d"
grnPwmProfile   <- paste0(grnPwmPath, " profile")

# Create Match(TM) profile for PWMs of potential GRN factors
gx.analysis("Create profile from site model table", list(table   = grnPwmPath,
                                                         profile = transfacProfile,
                                                         outputProfile = grnPwmProfile),
            TRUE, TRUE)

gx.analysis.parameters("TRANSFAC(R) Match(TM) for tracks")
#
# Output
#                                                                         description
# sequencePath                                               Sequence track to search
# dbSelector                              Select a deployed or custom sequence source
# profilePath                                                                 Profile
# withoutDuplicates Ensure to report sites in overlapping genomic intervals only once
# ignoreCore               Core scores are not calculated and not used for filtering.
# output                                                            Output track path
#

grnMatchPath <- paste0(grnPwmProfile, " Match")

# Predict binding sites of GRN factors in TAL1-bound regions
gx.analysis("TRANSFAC(R) Match(TM) for tracks", list(sequencePath = paste0(folderPath, "/jurkat_chipseq_hg38"),
                                                     dbSelector   = "Ensembl 104.38 Human (hg38)",
                                                     profilePath  = grnPwmProfile,
                                                     withoutDuplicates = TRUE,
                                                     ignoreCore = TRUE,
                                                     output = grnMatchPath),
            TRUE, TRUE)


#
# Step 10. Prediction of binding sites of identified TFs around TAL1 transcription start site
#

# Create TAL1 gene table for import
write.table(cbind(ID = c("ENSG00000162367"), Symbol = c("TAL1")), file = "tal1.tsv", quote = FALSE, sep = "\t", row.names = FALSE)

# Import TAL1 gene
gx.importTable("tal1.tsv", mealrOutputPath,
               "TAL1 gene", columnForID = "ID", 
               tableType = "Genes: Ensembl", species = species)

# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)

tal1GenePath <- paste0(mealrOutputPath, "/TAL1 gene")

gx.analysis.parameters("Gene set to track")
#
# Output
#                                                      description
# sourcePath                                 Table of source genes
# species                                      Taxonomical species
# from                      From position (relative to gene start)
# to                          To position (relative to gene start)
# overlapMergingMode How to handle TSS located close to each other
# destPath                                             Output name
#

tal1TrackPath <- paste0(tal1GenePath, " promoter")

# Create track of genomic region around TAL1 TSS (promoter)
gx.analysis("Gene set to track", list(sourcePath = tal1GenePath,
                                      species    = species,
                                      from       = 2000,
                                      to         = 1000,
                                      destPath   = tal1TrackPath),
            TRUE, TRUE)

tal1MatchPath <- paste0(grnPwmProfile, " TAL1 Match")

# Predict binding sites of GRN factors in TAL1 promoter
gx.analysis("TRANSFAC(R) Match(TM) for tracks", list(sequencePath = tal1TrackPath,
                                                     dbSelector   = "Ensembl 104.38 Human (hg38)",
                                                     profilePath  = grnPwmProfile,
                                                     withoutDuplicates = TRUE,
                                                     ignoreCore = TRUE,
                                                     output = tal1MatchPath),
            TRUE, TRUE)

# Export genomic locations of predicted sites for GRN factors around TAL1 TSS
gx.export(tal1MatchPath, exporter = "BED format (*.bed)", target.file="TAL1_grn_pwm_sites.bed")

gx.logout()