1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449 | library(geneXplainR)
# Please note that for successful login, username and password have to
# belong to a valid account on that server.
server <- "https://platform.genexplain.com"
user <- "someuser@email.io"
password <- "12345"
gx.login(server, user, password)
apiProjectName <- "api2022_tutorial"
apiProjectPath <- paste0("data/Projects/", apiProjectName)
# Create a new platform project
# This step should be conducted only once for a new project.
# The gx.createProject function returns an error if the project
# already exists.
gx.createProject(apiProjectName, description = "API 2022 tutorial project")
folderPath <- paste0(apiProjectPath, "/Data/chipseq_analysis_workflow")
# Within a project, folders for data elements need to be created within the
# "Data" folder or its subfolders.
gx.createFolder("data/Projects/api2022_tutorial/Data", "chipseq_analysis_workflow")
#
# Step 1. Import and mapping of TAL1-bound genomic regions
#
# Import the BED file with TAL1-bound regions into the destination folder
gx.import("../data/GSM614003_jurkat.tal1.bed", folderPath, "BED format (*.bed)", list(dbSelector = "Ensembl 52.36n Human (hg18)"))
# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)
gx.analysis.parameters("liftOver1")
#
# Output
# description
# input
# mapping
# minMatch Recommended values: same species = 0.95, different species = 0.10
# multiple|choice Recommended values: same species = No, different species = Yes
# out_file1
# out_file2
#
bedPath <- paste0(folderPath, "/GSM614003_jurkat.tal1")
mappedPath <- paste0(folderPath, "/jurkat_chipseq_hg38")
unmappedPath <- paste0(folderPath, "/jurkat_chipseq_hg38_unmapped")
mapping <- "hg18->hg38"
# Run Liftover to map hg18 coordinates to hg38
gx.analysis("liftOver1", list(input = bedPath,
mapping = mapping,
minMatch = 0.95,
out_file1 = mappedPath,
out_file2 = unmappedPath),
TRUE, TRUE)
#
# Step 2. Mapping TAL1-bound genomic regions to nearby genes
#
species <- "Human (Homo sapiens)"
mappedGenePath <- paste0(mappedPath, " Genes")
# Use gx.trackToGeneSet function to map genomic coordinates to genes
gx.trackToGeneSet(mappedPath, species, -5000, 2000, destPath = mappedGenePath)
#
# Step 3. Functional enrichment analysis of genes near TAL1-bound regions
#
# Enrichment of genes associated with Gene Ontology terms using
# "Functional classification"
funclassResultPath <- paste0(mappedGenePath," GO")
gx.analysis("Functional classification", list(sourcePath = mappedGenePath,
species = species,
bioHub = "Full gene ontology classification",
minHits = 1,
pvalueThreshold = 1,
outputTable = funclassResultPath),
TRUE, TRUE)
# Export analysis result to local file
# The default exporter is for data tables, so that it is sufficient
# to specify the data element path and the local output file
gx.export(funclassResultPath, target.file="functional_classification_result_GO.tsv")
# Enrichment of genes associated with Reactome pathways
funclassResultPath <- paste0(mappedGenePath," Reactome")
gx.analysis("Functional classification", list(sourcePath = mappedGenePath,
species = species,
bioHub = "Reactome pathways (74)",
minHits = 1,
pvalueThreshold = 1,
outputTable = funclassResultPath),
TRUE, TRUE)
# Export analysis result to local file
gx.export(funclassResultPath, target.file="functional_classification_result_Reactome.tsv")
# Enrichment of genes associated with human diseases based on
# HumanPSD disease biomarker annotation
funclassResultPath <- paste0(mappedGenePath," Human disease biomarkers")
gx.analysis("Functional classification", list(sourcePath = mappedGenePath,
species = species,
bioHub = "HumanPSD(TM) disease (2022.1)",
minHits = 1,
pvalueThreshold = 1,
outputTable = funclassResultPath),
TRUE, TRUE)
# Export analysis result to local file
gx.export(funclassResultPath, target.file="functional_classification_result_HumanPSD.tsv")
#
# Step 4. Sampling genomic regions not bound by TAL1
#
mealrBackgroundTrack <- paste0(mappedPath, " random 1000")
# Create random track not overlapping with TAL1-bound regions
gx.analysis("Create random track", list(inputTrackPath = mappedPath,
dbSelector = "Ensembl 104.38 Human (hg38)",
species = species,
standardChromosomes = TRUE,
seqNumber = 1000,
seqLength = 0,
from = 0,
to = 0,
withOverlap = FALSE,
randomShift = FALSE,
outputTrackPath = mealrBackgroundTrack,
randSeed = 123),
TRUE, TRUE)
#
# Step 5. Import and mapping of TAL1 binding site subset
#
#
# Data upload and lifting as in Step 1 for 1000 TAL1 sites sampled
# from the original BED file
gx.import("../data/GSM614003_jurkat.tal1_1000.bed", folderPath, "BED format (*.bed)", list(dbSelector = "Ensembl 52.36n Human (hg18)"))
# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)
bedPath <- paste0(folderPath, "/GSM614003_jurkat.tal1_1000")
mappedPath <- paste0(folderPath, "/jurkat_chipseq_hg38_1000")
unmappedPath <- paste0(folderPath, "/jurkat_chipseq_hg38_1000_unmapped")
# Map hg18 coordinates to hg38
gx.analysis("liftOver1", list(input = bedPath,
mapping = mapping,
minMatch = 0.95,
out_file1 = mappedPath,
out_file2 = unmappedPath),
TRUE, TRUE)
#
# Step 6. Selection of important PWM models using MEALR
#
gx.analysis.parameters("MEALR (tracks)")
#
# Output
# description
# yesSetPath Study track / Track with intervals of interest
# noSetPath Background track / Track of non-bound intervals
# dbSelector Select a deployed or custom sequence source
# profilePath Profile of weight matrices
# maxPosCoef Maximum number of positive coefficients (motifs of factors with positive binding effect)
# maxComplexity Maximal complexity paramater to explore
# complexityInc Step-size of incrementing the complexity parameter
# maxUnimproved Maximal number of iterations without improved accuracy
# scoresWithNoSet Include scores for No sequences in score table outputs
# output Output folder path
#
mealrOutputPath <- paste0(mappedPath, " MEALR")
transfacProfile <- "databases/TRANSFAC(R) 2022.1/Data/profiles/vertebrate_human_p0.05_non3d"
# Analyze target and background genomic regions using MEALR
gx.analysis("MEALR (tracks)", list(yesSetPath = mappedPath,
noSetPath = mealrBackgroundTrack,
dbSelector = "Ensembl 104.38 Human (hg38)",
profilePath = transfacProfile,
maxPosCoef = 150,
maxComplexity = 0.5,
complexityInc = 0.02,
maxUnimproved = 20,
scoresWithNoSet = FALSE,
output = mealrOutputPath),
TRUE, TRUE)
#
# Step 7. Extraction of binding transcription factors
#
gx.analysis.parameters("Select top rows")
#
# Output
# description
# inputTable Input table
# column Column
# types Type
# topPercent Top percent
# topCount Top max count
# topMinCount Top min count
# topTable Top table output
# middlePercent Middle percent
# middleCount Middle max count
# middleMinCount Middle min count
# middleTable Middle table output
# bottomPercent Bottom percent
# bottomCount Bottom max count
# bottomMinCount Bottom min count
# bottomTable Bottom table output
#
mealrMotifPath <- paste0(mealrOutputPath, "/MEALR_positive_coefficients")
mealrTopPath <- paste0(mealrMotifPath, " Top 50")
# Extract top 50 PWMs ranked by logistic regression coefficient
gx.analysis("Select top rows", list(inputTable = mealrMotifPath,
column = "Coefficient",
topPercent = 100.0,
topCount = 50,
topMinCount = 50,
topTable = mealrTopPath),
TRUE, TRUE)
gx.analysis.parameters("Matrices to molecules")
#
# Output
# description
# sitesCollection Select table with the results of "Site search on gene set". Such table contains site model ID in each row.
# siteModelsCollection Select the profile that was used for site search. In most of the cases, profile is selected automatically.
# species Select arabidopsis, nematoda, zebrafish, fruit fly, human, mouse, rat, baker s yeast or fission yeast species
# targetType Select type of identifiers for the resulting table
# ignoreNaNInAggregator Ignore empty values during aggregator work
# aggregator Select one of the rules to treat values in the numerical columns of the table when several rows are merged into a single one.
# columnName Select the column with numerical values to apply one of the rules described above
# outputTable Path to store the resulting table in the tree
#
mealrTopGenePath <- paste0(mealrTopPath, " Genes")
# Convert PWMs to factor genes
gx.analysis("Matrices to molecules", list(sitesCollection = mealrTopPath,
siteModelsCollection = transfacProfile,
species = species,
targetType = "Genes: Ensembl",
outputTable = mealrTopGenePath),
TRUE, TRUE)
#
# Step 8. Intersection of potentially TAL1-regulated genes and MEALR TFs
#
gx.analysis.parameters("Venn diagrams")
#
# Output
# description
# table1Path Table which will be represented as left-top circle
# table1Name Name for the left table on the diagram (leave empty to use table name)
# circle1Color Color for the left-top circle on the diagram
# table2Path Table which will be represented as right-top circle
# table2Name Name for the right table on the diagram (leave empty to use table name)
# circle2Color Color for the right-top circle on the diagram
# table3Path Table which will be represented as center-bottom circle
# table3Name Name for the center table on the diagram (leave empty to use table name)
# circle3Color Color for the center-bottom circle on the diagram
# simple All circles has equal radius
# output Folder name to store the results
#
mappedNearbyGenePath <- paste0(folderPath, "/jurkat_chipseq_hg38 Genes")
mealrTopVennPath <- paste0(mealrTopPath, " Venn")
# Intersect factors identified by MEALR and genes with nearby TAL1 ChIP-seq
# sites
gx.analysis("Venn diagrams", list(table1Path = mappedNearbyGenePath,
table1Name = "Genes near TAL1 sites",
table2Path = mealrTopGenePath,
table2Name = "MEALR transcription factors",
simple = TRUE,
output = mealrTopVennPath),
TRUE, TRUE)
#
# Step 9. Prediction of binding sites of identified TFs in TAL1-bound genomic regions
#
grnFactorPath <- paste0(mealrTopVennPath, "/Rows present in both tables")
# Load potential GRN factors into R data frame
tfs <- gx.get(grnFactorPath)
# tfs
# Gene symbol jurkat_chipseq_hg38: Count
# ENSG00000081059 TCF7 1
# ENSG00000118513 MYB 1
# ENSG00000124813 RUNX2 4
# ENSG00000125952 MAX 2
# ENSG00000138795 LEF1 2
# ENSG00000157554 ERG 1
# ENSG00000159216 RUNX1 4
# ENSG00000179348 GATA2 1
# Site model ID Coefficient
# ENSG00000081059 V$TCF1_09 0.05875131
# ENSG00000118513 V$CMYB_01 0.02749087
# ENSG00000124813 V$AML_Q6,V$OSF2_Q6,V$RUNX2_03,V$RUNX2_04 0.08663064
# ENSG00000125952 V$MYCMAX_01 0.02998472
# ENSG00000138795 V$LEF1_10,V$LEF1_17 0.03285110
# ENSG00000157554 V$ERG_07 0.04488260
# ENSG00000159216 V$AML1_02,V$AML_Q6 0.08663064
# ENSG00000179348 V$GATA2_09 0.04911769
#
# Extract PWM ids
grnPwms <- unlist(strsplit(tfs[,3], ",", fixed = TRUE))
# Create PWM table for import
write.table(cbind(PWM = grnPwms, Num = 1:length(grnPwms)), file = "grn_pwms.tsv", quote = FALSE, sep = "\t", row.names = FALSE)
# Import PWM table
gx.importTable("grn_pwms.tsv", mealrOutputPath,
"MEALR_positive_coefficients Top 50 GRN PWMs", columnForID = "PWM",
tableType = "Matrices: TRANSFAC", species = species)
# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)
grnPwmPath <- paste0(mealrOutputPath, "/MEALR_positive_coefficients Top 50 GRN PWMs")
gx.analysis.parameters("Create profile from site model table")
#
# Output
# description
# table Table containing site models as row names
# profile Profile to copy the values from
# thresholdsColumn Column containing cutoff values (use 'none' to copy cutoffs from the profile)
# outputProfile Specify the path whether to store output profile
#
transfacProfile <- "databases/TRANSFAC(R) 2022.1/Data/profiles/vertebrate_human_p0.001_non3d"
grnPwmProfile <- paste0(grnPwmPath, " profile")
# Create Match(TM) profile for PWMs of potential GRN factors
gx.analysis("Create profile from site model table", list(table = grnPwmPath,
profile = transfacProfile,
outputProfile = grnPwmProfile),
TRUE, TRUE)
gx.analysis.parameters("TRANSFAC(R) Match(TM) for tracks")
#
# Output
# description
# sequencePath Sequence track to search
# dbSelector Select a deployed or custom sequence source
# profilePath Profile
# withoutDuplicates Ensure to report sites in overlapping genomic intervals only once
# ignoreCore Core scores are not calculated and not used for filtering.
# output Output track path
#
grnMatchPath <- paste0(grnPwmProfile, " Match")
# Predict binding sites of GRN factors in TAL1-bound regions
gx.analysis("TRANSFAC(R) Match(TM) for tracks", list(sequencePath = paste0(folderPath, "/jurkat_chipseq_hg38"),
dbSelector = "Ensembl 104.38 Human (hg38)",
profilePath = grnPwmProfile,
withoutDuplicates = TRUE,
ignoreCore = TRUE,
output = grnMatchPath),
TRUE, TRUE)
#
# Step 10. Prediction of binding sites of identified TFs around TAL1 transcription start site
#
# Create TAL1 gene table for import
write.table(cbind(ID = c("ENSG00000162367"), Symbol = c("TAL1")), file = "tal1.tsv", quote = FALSE, sep = "\t", row.names = FALSE)
# Import TAL1 gene
gx.importTable("tal1.tsv", mealrOutputPath,
"TAL1 gene", columnForID = "ID",
tableType = "Genes: Ensembl", species = species)
# After data imports it is recommendable to shortly pause in order to
# allow server processes to finish before using the data
Sys.sleep(1)
tal1GenePath <- paste0(mealrOutputPath, "/TAL1 gene")
gx.analysis.parameters("Gene set to track")
#
# Output
# description
# sourcePath Table of source genes
# species Taxonomical species
# from From position (relative to gene start)
# to To position (relative to gene start)
# overlapMergingMode How to handle TSS located close to each other
# destPath Output name
#
tal1TrackPath <- paste0(tal1GenePath, " promoter")
# Create track of genomic region around TAL1 TSS (promoter)
gx.analysis("Gene set to track", list(sourcePath = tal1GenePath,
species = species,
from = 2000,
to = 1000,
destPath = tal1TrackPath),
TRUE, TRUE)
tal1MatchPath <- paste0(grnPwmProfile, " TAL1 Match")
# Predict binding sites of GRN factors in TAL1 promoter
gx.analysis("TRANSFAC(R) Match(TM) for tracks", list(sequencePath = tal1TrackPath,
dbSelector = "Ensembl 104.38 Human (hg38)",
profilePath = grnPwmProfile,
withoutDuplicates = TRUE,
ignoreCore = TRUE,
output = tal1MatchPath),
TRUE, TRUE)
# Export genomic locations of predicted sites for GRN factors around TAL1 TSS
gx.export(tal1MatchPath, exporter = "BED format (*.bed)", target.file="TAL1_grn_pwm_sites.bed")
gx.logout()
|