The R code is provided with the tutorial material as genexplain_tutorial_chipseq_analysis.R.
Please note that some parts require editing before running the script, including username, password,
project name.
library(geneXplainR)# Please note that for successful login, username and password have to# belong to a valid account on that server.server<-"https://platform.genexplain.com"user<-"someuser@email.io"password<-"12345"gx.login(server,user,password)apiProjectName<-"api2022_tutorial"apiProjectPath<-paste0("data/Projects/",apiProjectName)# Create a new platform project# This step should be conducted only once for a new project.# The gx.createProject function returns an error if the project# already exists.gx.createProject(apiProjectName,description="API 2022 tutorial project")folderPath<-paste0(apiProjectPath,"/Data/chipseq_analysis_workflow")# Within a project, folders for data elements need to be created within the# "Data" folder or its subfolders.gx.createFolder("data/Projects/api2022_tutorial/Data","chipseq_analysis_workflow")## Step 1. Import and mapping of TAL1-bound genomic regions## Import the BED file with TAL1-bound regions into the destination foldergx.import("../data/GSM614003_jurkat.tal1.bed",folderPath,"BED format (*.bed)",list(dbSelector="Ensembl 52.36n Human (hg18)"))# After data imports it is recommendable to shortly pause in order to# allow server processes to finish before using the dataSys.sleep(1)gx.analysis.parameters("liftOver1")## Output# description# input # mapping # minMatch Recommended values: same species = 0.95, different species = 0.10# multiple|choice Recommended values: same species = No, different species = Yes# out_file1 # out_file2#bedPath<-paste0(folderPath,"/GSM614003_jurkat.tal1")mappedPath<-paste0(folderPath,"/jurkat_chipseq_hg38")unmappedPath<-paste0(folderPath,"/jurkat_chipseq_hg38_unmapped")mapping<-"hg18->hg38"# Run Liftover to map hg18 coordinates to hg38gx.analysis("liftOver1",list(input=bedPath,mapping=mapping,minMatch=0.95,out_file1=mappedPath,out_file2=unmappedPath),TRUE,TRUE)## Step 2. Mapping TAL1-bound genomic regions to nearby genes#species<-"Human (Homo sapiens)"mappedGenePath<-paste0(mappedPath," Genes")# Use gx.trackToGeneSet function to map genomic coordinates to genesgx.trackToGeneSet(mappedPath,species,-5000,2000,destPath=mappedGenePath)## Step 3. Functional enrichment analysis of genes near TAL1-bound regions## Enrichment of genes associated with Gene Ontology terms using# "Functional classification"funclassResultPath<-paste0(mappedGenePath," GO")gx.analysis("Functional classification",list(sourcePath=mappedGenePath,species=species,bioHub="Full gene ontology classification",minHits=1,pvalueThreshold=1,outputTable=funclassResultPath),TRUE,TRUE)# Export analysis result to local file# The default exporter is for data tables, so that it is sufficient# to specify the data element path and the local output filegx.export(funclassResultPath,target.file="functional_classification_result_GO.tsv")# Enrichment of genes associated with Reactome pathwaysfunclassResultPath<-paste0(mappedGenePath," Reactome")gx.analysis("Functional classification",list(sourcePath=mappedGenePath,species=species,bioHub="Reactome pathways (74)",minHits=1,pvalueThreshold=1,outputTable=funclassResultPath),TRUE,TRUE)# Export analysis result to local filegx.export(funclassResultPath,target.file="functional_classification_result_Reactome.tsv")# Enrichment of genes associated with human diseases based on# HumanPSD disease biomarker annotationfunclassResultPath<-paste0(mappedGenePath," Human disease biomarkers")gx.analysis("Functional classification",list(sourcePath=mappedGenePath,species=species,bioHub="HumanPSD(TM) disease (2022.1)",minHits=1,pvalueThreshold=1,outputTable=funclassResultPath),TRUE,TRUE)# Export analysis result to local filegx.export(funclassResultPath,target.file="functional_classification_result_HumanPSD.tsv")## Step 4. Sampling genomic regions not bound by TAL1#mealrBackgroundTrack<-paste0(mappedPath," random 1000")# Create random track not overlapping with TAL1-bound regionsgx.analysis("Create random track",list(inputTrackPath=mappedPath,dbSelector="Ensembl 104.38 Human (hg38)",species=species,standardChromosomes=TRUE,seqNumber=1000,seqLength=0,from=0,to=0,withOverlap=FALSE,randomShift=FALSE,outputTrackPath=mealrBackgroundTrack,randSeed=123),TRUE,TRUE)## Step 5. Import and mapping of TAL1 binding site subset### Data upload and lifting as in Step 1 for 1000 TAL1 sites sampled# from the original BED filegx.import("../data/GSM614003_jurkat.tal1_1000.bed",folderPath,"BED format (*.bed)",list(dbSelector="Ensembl 52.36n Human (hg18)"))# After data imports it is recommendable to shortly pause in order to# allow server processes to finish before using the dataSys.sleep(1)bedPath<-paste0(folderPath,"/GSM614003_jurkat.tal1_1000")mappedPath<-paste0(folderPath,"/jurkat_chipseq_hg38_1000")unmappedPath<-paste0(folderPath,"/jurkat_chipseq_hg38_1000_unmapped")# Map hg18 coordinates to hg38gx.analysis("liftOver1",list(input=bedPath,mapping=mapping,minMatch=0.95,out_file1=mappedPath,out_file2=unmappedPath),TRUE,TRUE)## Step 6. Selection of important PWM models using MEALR#gx.analysis.parameters("MEALR (tracks)")## Output# description# yesSetPath Study track / Track with intervals of interest# noSetPath Background track / Track of non-bound intervals# dbSelector Select a deployed or custom sequence source# profilePath Profile of weight matrices# maxPosCoef Maximum number of positive coefficients (motifs of factors with positive binding effect)# maxComplexity Maximal complexity paramater to explore# complexityInc Step-size of incrementing the complexity parameter# maxUnimproved Maximal number of iterations without improved accuracy# scoresWithNoSet Include scores for No sequences in score table outputs# output Output folder path#mealrOutputPath<-paste0(mappedPath," MEALR")transfacProfile<-"databases/TRANSFAC(R) 2022.1/Data/profiles/vertebrate_human_p0.05_non3d"# Analyze target and background genomic regions using MEALRgx.analysis("MEALR (tracks)",list(yesSetPath=mappedPath,noSetPath=mealrBackgroundTrack,dbSelector="Ensembl 104.38 Human (hg38)",profilePath=transfacProfile,maxPosCoef=150,maxComplexity=0.5,complexityInc=0.02,maxUnimproved=20,scoresWithNoSet=FALSE,output=mealrOutputPath),TRUE,TRUE)## Step 7. Extraction of binding transcription factors#gx.analysis.parameters("Select top rows")## Output# description# inputTable Input table# column Column# types Type# topPercent Top percent# topCount Top max count# topMinCount Top min count# topTable Top table output# middlePercent Middle percent# middleCount Middle max count# middleMinCount Middle min count# middleTable Middle table output# bottomPercent Bottom percent# bottomCount Bottom max count# bottomMinCount Bottom min count# bottomTable Bottom table output#mealrMotifPath<-paste0(mealrOutputPath,"/MEALR_positive_coefficients")mealrTopPath<-paste0(mealrMotifPath," Top 50")# Extract top 50 PWMs ranked by logistic regression coefficientgx.analysis("Select top rows",list(inputTable=mealrMotifPath,column="Coefficient",topPercent=100.0,topCount=50,topMinCount=50,topTable=mealrTopPath),TRUE,TRUE)gx.analysis.parameters("Matrices to molecules")## Output# description# sitesCollection Select table with the results of "Site search on gene set". Such table contains site model ID in each row.# siteModelsCollection Select the profile that was used for site search. In most of the cases, profile is selected automatically.# species Select arabidopsis, nematoda, zebrafish, fruit fly, human, mouse, rat, baker s yeast or fission yeast species# targetType Select type of identifiers for the resulting table# ignoreNaNInAggregator Ignore empty values during aggregator work# aggregator Select one of the rules to treat values in the numerical columns of the table when several rows are merged into a single one.# columnName Select the column with numerical values to apply one of the rules described above# outputTable Path to store the resulting table in the tree#mealrTopGenePath<-paste0(mealrTopPath," Genes")# Convert PWMs to factor genesgx.analysis("Matrices to molecules",list(sitesCollection=mealrTopPath,siteModelsCollection=transfacProfile,species=species,targetType="Genes: Ensembl",outputTable=mealrTopGenePath),TRUE,TRUE)## Step 8. Intersection of potentially TAL1-regulated genes and MEALR TFs#gx.analysis.parameters("Venn diagrams")## Output# description# table1Path Table which will be represented as left-top circle# table1Name Name for the left table on the diagram (leave empty to use table name)# circle1Color Color for the left-top circle on the diagram# table2Path Table which will be represented as right-top circle# table2Name Name for the right table on the diagram (leave empty to use table name)# circle2Color Color for the right-top circle on the diagram# table3Path Table which will be represented as center-bottom circle# table3Name Name for the center table on the diagram (leave empty to use table name)# circle3Color Color for the center-bottom circle on the diagram# simple All circles has equal radius# output Folder name to store the results#mappedNearbyGenePath<-paste0(folderPath,"/jurkat_chipseq_hg38 Genes")mealrTopVennPath<-paste0(mealrTopPath," Venn")# Intersect factors identified by MEALR and genes with nearby TAL1 ChIP-seq# sitesgx.analysis("Venn diagrams",list(table1Path=mappedNearbyGenePath,table1Name="Genes near TAL1 sites",table2Path=mealrTopGenePath,table2Name="MEALR transcription factors",simple=TRUE,output=mealrTopVennPath),TRUE,TRUE)## Step 9. Prediction of binding sites of identified TFs in TAL1-bound genomic regions#grnFactorPath<-paste0(mealrTopVennPath,"/Rows present in both tables")# Load potential GRN factors into R data frametfs<-gx.get(grnFactorPath)# tfs# Gene symbol jurkat_chipseq_hg38: Count# ENSG00000081059 TCF7 1# ENSG00000118513 MYB 1# ENSG00000124813 RUNX2 4# ENSG00000125952 MAX 2# ENSG00000138795 LEF1 2# ENSG00000157554 ERG 1# ENSG00000159216 RUNX1 4# ENSG00000179348 GATA2 1# Site model ID Coefficient# ENSG00000081059 V$TCF1_09 0.05875131# ENSG00000118513 V$CMYB_01 0.02749087# ENSG00000124813 V$AML_Q6,V$OSF2_Q6,V$RUNX2_03,V$RUNX2_04 0.08663064# ENSG00000125952 V$MYCMAX_01 0.02998472# ENSG00000138795 V$LEF1_10,V$LEF1_17 0.03285110# ENSG00000157554 V$ERG_07 0.04488260# ENSG00000159216 V$AML1_02,V$AML_Q6 0.08663064# ENSG00000179348 V$GATA2_09 0.04911769## Extract PWM idsgrnPwms<-unlist(strsplit(tfs[,3],",",fixed=TRUE))# Create PWM table for importwrite.table(cbind(PWM=grnPwms,Num=1:length(grnPwms)),file="grn_pwms.tsv",quote=FALSE,sep="\t",row.names=FALSE)# Import PWM tablegx.importTable("grn_pwms.tsv",mealrOutputPath,"MEALR_positive_coefficients Top 50 GRN PWMs",columnForID="PWM",tableType="Matrices: TRANSFAC",species=species)# After data imports it is recommendable to shortly pause in order to# allow server processes to finish before using the dataSys.sleep(1)grnPwmPath<-paste0(mealrOutputPath,"/MEALR_positive_coefficients Top 50 GRN PWMs")gx.analysis.parameters("Create profile from site model table")## Output# description# table Table containing site models as row names# profile Profile to copy the values from# thresholdsColumn Column containing cutoff values (use 'none' to copy cutoffs from the profile)# outputProfile Specify the path whether to store output profile#transfacProfile<-"databases/TRANSFAC(R) 2022.1/Data/profiles/vertebrate_human_p0.001_non3d"grnPwmProfile<-paste0(grnPwmPath," profile")# Create Match(TM) profile for PWMs of potential GRN factorsgx.analysis("Create profile from site model table",list(table=grnPwmPath,profile=transfacProfile,outputProfile=grnPwmProfile),TRUE,TRUE)gx.analysis.parameters("TRANSFAC(R) Match(TM) for tracks")## Output# description# sequencePath Sequence track to search# dbSelector Select a deployed or custom sequence source# profilePath Profile# withoutDuplicates Ensure to report sites in overlapping genomic intervals only once# ignoreCore Core scores are not calculated and not used for filtering.# output Output track path#grnMatchPath<-paste0(grnPwmProfile," Match")# Predict binding sites of GRN factors in TAL1-bound regionsgx.analysis("TRANSFAC(R) Match(TM) for tracks",list(sequencePath=paste0(folderPath,"/jurkat_chipseq_hg38"),dbSelector="Ensembl 104.38 Human (hg38)",profilePath=grnPwmProfile,withoutDuplicates=TRUE,ignoreCore=TRUE,output=grnMatchPath),TRUE,TRUE)## Step 10. Prediction of binding sites of identified TFs around TAL1 transcription start site## Create TAL1 gene table for importwrite.table(cbind(ID=c("ENSG00000162367"),Symbol=c("TAL1")),file="tal1.tsv",quote=FALSE,sep="\t",row.names=FALSE)# Import TAL1 genegx.importTable("tal1.tsv",mealrOutputPath,"TAL1 gene",columnForID="ID",tableType="Genes: Ensembl",species=species)# After data imports it is recommendable to shortly pause in order to# allow server processes to finish before using the dataSys.sleep(1)tal1GenePath<-paste0(mealrOutputPath,"/TAL1 gene")gx.analysis.parameters("Gene set to track")## Output# description# sourcePath Table of source genes# species Taxonomical species# from From position (relative to gene start)# to To position (relative to gene start)# overlapMergingMode How to handle TSS located close to each other# destPath Output name#tal1TrackPath<-paste0(tal1GenePath," promoter")# Create track of genomic region around TAL1 TSS (promoter)gx.analysis("Gene set to track",list(sourcePath=tal1GenePath,species=species,from=2000,to=1000,destPath=tal1TrackPath),TRUE,TRUE)tal1MatchPath<-paste0(grnPwmProfile," TAL1 Match")# Predict binding sites of GRN factors in TAL1 promotergx.analysis("TRANSFAC(R) Match(TM) for tracks",list(sequencePath=tal1TrackPath,dbSelector="Ensembl 104.38 Human (hg38)",profilePath=grnPwmProfile,withoutDuplicates=TRUE,ignoreCore=TRUE,output=tal1MatchPath),TRUE,TRUE)# Export genomic locations of predicted sites for GRN factors around TAL1 TSSgx.export(tal1MatchPath,exporter="BED format (*.bed)",target.file="TAL1_grn_pwm_sites.bed")gx.logout()