%run ../config/init.py
os.environ['STAR_INDEX'] = '/gfs/data/genomes/Gencode/Gencode_human/GRCh37/release_19/STAR'
os.environ['GTF'] = '/gfs/data/genomes/Gencode/Gencode_human/GRCh37/release_19/gencode.v19.annotation.gtf'
os.environ['BED'] = '/gfs/data/genomes/Gencode/Gencode_human/GRCh37/release_19/gencode.v19.annotation.bed'
data_dir = os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'trimmomatic')
os.chdir(data_dir)
files = [ f for ds,dr,fs in os.walk('./') for f in fs if f.endswith('.fastq.gz')]
result_dir = os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification')
if not os.path.exists(result_dir):
os.mkdir(result_dir)
os.chdir(result_dir)
os.environ['w'] = result_dir
%%time
errors = {}
!rm -f commands
for f in files:
os.environ['f'] = os.path.join(data_dir, f)
if not os.path.exists(os.path.basename(f).replace('.fastq.gz','_sorted.bam')):
!echo "cwl-runner --tmp-outdir-prefix=$TMP/ --tmpdir-prefix=$TMP/ $CWLWORKFLOWS/RNA-Seq/rnaseq-quatification-SE.cwl --threads 16 --reads_1 $f --genomeDir $STAR_INDEX --gtf $GTF -q=255 -r $BED" >> commands
errors = !cat commands | parallel -j 1
corrects = 0
for e in errors:
if 'Final process status is success' == e:
corrects += 1
if corrects == len(files):
print('Run completed')
else:
print('There are errors. Please check.')
columns = ['ExonTPM', 'ExonReads']
output_suffix = "_tr_sorted_genes.out"
files = [ f for ds, df, files in os.walk('./') for f in files if output_suffix in f]
for column in columns:
print(column)
data = pandas.DataFrame()
i = 1
for f in files:
# Get sample name removing the suffix and check if the output is compressed
if f.endswith('.gz'):
output_suffix_real = output_suffix + '.gz'
else:
output_suffix_real = output_suffix
s = f.replace(output_suffix_real, '')
df = pandas.read_csv(f, sep='\t')
df = df[['Gene_Id', 'Chr', 'Start', 'End', 'ExonLength', column]]
df = df.rename(index=str, columns={column: s})
if data.empty:
data = df
else:
data = data.merge(df, on=['Gene_Id', 'Chr', 'Start', 'End', 'ExonLength'], how='outer')
i += 1
print('Data columns: ' + str(len(data.columns)))
print('Data rows: ' + str(len(data)))
data.to_csv(column + '.tsv', sep='\t', index=False, na_rep='0')
columns = ['ExonTPM', 'ExonReads']
output_suffix = "_tr_sorted_transcripts.out"
files = [ f for ds, df, files in os.walk('./') for f in files if output_suffix in f]
for column in columns:
print(column)
data = pandas.DataFrame()
i = 1
for f in files:
# Get sample name removing the suffix and check if the output is compressed
if f.endswith('.gz'):
output_suffix_real = output_suffix + '.gz'
else:
output_suffix_real = output_suffix
s = f.replace(output_suffix_real, '')
df = pandas.read_csv(f, sep='\t')
df = df[['Gene_Id', 'Transcript_Id', 'Chr', 'Start', 'End', 'ExonLength', column]]
df = df.rename(index=str, columns={column: s})
if data.empty:
data = df
else:
data = data.merge(df, on=['Gene_Id', 'Transcript_Id', 'Chr', 'Start', 'End', 'ExonLength'], how='outer')
i += 1
print('Data columns: ' + str(len(data.columns)))
print('Data rows: ' + str(len(data)))
data.to_csv(column + '_transcripts.tsv', sep='\t', index=False, na_rep='0')
gtf = pandas.read_csv(os.environ['GTF'], sep='\t', header=None, comment='#')
a = []
gene_gtf = gtf[gtf[2] == 'gene']
for i, r in gene_gtf.iterrows():
in_a = r.tolist()
fs = re.findall(r"[-\w\./']+", r[8])
df = {'gene_id': '', 'gene_name': ''}
for c in range(0, len(fs), 2):
if fs[c] in df:
df[fs[c]] = fs[c + 1]
in_a.append(df['gene_id'])
in_a.append(df['gene_name'])
a.append(in_a)
gene_gtf = pandas.DataFrame(a, columns=['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame',
'attribute', 'gene_id', 'gene_name'])
gene_gtf = gene_gtf.reset_index(drop=True)
gene_gtf.to_csv(os.environ['GTF'].replace('.gtf', '_genes.gtf'), sep='\t', index=False, header=None)