In [1]:
%run ../config/init.py

Genome GTF and STAR indexes

In [5]:
os.environ['STAR_INDEX'] = '/gfs/data/genomes/Gencode/Gencode_human/GRCh37/release_19/STAR'
os.environ['GTF'] = '/gfs/data/genomes/Gencode/Gencode_human/GRCh37/release_19/gencode.v19.annotation.gtf'
os.environ['BED'] = '/gfs/data/genomes/Gencode/Gencode_human/GRCh37/release_19/gencode.v19.annotation.bed'

Loading samples from the trimmed folder

In [6]:
data_dir = os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'trimmomatic')
os.chdir(data_dir)
files = [ f for ds,dr,fs in os.walk('./') for f in fs if f.endswith('.fastq.gz')]

Creating folder named: quantification

In [7]:
result_dir = os.path.join(os.environ['RESULTS'], os.environ['DATASET'], 'quantification')
if not os.path.exists(result_dir):
    os.mkdir(result_dir)
os.chdir(result_dir)
os.environ['w'] = result_dir

Executing quantification workflow

In [ ]:
%%time
errors = {}
!rm -f commands
for f in files:
    os.environ['f'] = os.path.join(data_dir, f)
    if not os.path.exists(os.path.basename(f).replace('.fastq.gz','_sorted.bam')):
        !echo "cwl-runner --tmp-outdir-prefix=$TMP/ --tmpdir-prefix=$TMP/ $CWLWORKFLOWS/RNA-Seq/rnaseq-quatification-SE.cwl --threads 16 --reads_1 $f --genomeDir $STAR_INDEX --gtf $GTF -q=255 -r $BED" >> commands
errors = !cat commands | parallel -j 1    
corrects = 0
for e in errors:
    if 'Final process status is success' == e:
        corrects += 1
if corrects == len(files):
    print('Run completed')
else:
    print('There are errors. Please check.')        

Creates TPM and reads matrices for genes

In [ ]:
columns = ['ExonTPM', 'ExonReads']
output_suffix = "_tr_sorted_genes.out"
files = [ f for ds, df, files in os.walk('./') for f in files if output_suffix in f]
for column in columns:
    print(column)
    data = pandas.DataFrame()
    i = 1
    for f in files:
        # Get sample name removing the suffix and check if the output is compressed
        if f.endswith('.gz'):
            output_suffix_real = output_suffix + '.gz'
        else:
            output_suffix_real = output_suffix
        s = f.replace(output_suffix_real, '')
        df = pandas.read_csv(f, sep='\t')
        df = df[['Gene_Id', 'Chr', 'Start', 'End', 'ExonLength', column]]
        df = df.rename(index=str, columns={column: s})
        if data.empty:
            data = df
        else:
            data = data.merge(df, on=['Gene_Id', 'Chr', 'Start', 'End', 'ExonLength'], how='outer')
        i += 1
    print('Data columns: ' + str(len(data.columns)))
    print('Data rows: ' + str(len(data)))
    data.to_csv(column + '.tsv', sep='\t', index=False, na_rep='0')

Creates TPM and reads matrices for transcripts

In [ ]:
columns = ['ExonTPM', 'ExonReads']
output_suffix = "_tr_sorted_transcripts.out"
files = [ f for ds, df, files in os.walk('./') for f in files if output_suffix in f]
for column in columns:
    print(column)
    data = pandas.DataFrame()
    i = 1
    for f in files:
        # Get sample name removing the suffix and check if the output is compressed
        if f.endswith('.gz'):
            output_suffix_real = output_suffix + '.gz'
        else:
            output_suffix_real = output_suffix
        s = f.replace(output_suffix_real, '')
        df = pandas.read_csv(f, sep='\t')
        df = df[['Gene_Id', 'Transcript_Id', 'Chr', 'Start', 'End', 'ExonLength', column]]
        df = df.rename(index=str, columns={column: s})
        if data.empty:
            data = df
        else:
            data = data.merge(df, on=['Gene_Id', 'Transcript_Id', 'Chr', 'Start', 'End', 'ExonLength'], how='outer')
        i += 1
    print('Data columns: ' + str(len(data.columns)))
    print('Data rows: ' + str(len(data)))
    data.to_csv(column + '_transcripts.tsv', sep='\t', index=False, na_rep='0')

Transforming Ensembl IDs to gene names

In [ ]:
gtf = pandas.read_csv(os.environ['GTF'], sep='\t', header=None, comment='#')
a = []
gene_gtf = gtf[gtf[2] == 'gene']
for i, r in gene_gtf.iterrows():
    in_a = r.tolist()
    fs = re.findall(r"[-\w\./']+", r[8])
    df = {'gene_id': '', 'gene_name': ''}
    for c in range(0, len(fs), 2):
        if fs[c] in df:
            df[fs[c]] = fs[c + 1]
    in_a.append(df['gene_id'])
    in_a.append(df['gene_name'])
    a.append(in_a)
gene_gtf = pandas.DataFrame(a, columns=['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame',
                                   'attribute', 'gene_id', 'gene_name'])
gene_gtf = gene_gtf.reset_index(drop=True)
gene_gtf.to_csv(os.environ['GTF'].replace('.gtf', '_genes.gtf'), sep='\t', index=False, header=None)