#!/usr/bin/env bash
# Split the rows for one study out of the combined sample metadata table and
# write one gzipped metadata (MD) file per category into <lo>/<study>/, where
# <lo> is the last two characters of the study name.
#
# Usage: <script> <study> <org> <dsource> <preds_file>
#   study       study identifier; rows are selected where it appears between
#               two tabs
#   org         human or mouse ("mouse" selects the mouse settings; any other
#               value falls through to the human defaults)
#   dsource     sra, gtex, or tcga
#   preds_file  RNA-seq, tissue, celltype predictions/curations file,
#               e.g. Human.SRA_curated_predict.meta.w_rids.tsv
#
# Assumes samples.tsv and samples.tsv.header exist in the current directory.
#
# Files written:
#   <study>.dir                                  the output directory name
#   <lo>/<study>/sra.sra.<study>.MD.gz           sra specific MD file
#   <lo>/<study>/sra.recount_qc.<study>.MD.gz    QC MD file
#   <lo>/<study>/sra.recount_project.<study>.MD.gz   project MD file
#   <lo>/<study>/sra.recount_pred.<study>.MD.gz  predictions/curations
#
# NOTE(review): the field separators in the grep pattern, the project header
# and the extra project columns are written here as TABs. The rest of the
# script treats the data as tab-delimited (cut -f, "\t" joins), so that is
# presumed to be the intent -- confirm against the upstream copy.

set -euo pipefail

readonly DATA_SOURCE="data_sources"
readonly TAB=$'\t'

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the lines of file $1 that contain "<TAB>$study<TAB>" as a fixed string.
# No match is not an error (the caller then emits a header-only file); any
# other grep failure aborts.
select_study_rows() {
  local rv=0
  grep -F -- "${TAB}${study}${TAB}" "$1" || rv=$?
  (( rv <= 1 )) || die "error: grep failed on $1"
}

(( $# >= 4 )) || die "usage: ${0##*/} <study> <human|mouse> <sra|gtex|tcga> <preds_file>"

study=$1
org=$2
dsource=$3
preds_file=$4

# The directory prefix is the last two characters of the study name, so the
# name must be at least that long (otherwise the path would start with "/").
(( ${#study} >= 2 )) || die "error: study name '${study}' is shorter than 2 characters"

for required in samples.tsv samples.tsv.header "$preds_file"; do
  [[ -r "$required" ]] || die "error: cannot read input file '${required}'"
done

lo=${study: -2}
dir="${lo}/${study}"
mkdir -p -- "$dir"
# Kept for compatibility: the directory name is still recorded in <study>.dir
# in case other tooling reads it (presumably -- verify against callers).
printf '%s\n' "$dir" > "${study}.dir"

# Intermediate per-study table (header + matching rows); removed on any exit.
study_samples="${dir}/samples.tsv.${study}"
trap 'rm -f -- "$study_samples"' EXIT

{
  cat samples.tsv.header
  select_study_rows samples.tsv
} > "$study_samples"

# Human defaults; mouse overrides the organism, the processing date and the
# column ranges.
organism="Homo sapiens"
date_processed="2019-10-01"
sra_cols="1-41"
qc_cols="1-3,42-150"
if [[ "$org" == "mouse" ]]; then
  organism="Mus musculus"
  date_processed="2020-01-01"
  sra_cols="1-43"
  qc_cols="1-3,44-152"
fi

# sra specific MD file
cut -f "$sra_cols" -- "$study_samples" \
  | gzip > "${dir}/sra.sra.${study}.MD.gz"

# QC MD file
cut -f "$qc_cols" -- "$study_samples" \
  | gzip > "${dir}/sra.recount_qc.${study}.MD.gz"

# project MD file: the first three columns of every data row, then the third
# column again (as "project"), then the constant columns below.
proj_cols=(rail_id external_id study project organism file_source
  metadata_source date_processed)
proj_header=$(IFS=$TAB; printf '%s' "${proj_cols[*]}")
file_source="${DATA_SOURCE}/${dsource}"
extra_proj_cols="${organism}${TAB}${file_source}${TAB}${file_source}${TAB}${date_processed}"

{
  printf '%s\n' "$proj_header"
  tail -n +2 -- "$study_samples" \
    | cut -f 1-3 \
    | awk -F '\t' -v OFS='\t' -v extra="$extra_proj_cols" \
      '{ print $0, $3, extra }'
} | gzip > "${dir}/sra.recount_project.${study}.MD.gz"

# RNA-seq, tissue, celltype Predictions/curations
{
  head -n 1 -- "$preds_file"
  select_study_rows "$preds_file"
} | gzip > "${dir}/sra.recount_pred.${study}.MD.gz"