03. January 2021
Bash snippets
Bash snippets that are useful when doing bioinformatic jobs.
These scripts are classified into different usage scenario. Some of them are easier to understand while not so efficient.
General usage
get size
1# get dir size
2du -hs dirname
3# get disk space
4df -h
5
find dir or file
1find /where/to/find/ -type f -name "*.sh"
print specific content
1# get the nth line from a file, print a specific line from a file
2head -n filename | tail -1
3sed 'NUMq;d' filename
compress, archive dir
- c – Creates a new .tar archive file
- v – Verbosely show the .tar file progress
- f – File name type of the archive file
- z – filter archive through gzip
1# create a tar.gz archive from a dir
2tar -zcvf tar-archive-name.tar.gz source-folder-name
3# preserve permissions
4tar -pcvzf tar-archive-name.tar.gz source-folder-name
5
6
7# extract a tar.gz compressed archive, switch the ‘c’ flag to an ‘x’ to extract
8tar -zxvf tar-archive-name.tar.gz
9# permission preserving condition
10tar -pxvzf tar-archive-name.tar.gz
remove symbolic links
1ls -la . | grep "\->" | awk '{print $9}' | xargs rm
get the number of files in a directory
1shopt -s nullglob # set the counting start from 1 instead of 0
2num_files=(*)
3num_files=${#num_files[@]}
4echo #num_files
Path variable
1export PATH=$PATH:/place/with/the/file
Fasta file manipulation
1
2# add something to end of all header lines
3sed 's/>.*/&SUFFIX/' file.fa > outfile.fa
4
5#clean up a fasta file so only first column of the header remain
6awk '{print $1}' file.fa > output.fa
7
8# count the number of sequences in fasta file
9grep -c '>' file.fa
10grep '>' file.fa | wc -l
11
variable
set up path into variable in a file called genome_path_var.sh
1#!/usr/bin/env bash
2
3genomeList='ppopv1 p717main p717alt pPtriNisquallyV4 pPdelWV94 pPtriStettler pSpurV5 pSpurFC pAtha pGmax pMtru pEgra pPper pSlyc pVvin pSbic pSita pOsat'
4
5ppopv1='/work/cjtlab/popv1/Poptr1_1_GeneModels_GeneCatalog_frozen20080522_aa.fasta'
6p717main='/work/cjtlab/717genomeV2/annotation/main.prelim.v2.1/PtremulaxPopulusalbav2.1p.primaryTrs.pep.fa'
7p717alt='/work/cjtlab/717genomeV2/annotation/alt.prelim.v2.1/PtremulaxPopulusalbaaltv2.1.primaryTrs.pep.fa'
8pPtriNisquallyV4='/work/cjtlab/NisquallyV4/annotation/Ptrichocarpav4.1g.primaryTrs.pep.fa'
9pPdelWV94='/work/cjtlab/Pdelt/PdeltoidesWV94_445_v2.1/PdeltoidesWV94_445_v2.1.protein_primaryTranscriptOnly.fa'
10pPtriStettler='/work/cjtlab/Database/Ptri_stettlerV1.1/annotation/PtrichocarpaStettler14_532_v1.1.protein.fa'
11pSpurV5='/work/cjtlab/Spurpurea94006/Spurpurea_519_v5.1.protein.fa'
12pSpurFC='/work/cjtlab/SpurpureaFC/annotation/SpurpureaFishCreek_518_v3.1.protein.fa'
13pAtha='/work/cjtlab/Database/Atha/annotation/Athaliana_447_Araport11.protein.fa'
14pGmax='/work/cjtlab/Database/Gmax/annotation/Gmax_275_Wm82.a2.v1.protein.fa'
15pMtru='/work/cjtlab/Database/Mtru/annotation/Mtruncatula_285_Mt4.0v1.protein.fa'
16pEgra='/work/cjtlab/Database/Egrandis/v2.0/annotation/Egrandis_297_v2.0.protein.fa'
17pPper='/work/cjtlab/Database/Pper/annotation/Ppersica_298_v2.1.protein.fa'
18pSlyc='/work/cjtlab/Database/Slycopersicum/ITAG3.2/annotation/Slycopersicum_514_ITAG3.2.protein.fa'
19pVvin='/work/cjtlab/Database/Vvinifera/v2.1/annotation/Vvinifera_457_v2.1.protein.fa'
20pSbic='/work/cjtlab/Database/Sbicolor/v3.1.1/annotation/Sbicolor_454_v3.1.1.protein.fa'
21pSita='/work/cjtlab/Database/Sitalica/v2.2/annotation/Sitalica_312_v2.2.protein.fa'
22pOsat='/work/cjtlab/Database/Osativa/v7.0/annotation/Osativa_323_v7.0.protein.fa'
23
24echo 'genome path loaded'
25
import those variables, iterate them through a list of the variables
1#!/usr/bin/env bash
2source /work/cjtlab/scripts/bash/genome_path_var.sh
3
4
5
6for species in $genomeList
7do
8 eval 'cp $'{$species}' fastaFiles'
9 eval 'hmmscan --tblout $species-tbl.tab --domtblout $species-domtbl.tab --pfamtblout $species-pfamtbl.tab /db/pfam/27.0-3.1b1/Pfam-A.hmm $'{$species}
10
11done
tree alternative
1#!/bin/bash
2# only if you have bash 4 in your CentOS system
3shopt -s globstar
4for file in **/*
5do
6 slash=${file//[^\/]}
7 case "${#slash}" in
8 0) echo "|-- ${file}";;
9 1) echo "| |-- ${file}";;
10 2) echo "| | |-- ${file}";;
11 esac
12done
For loop
1#!/bin/bash
2for filename in /Data/*.txt; do
3 for ((i=0; i<=3; i++)); do
4 ./MyProgram.exe "$filename" "Logs/$(basename "$filename" .txt)_Log$i.txt"
5 done
6done
Unfiled
1
2# extract fasta file id
3grep -o -E "^>\w+" file.fasta | tr -d ">"
4
5# Return the lengths of all the sequences in a multifasta
6
7falens(){
8awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length($0)}END{print seqlen}' $1
9}
10
11# Remove duplicated fastas in a multifasta:
12
13dedupe(){
14cat $1 | awk '!_[$0]++'
15}
16
17# Merge a multifasta into a single fasta sequence (remove all but the first header, and deal with newlines):
18
19fastcat(){
20cat $1 | sed -e '1!{/^>.*/d;}' | sed ':a;N;$!ba;s/\n//2g'
21}
22
23# split a multifasta into separate files
24splitfa(){
25i=1;
26while read line ; do
27 if [ ${line:0:1} == ">" ] ; then
28 header="$line"
29 echo "$header" >> seq"${i}".fasta
30 else
31 seq="$line"
32 echo "$seq" >> seq"${i}".fasta
33 ((i++))
34 fi
35done < $1
36}
37
38# get the nucleotide count for all sequences in a multi sequence file
39echo -e "seq_id\tA\tU\tG\tC"; while read line; do echo $line | grep ">" | sed 's/>//g'; for i in A U G C;do echo $line | grep -v ">" | grep -o $i | wc -l | grep -v "^0"; done; done < your_fasta_file.fa | paste - - - - -
40
41# linearize the complete fasta file
42while read line; do
43 if [ "${line:0:1}" == ">" ]; then
44 echo -e "\n"$line;
45 else
46 echo $line | tr -d '\n' ;
47 fi;
48done < input.fasta | sed '/^--$/d' > output.fasta
49
50# or
51awk '{if(NR==1) {print $0} else {if($0 ~ /^>/) {print "\n"$0} else {printf $0}}}' input.fasta > output.fasta
52
53# and then pick up certain id with the following cmd
54grep -A1 'Q15049' output.fasta
55