03. January 2021

Bash snippets

Bash snippets that are useful when doing bioinformatic jobs.

These scripts are classified into different usage scenario. Some of them are easier to understand while not so efficient.

General usage

get size

1# get dir size
2du -hs dirname
3# get disk space
4df -h
5

find dir or file

1find /where/to/find/ -type f -name "*.sh"

print specific content

1# get the nth line from a file, print a specific line from a file
2head -n filename | tail -1
3sed 'NUMq;d' filename

compress, archive dir

c – Creates a new .tar archive file
v – Verbosely show the .tar file progress
f – File name type of the archive file
z – filter archive through gzip

 1# create a tar.gz archive from a dir
 2tar -zcvf tar-archive-name.tar.gz source-folder-name
 3# preserve permissions
 4tar -pcvzf tar-archive-name.tar.gz source-folder-name
 5
 6
 7# extract a tar.gz compressed archive, switch the ‘c’ flag to an ‘x’ to extract
 8tar -zxvf tar-archive-name.tar.gz
 9# permission preserving condition
10tar -pxvzf tar-archive-name.tar.gz

remove symbolic links

1ls -la . | grep "\->" | awk '{print $9}' | xargs rm

get the number of files in a directory

1shopt -s nullglob # set the counting start from 1 instead of 0
2num_files=(*)
3num_files=${#num_files[@]}
4echo #num_files

Path variable

1export PATH=$PATH:/place/with/the/file

Fasta file manipulation

 1
 2# add something to end of all header lines
 3sed 's/>.*/&SUFFIX/' file.fa > outfile.fa
 4
 5#clean up a fasta file so only first column of the header remain
 6awk '{print $1}' file.fa > output.fa
 7
 8# count the number of sequences in fasta file
 9grep -c '>' file.fa
10grep '>' file.fa | wc -l
11

variable

set up path into variable in a file called genome_path_var.sh

 1#!/usr/bin/env bash
 2
 3genomeList='ppopv1 p717main p717alt pPtriNisquallyV4 pPdelWV94 pPtriStettler pSpurV5 pSpurFC pAtha pGmax pMtru pEgra pPper pSlyc pVvin pSbic pSita pOsat'
 4
 5ppopv1='/work/cjtlab/popv1/Poptr1_1_GeneModels_GeneCatalog_frozen20080522_aa.fasta'
 6p717main='/work/cjtlab/717genomeV2/annotation/main.prelim.v2.1/PtremulaxPopulusalbav2.1p.primaryTrs.pep.fa'
 7p717alt='/work/cjtlab/717genomeV2/annotation/alt.prelim.v2.1/PtremulaxPopulusalbaaltv2.1.primaryTrs.pep.fa'
 8pPtriNisquallyV4='/work/cjtlab/NisquallyV4/annotation/Ptrichocarpav4.1g.primaryTrs.pep.fa'
 9pPdelWV94='/work/cjtlab/Pdelt/PdeltoidesWV94_445_v2.1/PdeltoidesWV94_445_v2.1.protein_primaryTranscriptOnly.fa'
10pPtriStettler='/work/cjtlab/Database/Ptri_stettlerV1.1/annotation/PtrichocarpaStettler14_532_v1.1.protein.fa'
11pSpurV5='/work/cjtlab/Spurpurea94006/Spurpurea_519_v5.1.protein.fa'
12pSpurFC='/work/cjtlab/SpurpureaFC/annotation/SpurpureaFishCreek_518_v3.1.protein.fa'
13pAtha='/work/cjtlab/Database/Atha/annotation/Athaliana_447_Araport11.protein.fa'
14pGmax='/work/cjtlab/Database/Gmax/annotation/Gmax_275_Wm82.a2.v1.protein.fa'
15pMtru='/work/cjtlab/Database/Mtru/annotation/Mtruncatula_285_Mt4.0v1.protein.fa'
16pEgra='/work/cjtlab/Database/Egrandis/v2.0/annotation/Egrandis_297_v2.0.protein.fa'
17pPper='/work/cjtlab/Database/Pper/annotation/Ppersica_298_v2.1.protein.fa'
18pSlyc='/work/cjtlab/Database/Slycopersicum/ITAG3.2/annotation/Slycopersicum_514_ITAG3.2.protein.fa'
19pVvin='/work/cjtlab/Database/Vvinifera/v2.1/annotation/Vvinifera_457_v2.1.protein.fa'
20pSbic='/work/cjtlab/Database/Sbicolor/v3.1.1/annotation/Sbicolor_454_v3.1.1.protein.fa'
21pSita='/work/cjtlab/Database/Sitalica/v2.2/annotation/Sitalica_312_v2.2.protein.fa'
22pOsat='/work/cjtlab/Database/Osativa/v7.0/annotation/Osativa_323_v7.0.protein.fa'
23
24echo 'genome path loaded'
25

import those variables, iterate them through a list of the variables

 1#!/usr/bin/env bash
 2source /work/cjtlab/scripts/bash/genome_path_var.sh
 3
 4
 5
 6for species in $genomeList
 7do
 8	eval 'cp $'{$species}' fastaFiles'
 9	eval 'hmmscan --tblout $species-tbl.tab --domtblout $species-domtbl.tab --pfamtblout $species-pfamtbl.tab /db/pfam/27.0-3.1b1/Pfam-A.hmm $'{$species}
10
11done

tree alternative

 1#!/bin/bash
 2# only if you have bash 4 in your CentOS system
 3shopt -s globstar
 4for file in **/*
 5do
 6    slash=${file//[^\/]}
 7    case "${#slash}" in
 8        0) echo "|-- ${file}";;
 9        1) echo "|   |--  ${file}";;
10        2) echo "|   |   |--  ${file}";;
11    esac
12done

For loop

1#!/bin/bash
2for filename in /Data/*.txt; do
3    for ((i=0; i<=3; i++)); do
4        ./MyProgram.exe "$filename" "Logs/$(basename "$filename" .txt)_Log$i.txt"
5    done
6done

Unfiled

 1
 2# extract fasta file id
 3grep -o -E "^>\w+" file.fasta | tr -d ">"
 4
 5# Return the lengths of all the sequences in a multifasta
 6
 7falens(){
 8awk '/^>/ {if (seqlen){print seqlen}; print ;seqlen=0;next; } { seqlen += length($0)}END{print seqlen}' $1
 9}
10
11# Remove duplicated fastas in a multifasta:
12
13dedupe(){
14cat $1 | awk '!_[$0]++'
15}
16
17# Merge a multifasta into a single fasta sequence (remove all but the first header, and deal with newlines):
18
19fastcat(){
20cat $1 | sed -e '1!{/^>.*/d;}' | sed  ':a;N;$!ba;s/\n//2g'
21}
22
23# split a multifasta into separate files
24splitfa(){
25i=1;
26while read line ; do
27  if [ ${line:0:1} == ">" ] ; then
28    header="$line"
29    echo "$header" >> seq"${i}".fasta
30  else
31    seq="$line"
32    echo "$seq" >> seq"${i}".fasta
33    ((i++))
34  fi
35done < $1
36}
37
38# get the nucleotide count for all sequences in a multi sequence file
39echo -e "seq_id\tA\tU\tG\tC"; while read line; do echo $line | grep ">" | sed 's/>//g'; for i in A U G C;do echo $line | grep -v ">" | grep -o $i | wc -l | grep -v "^0"; done; done < your_fasta_file.fa | paste - - - - -
40
41# linearize the complete fasta file
42while read line; do
43  if [ "${line:0:1}" == ">" ]; then 
44    echo -e "\n"$line;
45  else 
46    echo $line | tr -d '\n' ; 
47  fi; 
48done < input.fasta | sed '/^--$/d' > output.fasta
49
50# or 
51awk '{if(NR==1) {print $0} else {if($0 ~ /^>/) {print "\n"$0} else {printf $0}}}' input.fasta > output.fasta
52
53# and then pick up certain id with the following cmd
54grep -A1 'Q15049' output.fasta
55

The Latest