1.获取GO和associations

# Get the GO data and their associations.
wget http://purl.obolibrary.org/obo/go.obo
wget http://geneontology.org/gene-associations/goa_human.gaf.gz

# 大小
ls -lh

# 解压
gunzip goa_human.gaf.gz

# 去除前面的感叹号开头的注释信息
cat goa_human.gaf | grep -v '!' > assoc.txt

# association file有多少行基因信息
cat assoc.txt | wc -l
# 477692 (this number changes when the data is updated)

# Gene names 在第三列
cat assoc.txt | cut -f 3 | head -20

# Gene names在第三列。去重复。
cat assoc.txt | cut -f 3 | sort | uniq -c | head -20

#多少基因 去重复
cat assoc.txt | cut -f 3 | sort | uniq -c | wc -l
# 19713 (this number changes when the data is updated)

2.统计

# What are the ten most highly annotated proteins in the GO dataset?
cat assoc.txt | cut -f 2 | sort | uniq -c | sort -k1,1nr > prot_counts.txt

# What are the ten most highly annotated gene names in the GO dataset?
cat assoc.txt | cut -f 3 | sort | uniq -c | sort -k1,1nr > gene_counts.txt

#uniq -c 产生很多空行？？需要suqueze 方便后面datamash统计
cat gene_counts.txt | tr -s ' '

cat gene_counts.txt | tr -s ' ' | datamash -t ' ' mean 2  min 2 max 2 sstdev 2
# 21.928170537048 1 724 31.285153079175

#被注释最多的前十基因
cat gene_counts.txt | head

3.how complete is GO

cat assoc.txt \
| cut -f 14 \
| awk '{ print substr($1, 1, 4) }' \
| sort \
| uniq -c \
| sort -k 2,2 -n \
| awk '{ print  $2 "\t" $1 }'

...
2008 9682
2009 13322
2010 15026
2011 23490
2012 16428
2013 60555
2014 34925
2015 33096
2016 235077

4.

curl -L wget http://geneontology.org/gene-associations/goa_human.gaf.gz | gunzip -c > assoc.txt

#第三列和第五列 geneid和GO、并按第一列排序（b是啥意思）
cat assoc.txt | grep -v '!' | cut -f 3,5 | sort -k 1b,1 > pairs.txt

curl -O http://data.biostarhandbook.com/ontology/deseq-output.txt

cat deseq-output.txt | head -100 | sort -k 1,1 > top.txt

#将两个文件合并
join pairs.txt top.txt -t $'\t' | cut -f 1,2 > agrigo.txt