From c76cf796771350e38184079c9d8a33f9dd6ec417 Mon Sep 17 00:00:00 2001
From: hjmegens <hjmegens@gmail.com>
Date: Wed, 11 Mar 2015 09:53:07 +0100
Subject: [PATCH] pop genomics

---
 5.7_Population_Genomics_commands.sh | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/5.7_Population_Genomics_commands.sh b/5.7_Population_Genomics_commands.sh
index aaba776..08efab8 100644
--- a/5.7_Population_Genomics_commands.sh
+++ b/5.7_Population_Genomics_commands.sh
@@ -1,17 +1,25 @@
 
+## Path data:
+/home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/
+
 ### STEP 1. SNP calling using ANGSD (output in plink format) and basic filtering:
-angsd -bam list_bam_leon.txt -minMapQ 30 -minQ 20 -minInd 13 -baq 1 -doCounts 1 -setMinDepth 30 -setMaxDepth 450 -GL 1 -out data_raw -ref UMD_5_genome_nospaces_60.fa -doMajorMinor 4 -doMaf 2 -SNP_pval 1e-6 -doGeno 4 -doPost 1 -doPlink 2 -P 10
+angsd -bam /home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/list_bam_leon.txt -minMapQ 30 -minQ 20 -minInd 13 -baq 1 -doCounts 1 -setMinDepth 30 -setMaxDepth 450 -GL 1 -out data_raw -ref /home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/reference/UMD_5_genome_nospaces_60.fa -doMajorMinor 4 -doMaf 2 -SNP_pval 1e-6 -doGeno 4 -doPost 1 -doPlink 2 -P 10
 
 # Edit tfam file (population should be in the first column):
-cat data_raw.tfam | awk 'print {$2,$1,$3,$4,$5,$6}' > data_raw.tfam2
-mv data_raw.tfam2 data_raw.tfam
+# cat data_raw.tfam | awk '{print $2,$1,$3,$4,$5,$6}' > data_raw.tfam2
+# mv data_raw.tfam2 data_raw.tfam
+# vim data_raw.tfam (pop1: id1-5; pop2: ids6-10; pop3: id11-15
+
 
+# copy (cp) files tfam and tped into your working directory.
+mkdir PCA_LD_session
+cp /home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/data_raw.t* /your directory/
 
 
 ### STEP 2. Population stratification analysis: PCA, genomic distance matrix and cluster as implemented in Plink
 
 # Filter data using plink:
-plink  --tfile data_raw --geno 0.3 --maf 0.05 --hwe 1e-5 --make-bed --out data_clean
+plink  --tfile data_raw --geno 0.1 --maf 0.05 --hwe 1e-5 --make-bed --out data_clean
 
 # PCA:
 plink --bfile data_clean --pca 3 header --out data.struc1
@@ -20,8 +28,9 @@ plink --bfile data_clean --pca 3 header --out data.struc1
 PCA <- read.table("data.struc1.eigenvec", header=T, quote="\"")
 head (PCA)
 library (ggplot2)
-c <- ggplot(PCA, aes(y=PC1, x=PC2, colour=factor(FID)))
-c + geom_point(size = 3)
+c <- ggplot(PCA, aes(y=PC1, x=PC2, colour=factor(FID))) + geom_point(size = 3)
+ggsave(c, file="PCA_PC1-PC2.pdf")
+
 ####################################################
 
 # Other structure analysis:
@@ -29,8 +38,6 @@ plink --bfile data_clean --cluster -K 3 --out data.struc2
 plink --bfile data_clean --distance triangle 1-ibs --out data.struc3
 
 
-
-
 ### STEP 3. LD:
 plink --bfile data_clean --r2 --ld-snp Chr1_10002364 --ld-window-kb 1000 --ld-window 500 --ld-window-r2 0.1 --out data.LD
  
@@ -38,9 +45,9 @@ plink --bfile data_clean --r2 --ld-snp Chr1_10002364 --ld-window-kb 1000 --ld-wi
 LD <- read.table("data.LD.ld", header=T, quote="\"")
 head (LD)
 LD$dist <-  LD$BP_B - LD$BP_A
-p <- ggplot(LD, aes(x=dist, y=R2))
-p + stat_smooth(span = 1, se=TRUE)
-
+library (ggplot2)
+p <- ggplot(LD, aes(x=dist, y=R2)) + stat_smooth(span = 1, se=TRUE)
+ggsave(p, file="LD_plot.pdf")
  
  
  
-- 
GitLab