From c76cf796771350e38184079c9d8a33f9dd6ec417 Mon Sep 17 00:00:00 2001 From: hjmegens <hjmegens@gmail.com> Date: Wed, 11 Mar 2015 09:53:07 +0100 Subject: [PATCH] pop genomics --- 5.7_Population_Genomics_commands.sh | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/5.7_Population_Genomics_commands.sh b/5.7_Population_Genomics_commands.sh index aaba776..08efab8 100644 --- a/5.7_Population_Genomics_commands.sh +++ b/5.7_Population_Genomics_commands.sh @@ -1,17 +1,25 @@ +## Path data: +/home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/ + ### STEP 1. SNP calling using ANGSD (output in plink format) and basic filtering: -angsd -bam list_bam_leon.txt -minMapQ 30 -minQ 20 -minInd 13 -baq 1 -doCounts 1 -setMinDepth 30 -setMaxDepth 450 -GL 1 -out data_raw -ref UMD_5_genome_nospaces_60.fa -doMajorMinor 4 -doMaf 2 -SNP_pval 1e-6 -doGeno 4 -doPost 1 -doPlink 2 -P 10 +angsd -bam /home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/list_bam_leon.txt -minMapQ 30 -minQ 20 -minInd 13 -baq 1 -doCounts 1 -setMinDepth 30 -setMaxDepth 450 -GL 1 -out data_raw -ref /home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/reference/UMD_5_genome_nospaces_60.fa -doMajorMinor 4 -doMaf 2 -SNP_pval 1e-6 -doGeno 4 -doPost 1 -doPlink 2 -P 10 # Edit tfam file (population should be in the first column): -cat data_raw.tfam | awk 'print {$2,$1,$3,$4,$5,$6}' > data_raw.tfam2 -mv data_raw.tfam2 data_raw.tfam +# cat data_raw.tfam | awk '{print $2,$1,$3,$4,$5,$6}' > data_raw.tfam2 +# mv data_raw.tfam2 data_raw.tfam +# vim data_raw.tfam (pop1: id1-5; pop2: ids6-10; pop3: id11-15 + +# copy (cp) files tfam and tped into your working directory. +mkdir PCA_LD_session +cp /home/formacion/COMUNES/IAMZ/data/CIHEAM/sessions/Session5.7_PCA_LD/data_raw.t* /your directory/ ### STEP 2. Population stratification analysis: PCA, genomic distance matrix and cluster as implemented in Plink # Filter data using plink: -plink --tfile data_raw --geno 0.3 --maf 0.05 --hwe 1e-5 --make-bed --out data_clean +plink --tfile data_raw --geno 0.1 --maf 0.05 --hwe 1e-5 --make-bed --out data_clean # PCA: plink --bfile data_clean --pca 3 header --out data.struc1 @@ -20,8 +28,9 @@ plink --bfile data_clean --pca 3 header --out data.struc1 PCA <- read.table("data.struc1.eigenvec", header=T, quote="\"") head (PCA) library (ggplot2) -c <- ggplot(PCA, aes(y=PC1, x=PC2, colour=factor(FID))) -c + geom_point(size = 3) +c <- ggplot(PCA, aes(y=PC1, x=PC2, colour=factor(FID))) + geom_point(size = 3) +ggsave(c, file="PCA_PC1-PC2.pdf") + #################################################### # Other structure analysis: @@ -29,8 +38,6 @@ plink --bfile data_clean --cluster -K 3 --out data.struc2 plink --bfile data_clean --distance triangle 1-ibs --out data.struc3 - - ### STEP 3. LD: plink --bfile data_clean --r2 --ld-snp Chr1_10002364 --ld-window-kb 1000 --ld-window 500 --ld-window-r2 0.1 --out data.LD @@ -38,9 +45,9 @@ plink --bfile data_clean --r2 --ld-snp Chr1_10002364 --ld-window-kb 1000 --ld-wi LD <- read.table("data.LD.ld", header=T, quote="\"") head (LD) LD$dist <- LD$BP_B - LD$BP_A -p <- ggplot(LD, aes(x=dist, y=R2)) -p + stat_smooth(span = 1, se=TRUE) - +library (ggplot2) +p <- ggplot(LD, aes(x=dist, y=R2)) + stat_smooth(span = 1, se=TRUE) +ggsave(p, file="LD_plot.pdf") -- GitLab