From cedea31af6205ad0792bef48b184cab382aa58af Mon Sep 17 00:00:00 2001
From: Jan van Haarst <jan.vanhaarst@wur.nl>
Date: Fri, 7 Mar 2008 07:55:25 +0000
Subject: [PATCH] started work on taxonomy retrieval

---
 get_latest_taxonomy.sh | 158 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100755 get_latest_taxonomy.sh

diff --git a/get_latest_taxonomy.sh b/get_latest_taxonomy.sh
new file mode 100755
index 0000000..eefe693
--- /dev/null
+++ b/get_latest_taxonomy.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+# This script fetches the nr & nt BLAST databases and the taxonomy database (taxdb) from NCBI
+# Written by Jan van Haarst, PRI
+# Needed: wget, curl, tar & perl (a sanity check for these is sketched after the host-specific settings)
+# Latest update: 28 February 2007
+# Variables
+# Get the location of this script, so we can call the other script
+DIRNAME=$(dirname "$0")
+SCRIPT_LOCATION=$(perl -e 'END {use Cwd qw(realpath);print realpath($ARGV[0]);}' "$DIRNAME")
+
+# Set site and data locations depending on where we are
+if [ "$HOSTNAME" == "dev1" ]						# Primary site
+then
+	site_location="ftp://ftp.ncbi.nlm.nih.gov/blast/db/"
+	data_dir="/home/jvh/data/blast_latest/databases.nobackup"
+	extracted_dir="$data_dir/extracted"
+	# Remove the secondary checkfile, so secondaries see that an update is in progress
+	rm -f "$data_dir/newest_date.txt"
+	WGET="/home/jvh/bin/wget"
+	CURL="/usr/bin/curl -s"
+	TAR="/bin/tar"
+	PRIMARY="true"
+else										# Secondary sites
+	site_location="http://dev1.ab.wurnet.nl/~jvh/databases.nobackup/"
+	PRIMARY="false"
+	if [ "$HOSTNAME" == "kwatta" ]
+	then
+		target_dir="/hwraid4/data/blast/databases"
+		data_dir="$target_dir/updates"
+		extracted_dir="$data_dir/extracted"
+		WGET="/usr/local/bin/wget"
+		CURL="/opt/sfw/bin/curl"
+		TAR="/usr/local/bin/tar"
+	else
+		target_dir="/state/partition1/blast/db"
+		data_dir="$target_dir/updates"
+		extracted_dir="$data_dir/extracted"
+		WGET="/usr/bin/wget"
+		CURL="/usr/bin/curl"
+		TAR="/bin/tar"
+	fi
+
+	# Only secondary sites need to check the date
+	newest_date_local=$(cat "$data_dir/newest_date.txt")
+	if [ $? != "0" ]
+	then
+		echo "Problem checking the local date file!"
+		#exit
+	fi
+
+	newest_date_remote=$($CURL -s -f "${site_location}newest_date.txt")
+	if [ $? != "0" ]
+	then
+		# Primary site is updating or offline
+		echo "Primary site is updating or offline"
+		exit
+	fi
+
+	if [ "$newest_date_remote" == "$newest_date_local" ]
+	then
+		# No update needed
+		echo "No update needed"
+		exit
+	fi
+fi
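+
+# Optional sanity check (an added sketch, not part of the original script):
+# make sure the per-host tools chosen above exist and are executable before any work starts.
+# "${tool%% *}" strips trailing flags such as the "-s" appended to CURL on the primary site.
+for tool in "$WGET" "$CURL" "$TAR"
+do
+	if [ ! -x "${tool%% *}" ]
+	then
+		echo "Required tool not found: ${tool%% *}"
+		exit 1
+	fi
+done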
+
+now=$(date '+%Y_%m_%d_%A_%Hh%M')
+log_dir="$data_dir/logs"
+logfile="$log_dir/blastlog_$now"
+lockfile="$data_dir/update_is_running.lock"
+accept_list="nt*,nr*,taxdb*,*.txt,*.htm*"
+reject_list="*.html*,newest_date.txt"
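+# The accept patterns above pull the nr/nt archives, the taxonomy database (taxdb)
+# and the plain-text/.htm documentation; the reject patterns drop generated *.html*
+# directory listings and the primary's own newest_date.txt marker.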
+# wget options, kept in an array so the accept/reject patterns and the log path are passed through literally
+wget_options=(--proxy=off --mirror --no-parent --no-directories --level=1 --timestamping --passive-ftp --accept "$accept_list" -o "$logfile" --reject "$reject_list")
+# make the needed directories
+
+mkdir -p "$data_dir"
+mkdir -p "$log_dir"
+mkdir -p "$extracted_dir"
+
+# change to the data directory
+cd "$data_dir" || exit
+
+# Catch SIGINT: remove the lockfile (if present) and stop
+
+remove_lock()
+{
+	if test -e "$lockfile"
+	then
+		# remove the lockfile
+		rm "$lockfile"
+	fi
+	exit
+}
+
+trap remove_lock SIGINT
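+# Note (suggestion only, not part of this commit): the trap above only fires on SIGINT;
+# adding SIGTERM would also clean up the lockfile when the script is killed, e.g.
+#	trap remove_lock SIGINT SIGTERM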
+
+# fetch the data
+if test -e "$lockfile"
+then
+	# update is running, so print a message and stop.
+	echo "Update is already running."
+	exit
+else
+	# lockfile does NOT exist, so we create the lockfile, run the update and remove the file.
+	touch "$lockfile"
+	# get the data
+	$WGET "${wget_options[@]}" "$site_location"
+	# Add the return value to the log file
+	echo "Return Value=$?" >> "$logfile"
+	# Generate a timestamp, so we know what date the latest file was added
+	perl "$SCRIPT_LOCATION/last_file_time.pl" > "$data_dir/newest_date.txt"
+	# For secondary sites:
+	# Unpack the retrieved set, and remove the original archive
+	if [ "$PRIMARY" == "false" ]
+	then
+		for NAME in *.tar.gz
+		do
+			$TAR --directory="$extracted_dir" -xzf "$NAME"
+			if [ $? != "0" ]
+			then
+				echo "Problem unpacking $NAME"
+			else
+				rm "$NAME"
+			fi
+		done
+		if [ "$HOSTNAME" == "apbioinf100.wurnet.nl" ]
+		then
+			perl "$DIRNAME"/fastcopy.pl --directory "$extracted_dir"
+			# Push the extracted set to the cluster nodes: cluster-fork runs the rsync on every node
+			cluster-fork rsync --archive --include="/*" -e ssh $(hostname):"$extracted_dir" "$extracted_dir"
+		fi
+		if [ "$HOSTNAME" == "kwatta" ]
+		then
+			# Wait while blast is still running
+			while /usr/bin/pgrep blastall > /dev/null
+			do
+				echo "Blast is running"
+				# Try again in an hour
+				sleep 3600
+			done
+			# Double check before the move
+			/usr/bin/pgrep blastall > /dev/null
+			# If blast is not running, move the data to the target location
+			if [ $? == "1" ]
+			then
+				pushd "$extracted_dir" > /dev/null
+				mv * "$target_dir"
+				popd > /dev/null
+			else
+				echo "Blast was running"
+			fi
+		fi
+	fi
+	# remove the lockfile.
+	rm "$lockfile"
+	# done
+	exit
+fi
+
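+# Example usage (illustrative; the schedule and install path below are assumptions,
+# not part of this commit): run the script regularly from cron on each site, e.g.
+#	0 2 * * * /path/to/get_latest_taxonomy.sh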
-- 
GitLab