From cedea31af6205ad0792bef48b184cab382aa58af Mon Sep 17 00:00:00 2001
From: Jan van Haarst <jan.vanhaarst@wur.nl>
Date: Fri, 7 Mar 2008 07:55:25 +0000
Subject: [PATCH] started work on taxonomy retrieval

---
 get_latest_taxonomy.sh | 178 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100755 get_latest_taxonomy.sh

diff --git a/get_latest_taxonomy.sh b/get_latest_taxonomy.sh
new file mode 100755
index 0000000..eefe693
--- /dev/null
+++ b/get_latest_taxonomy.sh
@@ -0,0 +1,178 @@
+#!/bin/bash
+# This script fetches nr & nt blast databases from ncbi
+# written by Jan van Haarst, PRI
+# needed : wget, curl & perl
+# latest update 28 February 2007
+#
+# The primary site mirrors from NCBI; secondary sites mirror from the primary
+# site, unpack the archives and, on some hosts, distribute or move them.
+
+# variables
+# Get location of this script, so we can call the other script
+DIRNAME=$(dirname "$0")
+SCRIPT_LOCATION=$(perl -e 'END {use Cwd qw(realpath);print realpath($ARGV[0]);}' "$DIRNAME")
+
+# Set site and data locations depending on where we are
+if [ "$HOSTNAME" == "dev1" ] # Primary site
+then
+    site_location="ftp://ftp.ncbi.nlm.nih.gov/blast/db/"
+    data_dir="/home/jvh/data/blast_latest/databases.nobackup"
+    extracted_dir="$data_dir/extracted"
+    # Remove secondary checkfile (-f: it may not exist yet)
+    rm -f "$data_dir/newest_date.txt"
+    WGET="/home/jvh/bin/wget"
+    CURL="/usr/bin/curl -s"
+    TAR="/bin/tar"
+    PRIMARY="true"
+else # Secondary sites
+    site_location="http://dev1.ab.wurnet.nl/~jvh/databases.nobackup/"
+    PRIMARY="false"
+    if [ "$HOSTNAME" == "kwatta" ]
+    then
+        target_dir="/hwraid4/data/blast/databases"
+        data_dir="$target_dir/updates"
+        extracted_dir="$data_dir/extracted"
+        WGET="/usr/local/bin/wget"
+        CURL="/opt/sfw/bin/curl"
+        TAR="/usr/local/bin/tar"
+    else
+        target_dir="/state/partition1/blast/db/"
+        data_dir="$target_dir/updates"
+        extracted_dir="$data_dir/extracted"
+        WGET="/usr/bin/wget"
+        CURL="/usr/bin/curl"
+        TAR="/bin/tar"
+    fi
+
+    # Only secondary sites need to check the date
+    newest_date_local=$(cat "$data_dir/newest_date.txt")
+    if [ $? != "0" ]
+    then
+        echo Problem checking local file !
+        #exit
+    fi
+
+    # $CURL, $WGET and $TAR stay unquoted on purpose: they may carry options
+    newest_date_remote=$($CURL -s -f "${site_location}newest_date.txt")
+    if [ $? != "0" ]
+    then
+        # Primary site is updating or offline
+        echo Primary site is updating or offline
+        exit
+    fi
+
+    # Quoted, so that an empty local date (first run) is an ordinary
+    # mismatch instead of a syntax error inside the test
+    if [ "$newest_date_remote" == "$newest_date_local" ]
+    then
+        # No update needed
+        echo No update needed
+        exit
+    fi
+fi
+
+now=$(date '+%Y_%m_%d_%A_%Hh%M')
+log_dir="$data_dir/logs"
+logfile="$log_dir/blastlog_$now"
+lockfile="$data_dir/update_is_running.lock"
+accept_list="nt*,nr*,taxdb*,*.txt,*.htm*"
+reject_list="*.html*,newest_date.txt"
+# An array keeps each option one word, so the wildcards in the accept and
+# reject lists are handed to wget untouched by the shell
+wget_options=(--proxy=off --mirror --no-parent --no-directories --level=1
+              --timestamping --passive-ftp --accept "$accept_list"
+              -o "$logfile" --reject "$reject_list")
+
+# make the needed directories
+mkdir -p "$data_dir" "$log_dir" "$extracted_dir"
+
+# change to datadir; the unpack step below uses relative paths, so stop
+# instead of unpacking and deleting archives in the wrong directory
+cd "$data_dir" || exit 1
+
+# catch sigint and remove lockfile
+remove_lock()
+{
+    # remove the update file.
+    rm -f "$lockfile"
+    # 128 + SIGINT(2), the usual status of an interrupted command
+    exit 130
+}
+
+# Take the lock atomically: under noclobber the redirection fails when the
+# lockfile already exists, so two runs cannot both pass this point
+if ! ( set -o noclobber; : > "$lockfile" ) 2> /dev/null
+then
+    if test -e "$lockfile"
+    then
+        # update is running, so print a message and stop.
+        echo "Update is already running."
+        exit
+    fi
+    echo "Cannot create $lockfile" >&2
+    exit 1
+fi
+
+# The lock is ours from here on, so only now may an interrupt remove it
+trap remove_lock SIGINT
+
+# fetch the data
+$WGET "${wget_options[@]}" "$site_location"
+# Add return value to log file
+echo "Return Value=$?" >> "$logfile"
+# Generate a timestamp, so we know what date the latest file was added
+perl "$SCRIPT_LOCATION/last_file_time.pl" > "$data_dir/newest_date.txt"
+# For secondary sites :
+# Unpack the retrieved set, and remove the original archive
+if [ "$PRIMARY" == "false" ]
+then
+    for NAME in *.tar.gz
+    do
+        # Without any archive the pattern is left unexpanded; skip it
+        [ -e "$NAME" ] || continue
+        $TAR --directory="$extracted_dir" -xzf "$NAME"
+        if [ $? != "0" ]
+        then
+            echo "Problem unpacking $NAME"
+        else
+            rm "$NAME"
+        fi
+    done
+    if [ "$HOSTNAME" == "apbioinf100.wurnet.nl" ]
+    then
+        # Absolute script location: the working directory is $data_dir by
+        # now, so a relative $DIRNAME would no longer find the helper
+        perl "$SCRIPT_LOCATION/fastcopy.pl" --directory "$extracted_dir"
+        cluster-fork rsync --archive --include="/*" -e ssh "$(hostname):$extracted_dir" "$extracted_dir"
+    fi
+    if [ "$HOSTNAME" == "kwatta" ]
+    then
+        # Check if blast is running (pgrep exits with 1 when nothing matched)
+        while [ "$(/usr/bin/pgrep blastall > /dev/null; echo $?)" != "1" ]
+        do
+            echo "Blast is running"
+            # Try again in an hour
+            sleep 3600
+        done
+        # Double check before move
+        /usr/bin/pgrep blastall
+        # If not, move the data to the target location
+        if [ $? == "1" ]
+        then
+            # Move only after a successful directory change; otherwise
+            # "mv *" would act on the contents of $data_dir
+            if pushd "$extracted_dir" > /dev/null
+            then
+                mv * "$target_dir"
+                popd > /dev/null
+            fi
+        else
+            echo "Blast was running"
+        fi
+    fi
+fi
+# remove the lockfile.
+rm "$lockfile"
+# done
+exit
+
-- 
GitLab