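This script builds a static HTML mirror of the RedBrick documentation wiki (docs.redbrick.dcu.ie). It is installed as a daily cron job: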

[root@starbuck /var/www/docs.redbrick]# cat /etc/cron.daily/redbrickdocs     
#!/bin/bash
#
# Adapted from a script originally written by Andrew Harford.
#
# CAUTION: the contents of DOCSPATH are removed with `rm -fr` further down,
# so double-check that it points at the intended directory before running.
DOCSPATH="/var/vhosts/docs5.redbrick.dcu.ie"

# User and group that end up owning everything under DOCSPATH.
DOCSUSER="www-data"
DOCSGROUP="www-data"

#
# CHECK IT'S UP
#
# Probe the live site before touching anything. If the page cannot be
# fetched, stop here and exit with wget's own status.
# (Testing $? in an `if [ ... ]` and then running `exit $?` would report the
# status of the `[` test, i.e. 0, so the status is taken directly from wget.)
wget --no-check-certificate http://docs.redbrick.dcu.ie -O - &> /dev/null || exit $?

# Make sure the mirror directory exists. `mkdir -p` is a no-op when it is
# already there; ${DOCSPATH:?} aborts the script if the variable is empty.
mkdir -p -- "${DOCSPATH:?DOCSPATH must be set}" || exit 1


#
# Clean out the directory
#
# Everything after this point downloads into the current directory, so the
# script must not continue if it cannot enter DOCSPATH.
cd -- "${DOCSPATH:?DOCSPATH must be set}" || exit 1

# ${DOCSPATH:?} makes the shell abort instead of expanding to "/*" should the
# variable ever be empty or unset.
rm -fr -- "${DOCSPATH:?}"/*

#
# Get a list of pages
#
# We do this by downloading the index page with the full absolute links and
# then parsing them out and downloading each of the links individually.
#
wget -k -nH --no-check-certificate http://docs.redbrick.dcu.ie

# Split the index page on double quotes so that every quoted attribute value
# lands on its own line, then keep the pieces that mention the docs host.
# -F matches the host name literally (the dots are not regex wildcards) and
# sort -u drops repeated links so that no page is fetched twice.
tr '"' '\n' < index.html \
        | grep -F -- docs.redbrick.dcu.ie \
        | sort -u > pages.list

# Download every listed link into the current directory; -E appends .html
# to pages that do not already carry that extension.
xargs wget --no-check-certificate -nH -E < pages.list

#
# Get a new index file, without the absolute links.
#
# -f: do not complain if the first download never produced an index.html.
rm -f -- index.html
wget --no-check-certificate http://docs.redbrick.dcu.ie

#
# Do all the cool stuff
#

# add_mirror_banner FILE
#
# Hack our own bit of html into FILE to remind users that they are on a
# mirror. The banner is inserted directly after the first line containing
# the text "body".
#
# Arguments: $1 - path of the html page to modify in place
# Returns:   0 on success, or when FILE has no "body" line (the page is then
#            left untouched); non-zero if the temporary copy could not be
#            created or written back.
add_mirror_banner() {
        local page=$1
        local body_line tmp status=0

        # Line number of the first line mentioning "body"; everything up to
        # and including that line stays above the banner.
        body_line=$(grep -n -m1 "body" -- "$page" | cut -d: -f1)
        if [[ -z "$body_line" ]]; then
                return 0
        fi

        # Assemble the new page in a private temporary file so that no
        # scratch file is left behind inside the mirror directory.
        tmp=$(mktemp) || return 1
        {
                head -n "$body_line" -- "$page"
                # The opening <p> is closed with </p> on the last banner line.
                printf '%s\n' \
                        '<div style="width:100%;  background-color: #FFCCCC; text-align: center; ' \
                        'padding-bottom: 5px; margin-top: 5px; margin-bottom: 5px; float: right;">' \
                        '<h2>This is a mirror of the main RedBrick System Documentation</h2> This site only serves static html. This mirror is hosted by' \
                        "$(hostname). Please make changes on <a href=\"http://docs.redbrick.dcu.ie\">docs.redbrick.dcu.ie</a><p style=\"font-size: 10px\"> " \
                        "This page was archived on $(date) </p></div> <div class=\"clearer\"></div> "
                # "+N" prints from line N to the end, so nothing is lost even
                # when the page does not end in a newline.
                tail -n "+$((body_line + 1))" -- "$page"
        } > "$tmp" || { rm -f -- "$tmp"; return 1; }

        # Write the result back through the original file name.
        cat -- "$tmp" > "$page" || status=$?
        rm -f -- "$tmp"
        return "$status"
}

for file in *.html; do

        # An unmatched glob stays as the literal pattern; skip it, and skip
        # anything that is not a regular file.
        [[ -f "$file" ]] || continue

        add_mirror_banner "$file"

        # It expects the css to be a php file, but this won't work
        # so we replace with css.css
        sed -i 's/css\.php/css.css/g' -- "$file"

        # Lastly, use directories to imitate the fancy url structure
        # that dokuwiki uses to preserve the links.
        #
        # We do this by creating a new directory which is the name of
        # the html file with the extension stripped off, and making the
        # original html file that directory's index.html
        if [[ "$file" != "index.html" ]]; then

                dir=${DOCSPATH:?}/${file%.html}
                # A trailing ".1" is stripped so that such a copy shares the
                # directory of the page without the suffix.
                dir=${dir%.1}

                # -p: the directory may already exist (see the ".1" case).
                mkdir -p -- "$dir"
                mv -- "$file" "$dir/index.html"
        fi

done

#
# get the css
#
# Save the output of the wiki's css.php as a static file, lib/exe/css.css,
# inside the mirror. The download only runs once we are inside that
# directory, so a failed mkdir/cd cannot drop css.css somewhere else.
cssdir=${DOCSPATH:?DOCSPATH must be set}/lib/exe
if mkdir -p -- "$cssdir" && cd -- "$cssdir"; then
        wget --no-check-certificate http://docs.redbrick.dcu.ie/lib/exe/css.php -O css.css
else
        echo "cannot enter $cssdir, skipping css download" >&2
fi

#
# fix the logo
#
# Move every .png and then every .gif found at the top level of DOCSPATH
# into lib/tpl/dokuwiki/images, creating that directory first if needed.
imgpath="$DOCSPATH/lib/tpl/dokuwiki/images"
mkdir -p "$imgpath"
for ext in png gif; do
        mv "$DOCSPATH"/*."$ext" "$imgpath/"
done


#
# fix the permissions
#
# Hand the whole tree to the web server user, then make directories 755 and
# regular files 644.
#
# find runs chmod itself (-exec ... {} +), so paths containing whitespace are
# passed intact, long file lists are split into several chmod calls, and
# chmod is not run at all when nothing matches. ${DOCSPATH:?} aborts rather
# than letting find walk the current directory if the variable is empty.
chown -R "$DOCSUSER:$DOCSGROUP" "${DOCSPATH:?DOCSPATH must be set}"
find "${DOCSPATH:?}" -type d -exec chmod 755 {} +
find "${DOCSPATH:?}" -type f -exec chmod 644 {} +