Follow this page | Robert's private homepage

Sometimes I want to track changes to a website that does not provide an RSS feed. For such cases I have written a simple bash script that I run periodically from cron.

Here it is:

#!/bin/bash
#
# Watch a list of web pages and mail a diff whenever one of them changes.
# Meant to be run periodically, e.g. from cron.
#
# For every URL listed in $PAGES a text dump is stored in the working
# directory as rr-<md5 of URL>_<date>_<time>.page. The two newest dumps per
# URL are kept and compared; a difference is mailed to $MAIL_TO and also
# written to last-result-rr-<md5>.diff.
#
# Deliberately no `set -e`: a failure for one page must not stop the others.

# file with pages that should be checked
# (one URL per line; lines starting with '#' are comments)
PAGES=pages-to-follow.txt

# Recipient of the notification mails. Not named USER: assigning to USER
# would overwrite the login-name environment variable that every child
# process (mail, the browser) inherits.
MAIL_TO=me@example.com

# goto working dir; abort instead of creating/removing files somewhere else
cd followthatpage || exit 1

if [[ ! -r $PAGES ]]; then
    printf 'cannot read page list: %s\n' "$PAGES" >&2
    exit 1
fi

# Unmatched globs expand to nothing, so "no snapshot yet" is an empty array.
shopt -s nullglob

# dump_page URL
# Print the rendered text of URL on stdout.
# The same renderer is used for the first and for all later snapshots;
# mixing two text browsers makes the first comparison report pure
# formatting noise.
dump_page() {
    links -dump "$1"
}

# The list is read on fd 3 so that commands inside the loop cannot
# accidentally consume it through stdin.
while read -r page <&3 || [[ -n $page ]]; do
    # drop non-printable characters (e.g. the CR of DOS line endings)
    page=$(printf '%s' "$page" | tr -dc '[:print:]')

    # skip blank lines and comments
    if [[ -z $page || $page == \#* ]]; then
        continue
    fi

    # prefix of the filename is the md5sum of the URL plus a trailing
    # newline, i.e. the same hash `echo URL | md5sum` yields
    name=rr-$(printf '%s\n' "$page" | md5sum | cut -d' ' -f1)
    new_file="${name}_$(date +%F_%H-%M).page"

    # Glob results are sorted by name, and the timestamp in the name makes
    # that chronological: oldest first, newest last.
    snapshots=( "${name}"_*.page )

    if (( ${#snapshots[@]} == 0 )); then
        # first time: store the baseline and announce the observation
        dump_page "$page" > "$new_file"
        mail -s "Observation started $page" "$MAIL_TO" < "$new_file"
    else
        # save page to file
        dump_page "$page" > "$new_file"
        snapshots=( "${name}"_*.page )

        # remove old files, keeping only the two newest
        count=${#snapshots[@]}
        if (( count > 2 )); then
            rm -- "${snapshots[@]:0:count-2}"
            snapshots=( "${snapshots[@]:count-2}" )
        fi

        # check if something changed and mail it
        # (two runs within the same minute overwrite one file, which leaves
        # a single snapshot and nothing to compare)
        if (( ${#snapshots[@]} == 2 )); then
            difference=$(diff "${snapshots[0]}" "${snapshots[1]}")
            status=$?
            # diff exits 1 when the files differ, 2 on trouble
            if (( status == 1 )); then
                # printf '%s' keeps backslashes in the diff text literal
                printf 'Changes in Page:\n%s\n\nDifferences are:\n- Begin diff output ------------\n\n%s\n\n- End of diff output ---------\n' \
                    "$page" "$difference" \
                    | head --bytes=100000 \
                    | mail -s "Changes in $page" "$MAIL_TO"
                printf '%s\n' "$difference" > "last-result-${name}.diff"
            fi
        fi
    fi

    # be polite: pause between requests
    sleep 10
done 3< "$PAGES"