#!/bin/sh

# find all plain URLs in yam files under a specified directory
#
# Usage: find-plain-urls [<directory> [<yamversion>]]
#
# <directory> defaults to the current directory.
# <yamversion> defaults to 6 and is one of
#   5: URL is everything from the scheme (http://, https://, ftp:// or
#      mailto:) up to the next unescaped whitespace
#   6: the scheme followed by a contiguous sequence of letters, numbers, or
#      any of _-./?=&;#+%~@
#   6withspace: like 6 but also allow escaped spaces '\ ' as in 5

# Command-line arguments, with their defaults: search the current directory
# using the YAM 6 URL rules.
DIR=${1:-.}
YAMVER=${2:-6}

# find_plain_urls <directory> <yamversion>
#
# Print every plain URL found in the *.yam files below <directory>, one per
# line, as "file:line:url" (grep -H -n -o).  <yamversion> selects the regular
# expression that decides where a URL ends: 5, 6 or 6withspace.
#
# Returns 2 (with a message on stderr) for an unrecognised <yamversion>;
# otherwise returns the exit status of find.
find_plain_urls() {
  # Every mode starts with a URL scheme that is NOT immediately preceded by
  # "%(" (negative lookbehind), which is what makes the URL "plain".
  url_prefix='(?<!\%\()(?:https?:\/\/|ftp:\/\/|mailto:)'

  case "$2" in
    5)
      # YAM 5 style - everything up to the next unescaped whitespace; a
      # backslash-escaped space is part of the URL.
      url_re="${url_prefix}"'(?:\\ |[^ \t\n\r])+'
      ;;
    6)
      # YAM 6 style - whitelist of permitted characters.
      url_re="${url_prefix}"'[A-Za-z0-9_\-\.\/\?=&;#\+%~@]+'
      ;;
    6withspace)
      # YAM 6 style, but also allow backslash-escaped spaces like 5 did.
      url_re="${url_prefix}"'(?:[A-Za-z0-9_\-\.\/\?=&;#\+%~@]|\\ )+'
      ;;
    *)
      echo "Unrecognised YAM version, options are 5, 6 or 6withspace" >&2
      return 2
      ;;
  esac

  # "-exec ... {} +" hands the file names to grep directly, so paths that
  # contain spaces, quotes or backslashes are not split the way they would be
  # by a "find | xargs" pipeline.
  find "$1" -name '*.yam' -exec grep -HnoP -- "$url_re" {} +
}

# Must stay the last command so its status becomes the script's exit status.
find_plain_urls "$DIR" "$YAMVER"
