#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#--------------------------------------------------------------------------------
#monitor_website_links.py v1.1, Copyright Bjoern Olausson
#--------------------------------------------------------------------------------
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 2 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#To view the license visit
#http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
#or write to
#Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
#
# Periodically fetches URL, collects every link whose text matches
# SEARCH_FOR_REGEXP and, whenever that set of links changes, writes it to
# SAVE_TO and/or mails it to MAIL_TO.

import time, random, mechanize, sys, smtplib, os, re
from datetime import timedelta, datetime
# The lower-case email.mime.* / email.encoders names exist on Python 2.5+ and
# Python 3; the old email.MIMEMultipart-style names are Python 2 only.
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email import encoders

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

# URL to monitor
URL = "http://www.example.com/"

# Compile a regular expression to search for in the link text
SEARCH_FOR_REGEXP = re.compile(r"RFC", re.I)

# Remove something e.g. session ID from the URL with this regex
# NOTE(review): ".*" is greedy, so everything from the first "s=" up to the
# LAST "&" of the link is removed. Tighten it (e.g. "s=[^&]*&") if links
# carry further parameters that must survive.
REMOVE_FROM_URL = re.compile(r"s=.*&")

# Run the script for X days
MONITOR_FOR_DAYS = 30

# How long to sleep between checks (in Seconds)
SLEEP_SECONDS_BETWEEN_CHECKS = 60*30

# Set to False if you do not want to store the results in a file
# Otherwise enter a path to a file e.g. "/home/USER/search_results.txt"
SAVE_TO = False

# Set to False if you do not want to mail the results
# Otherwise enter your mailaddress e.g. "user@somedomain.tld"
MAIL_TO = False

# GMAIL username and password
gmail_user = ""
gmail_pwd = ""


def mail(text, attach="false"):
    '''Send text to MAIL_TO through smtp.gmail.com, logging in as gmail_user.

    Based on
    http://kutuma.blogspot.com/2007/08/sending-emails-via-gmail-with-python.html

    text   -- body of the mail; an exception instance is converted with str().
    attach -- optional path of a file to attach. The historical default, the
              string "false", as well as any falsy value means "no attachment".

    Errors raised by smtplib or while reading the attachment propagate to the
    caller.
    '''
    if isinstance(text, Exception):
        # MIMEText needs a string payload, not an exception object.
        text = str(text)

    msg = MIMEMultipart()
    msg['From'] = gmail_user
    msg['To'] = MAIL_TO
    msg['Subject'] = "Found the following for your search"
    msg.attach(MIMEText(text))

    if attach and attach != "false":
        part = MIMEBase('application', 'octet-stream')
        # "with" closes the attachment even if reading it fails.
        with open(attach, 'rb') as attachment:
            part.set_payload(attachment.read())
        encoders.encode_base64(part)
        part.add_header('Content-Disposition',
                        'attachment; filename="%s"' % os.path.basename(attach))
        msg.attach(part)

    mailServer = smtplib.SMTP("smtp.gmail.com", 587)
    try:
        mailServer.ehlo()
        mailServer.starttls()
        mailServer.ehlo()
        mailServer.login(gmail_user, gmail_pwd)
        mailServer.sendmail(gmail_user, MAIL_TO, msg.as_string())
    finally:
        # Should be mailServer.quit(), but that crashes...
        # Closing in "finally" releases the connection on failures as well.
        mailServer.close()


def report_error(error):
    '''Print error and, if MAIL_TO is set, also mail it.

    A failure to send the mail is printed instead of raised, so that a broken
    mail setup cannot terminate the monitoring loop from inside an error
    handler.
    '''
    print(error)
    if not MAIL_TO:
        return
    try:
        mail(str(error))
    except (smtplib.SMTPException, EnvironmentError) as mail_error:
        print(mail_error)


def format_link(link):
    '''Return "<link text>\\n<absolute URL>" for one mechanize link.

    REMOVE_FROM_URL is stripped from the link's URL first. The remainder is
    resolved against URL with urljoin(), so relative, root-relative and
    already absolute links all yield a usable address.
    '''
    cleaned_url = REMOVE_FROM_URL.sub("", link.url)
    return "\n".join([link.text, urljoin(URL, cleaned_url)])


f = open(SAVE_TO, 'w') if SAVE_TO else None

start_time = datetime.now()
stop_time = start_time + timedelta(days=int(MONITOR_FOR_DAYS))

br = mechanize.Browser()
br.set_handle_gzip(False)
br.set_handle_referer(True)
br.set_handle_redirect(True)
br.set_handle_equiv(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

UA_DICT = {
    "OPERA": "Opera/9.80 (X11; Linux x86_64; U; Opera Next; en) Presto/2.8.131 Version/11.50 Gentoo",
    "KONQUEROR": "Mozilla/5.0 (compatible; Konqueror/4.5; FreeBSD) KHTML/4.5.4 (like Gecko)",
    "MICROB": "Mozilla/5.0 (X11; U; Linux armv7l; en-GB; rv:1.9.2a1pre) Gecko/20090514 Firefox/3.0 Tablet browser 0.9.7 RX-34",
}

# Last result that was saved/mailed; used to report only changes.
MESSAGE = ""

try:
    while datetime.now() < stop_time:
        # Send a randomly chosen user agent string with every check.
        USER_AGENT_STRING = random.choice(list(UA_DICT.values()))
        br.addheaders = [('User-agent', USER_AGENT_STRING)]

        try:
            br.open(URL)
            br.response().read()
            # Materialise the links inside the "try": the iterable returned by
            # br.links() may be lazy, in which case errors would otherwise
            # only surface later, outside of any handler.
            LINKS_FOUND = list(br.links(text_regex=SEARCH_FOR_REGEXP))
        except Exception as e:
            # Top-level boundary of a long-running monitor: report the problem
            # and try again after the regular sleep.
            report_error(e)
        else:
            LINKTEXT = [format_link(LINK) for LINK in LINKS_FOUND]
            RESULT = "\n".join(LINKTEXT)
            if LINKTEXT and MESSAGE != RESULT:
                MESSAGE = RESULT
                if f is not None:
                    f.write(str(datetime.now())+"\n"+MESSAGE+"\n\n")
                    f.flush()
                if MAIL_TO:
                    mail(str(datetime.now())+"\n"+MESSAGE)

        print("sleeping: %s" % SLEEP_SECONDS_BETWEEN_CHECKS)
        time.sleep(float(SLEEP_SECONDS_BETWEEN_CHECKS))
finally:
    if f is not None:
        f.close()