################################################################
# Scrapes the ZeroPunctuation .flv links from
# EscapistMagazine.com.
#
################################################################
# Coded by: dbr (http://neverfear.org)
# Coded on: 01/04/2008
#
################################################################
# Short usage:
#
# Run this script, copy the output into a Terminal (on a machine with wget, 
# bash is what I used, but even cmd.exe should work, assuming wget is in your PATH)
# Then, wget will grab all the .flv files.
# I then used VisualHub on OS X to transcode the files to iPod compatible .mp4 files.
#
# If you are running this in the future (after 03/04/2008), you may need to increase
# NUM_PAGES to grab them all.
#
################################################################
# Long, rambly tech-notes:
#
# It uses /articles/view/editorials/zeropunctuation.1 .2 .3 etc
# If this changes, chances are things will break, 
# and I don't intend to fix it in any timely manner, so do it yourself!

#
# Once it finds all the links to the ZP pages, it looks for script tags,
# then it looks for the following section:
#
# <script language="javascript">var vars = {file:'selfserve300.download.videoegg.com/gid38[...]FsY',
#
# Simply prepending http://, and appending _high.flv results in the link to the FLV file
#
# Then we do some vile-hackery to add the "episode number" to the file 
# name (which was grabbed from an alt tag somewhere)
#

#
# The code was thrown together far too late and I didn't spend long on it.
# It's rather horrible code, it probably has many bugs, and will break
# if certain bits of the HTML change (common problem with screen-scraping)..
# And there's very little error-catching....
# ..but.. it works. I would rewrite it, but I don't think it's that important..
#
# Oh, it outputs a wget command, like the following:
#
# wget  http://the.flv/url.flv -O "Zero Punctuation - [03] - Episode Name.flv"
#
# If you don't want to use wget, either edit the out_command = "..." line, or
# just throw the output into a text editor and find/replace it away.
#
# I think the output-file-name may be in reverse order (opposite of the 
# order of release), as I threw it in at the last minute.. If so, I suggest 
# you manually edit the output, or mess around with the script..
#




################################################################
# Huge config section

# Number of index pages to scrape, counting up from
# http://www.escapistmagazine.com/articles/view/editorials/zeropunctuation.1
# If the last page is number 4, change this line to: NUM_PAGES = 4
NUM_PAGES = 3


################################################################
# Main grab-page-full-of-ZP-flvs function
import urllib,re
from BeautifulSoup import BeautifulSoup
# Matches the player setup found in a <script> tag on an episode page, e.g.
#   var vars = {file:'selfserve300.download.videoegg.com/gid38[...]FsY',
# and captures the host/path between the quotes.
_VIDEO_FILE_RE = re.compile(r"file:'(.*?\.download\.videoegg\.com.*?)'")


def _shell_quote(text):
	"""Return text wrapped in single quotes for pasting into a POSIX shell."""
	# An embedded single quote is written as: close quote, escaped quote,
	# reopen quote. Without this a title such as "Assassin's Creed" would
	# terminate the quoted argument early and corrupt the command.
	return "'" + text.replace("'", "'\\''") + "'"


def _episode_filename(alt_text, number):
	"""Build 'Series - [NN] - Episode.flv' from a link's alt text."""
	# Split on the first colon only, so a hyphen inside the episode title
	# does not truncate it.
	series, _, episode = alt_text.replace("&", "and").partition(":")
	# Any further colons become " -", as they are awkward in file names.
	episode = episode.strip().replace(":", " -")
	return "%s - [%02d] - %s.flv" % (series.strip(), number, episode)


def grabpage(url):
	"""Print one wget command per Zero Punctuation episode linked from url.

	url is fetched and parsed with BeautifulSoup. Every <div class="headline">
	is searched for an <a> whose alt attribute matches 'Zero Punctuation:.+$';
	the first such link is followed and its <script> tags are scanned for the
	videoegg file reference. The FLV address is "http://" + that reference +
	"_high.flv".

	Output goes to stdout, one line per episode, in the form
		wget '<flv url>' -O 'Zero Punctuation - [NN] - <episode>.flv'
	Nothing else is printed, so the output can be pasted into a shell as-is.

	The episode counter starts at 1 on every call, so numbers repeat from
	one index page to the next. Returns None. There is no error handling:
	network and parsing errors propagate to the caller.
	"""
	episode_number = 1
	index_soup = BeautifulSoup(urllib.urlopen(url).read())
	for headline in index_soup.findAll('div', {'class': 'headline'}):
		episode_links = headline.findAll('a', alt=re.compile('Zero Punctuation:.+$'))
		if not episode_links:
			continue
		link = episode_links[0]
		episode_soup = BeautifulSoup(urllib.urlopen(link['href']).read())
		for script_tag in episode_soup.findAll('script'):
			found = _VIDEO_FILE_RE.findall(str(script_tag))
			if not found:
				continue
			flv_url = "http://" + found[0] + "_high.flv"
			out_name = _episode_filename(link['alt'], episode_number)
			out_command = "wget %s -O %s" % (_shell_quote(flv_url), _shell_quote(out_name))
			# Single-argument print(...) behaves the same under the
			# Python 2 print statement and the Python 3 function.
			print(out_command)
			episode_number += 1
			# One command per episode page: stop at the first matching
			# script so a repeated player snippet is not emitted twice.
			break
#end grabpage


################################################################
# Launch the above function on all specified pages.
# NUM_PAGES is taken from the config section near the top of the file and
# is deliberately not reassigned here, so editing it there takes effect.
allpages=['http://www.escapistmagazine.com/articles/view/editorials/zeropunctuation.%d' %(i,) for i in range(1,NUM_PAGES+1)]
for cpage in allpages:
	grabpage(cpage)

# The End.
# Really.
