#!/usr/local/bin/python

from urllib2 import build_opener, HTTPCookieProcessor, Request
from urllib import urlencode, urlretrieve
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
import re
import time
import subprocess
import sys
#import cgitb; cgitb.enable(display=0, logdir="/tmp")

# Root URL of the wiki; every category and page address used by this script
# is built by appending to it.
url_root = "http://www.yeastpheromonemodel.org"

# Wiki page name for the model summary.
# NOTE(review): not referenced anywhere in the visible part of this script.
model_summary_page = 'Extracted_model_log'

import sys

# The extraction date is the single required command-line argument.  It names
# the output folder as well as the generated .bngl and .zip files, so fail
# with a readable message instead of a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: python <this script> <date>")
date = sys.argv[1]

# Folder (relative to docPath) where this run's documents will be stored.
docFolder = date + '/'
# Absolute directory on disk that holds all ModelDocs folders.
docPath = '/data/www/html/alpha/ModelDocs/'


# Build an opener that keeps cookies between requests, so the login below
# carries over to every later page fetch made through it.
opener = build_opener(HTTPCookieProcessor)

# Log in to the wiki.  The password has been removed from this copy of the
# script, which is why wpPassword is an empty string.
login_fields = dict(wpName="Wiki parser",wpPassword="",action="submitlogin")
login_request = Request(url_root + "/index.php?title=Special:Userlogin",
                        urlencode(login_fields))
f = opener.open(login_request)
f.close()





# Category pages the parser needs to cover, as full URLs.  Each entry below is
# the query part of one category page; url_root is prepended to all of them.
categories = [
    url_root + category_query
    for category_query in (
        "/index.php?title=Category:Yeast_Pheromone_Response_Model",
        "/index.php?title=Category:Reactions_-_Yeast_Pheromone_Response_Model",
        "/index.php?title=Category:Species_-_Yeast_Pheromone_Response_Model",
        "/index.php?title=Category:Parameters_-_Yeast_Pheromone_Response_Model",
    )
]




# url_list receives one list of page names per entry of `categories`, in the
# same order.
url_list = []
for category_url in categories:
    temp_url_list = []
    # A category page only lists 200 links, so keep following the "next 200"
    # link until the category has no further page.
    while category_url is not None:
        print("Opening category page: " + category_url)

        # Retrieve the current category page; close the response even if the
        # read fails.
        f = opener.open(category_url)
        try:
            doc = f.read()
        finally:
            f.close()

        # Parse the HTML from string
        soup = BeautifulSoup(doc)

        # Find the link to the next 200 members of the category, if there is
        # one: locate the text "previous 200" and move forward to its next
        # sibling <a>.  Only when that link is titled "next 200" does it lead
        # to another category page; otherwise this was the last page.
        next_200_a = None
        previous_200_text = soup.find(text=re.compile("previous 200"))
        if previous_200_text is not None:
            next_200_a = previous_200_text.findNextSibling('a')

        if next_200_a is not None and next_200_a.string == "next 200":
            # The href may carry HTML-escaped ampersands; undo that so the
            # URL can be requested directly.
            category_url = re.sub('&amp;', '&', url_root + next_200_a['href'])
        else:
            category_url = None

        # Collect the page names linked from the tables of the current page.
        for table in soup.findAll('table'):
            for a_tag in table.findAll('a', href=True):
                # Take the page name from the url instead of from the a_tag
                # title because some page names have 'bad' characters that
                # appear differently in the url.
                href = a_tag['href']
                wiki_pos = href.find('wiki/')
                if wiki_pos == -1:
                    # Not a link to a wiki page (for example an external
                    # link); previously this aborted the run with ValueError.
                    continue
                page_name = href[wiki_pos + 5:]
                temp_url_list.append(page_name)

    url_list.append(temp_url_list)

# Put all the urls together into one big list, category by category.
# Iterating over url_list (rather than indexing entries 0..3) keeps this
# correct when categories are added to or removed from `categories`.
full_url_list = []
for category_page_names in url_list:
    full_url_list.extend(category_page_names)

# Each section of the BNGL file is accumulated in its own string, starting
# with that section's "begin ..." header.  The strings are written to the
# output file once every page has been processed.

# Counters for the parameters, molecule types and reaction rules found.
param_number = species_number = rule_number = 0

parameters_section = 'begin parameters\n'
molecule_types_section = '\n\n\nbegin molecule types\n'
seed_species_section = '\n\n\nbegin seed species\n'
reaction_rules_section = '\n\n\nbegin reaction rules\n'


#Now we will go through each url, extracting the relevant model elements and formatting
#them for the BNGL file.  We will also make a static HTML copy of each page, so that
#there will be unchanging documentation associated with each model.


def _tag_text(tag):
    """Join the children of a parsed tag into one space-separated string."""
    return " ".join(["%s" % child for child in tag])


def _strip_brackets(text):
    """Remove every '[' and ']' (the wiki's '[[' ... ']]' link markup)."""
    text = re.sub(r'\[', '', text)
    return re.sub(r'\]', '', text)


def _format_parameter(param):
    """Return the BNGL text for one <modelparameter> tag."""
    param_string = _tag_text(param)
    # replace the superscript tags with "^( )"
    param_string = re.sub('<sup>', '^(', param_string)
    param_string = re.sub('</sup>', ')', param_string)
    param_string = _strip_brackets(param_string)
    # remove all spaces
    return re.sub(' ', '', param_string)


def _format_declaration(tag):
    """Return the BNGL text for a molecule-type or seed-species tag."""
    # drop the space that follows each comma
    text = re.sub(', ', ',', _tag_text(tag))
    return _strip_brackets(text)


def _format_reaction_rule(eqn, rxn):
    """Return the BNGL text for one <modelrxnrule> inside a <modelrxnfull>.

    eqn is the rule tag; rxn is the enclosing full-reaction tag, whose
    <modelrxnparam> tags are appended on the line after the equation.
    The substitutions below depend on each other and must stay in this order.
    """
    rxn_string = _tag_text(eqn)

    # remove any newlines (and the spaces following them) within the equation
    rxn_string = re.sub(r'\n[ ]*', '', rxn_string)

    # add a '\' and a newline to the end of the reaction equation
    rxn_string = rxn_string + ' \\ \n\t'

    # add parameters to next line
    for rxn_param in rxn.findAll("modelrxnparam"):
        rxn_string = rxn_string + '\t' + _tag_text(rxn_param)

    # remove all spaces
    rxn_string = re.sub(' ', '', rxn_string)
    # add spaces on either side of "+" (when "+" not preceded by "!")
    rxn_string = re.sub(r'(?<!!)\+', ' + ', rxn_string)
    # add space before "\"
    rxn_string = re.sub(r'\\', r' \\', rxn_string)

    # add space before reaction arrow
    rxn_string = re.sub('(?<!<)->', ' ->', rxn_string)
    rxn_string = re.sub('<->', ' <->', rxn_string)

    # add "\", newline, and tabs after the arrow
    rxn_string = re.sub('->', '-> \\ \n\t\t', rxn_string)

    rxn_string = _strip_brackets(rxn_string)

    # add tab and newline characters
    rxn_string = '\t' + rxn_string + '\n\n'
    # NOTE(review): the '+' here is a regex quantifier, so this collapses a
    # tab followed by two or more spaces into a tab.  If the intent was to
    # drop a literal " + " after a tab, the '+' needs escaping.  Left as is.
    rxn_string = re.sub('\t + ', '\t', rxn_string)
    return rxn_string


def _static_file_name(name):
    """Turn a wiki page name into the base file name of its static copy."""
    # get rid of any characters that are illegal in file names
    name = re.sub(r'[/\\: ]', '_', name)
    # parentheses are turned into %28 and %29 by the wiki; convert them back
    name = re.sub('%28', '(', name)
    return re.sub('%29', ')', name)


def _rewrite_links(soup, tag_name, extracted_pages, encode_parens):
    """Re-point the href of every `tag_name` tag in soup.

    A link to a page that is being extracted is changed to point at the
    static HTML copy of that page.  Any other link that starts with '/' is
    made to point back to the wiki by prefixing url_root.  Links to pages off
    the wiki are left alone.

    encode_parens: when true, '(' and ')' in the href are compared as
    %28 / %29, which is how they appear in the page names of extracted_pages.
    """
    for link in soup.findAll(tag_name, href=True):
        temp_url = re.sub('^/wiki/', '', link['href'])
        temp_url = re.sub('[ ]', '_', temp_url)
        if encode_parens:
            temp_url = re.sub(r'\(', '%28', temp_url)
            temp_url = re.sub(r'\)', '%29', temp_url)

        # if it's a link to a page in the wiki that we're extracting...
        if temp_url in extracted_pages:
            # remove '/wiki/' from link, and add '.html' to the end
            local_name = re.sub('^/wiki/', '', link['href'])
            link['href'] = _static_file_name(local_name) + ".html"

        # if it's a link to a page in the wiki that we're not extracting...
        if link['href'].startswith('/'):
            # add the root url to the front of the link
            link['href'] = url_root + link['href']


# Set of the extracted page names, so that each link can be checked without
# scanning the whole list.
extracted_pages = set(full_url_list)

for page_name in full_url_list:

    print(page_name)

    # open the raw wiki-text version of the current page
    f = opener.open(url_root + "/index.php?title=" + page_name + "&action=raw")
    try:
        doc = f.read()
    finally:
        f.close()

    # Parse the wiki text from string
    # NOTE: BeautifulSoup makes tag names all lowercase!
    soup = BeautifulSoup(doc)

    # find each parameter declared in the page (although with
    # current conventions there should only be one per page)
    for param in soup.findAll('modelparameter'):
        param_number = param_number + 1
        parameters_section = (parameters_section + '\t'
                              + _format_parameter(param) + '\n')

    # find each molecule type declared in the page (although with
    # current conventions there should only be one per page)
    for molec in soup.findAll("modelmoleculetype"):
        species_number = species_number + 1
        molecule_types_section = (molecule_types_section + '\t'
                                  + _format_declaration(molec) + '\n')

    # find each seed species declared in the page (although with
    # current conventions there should only be one per page)
    for species in soup.findAll("modelseedspecies"):
        seed_species_section = (seed_species_section + '\t'
                                + _format_declaration(species) + '\n')

    # if the current page contains reaction rules, add the
    # name of the page to the reaction rules section
    if soup.find("modelrxnfull"):
        reaction_rules_section = reaction_rules_section + '\n#' + page_name + '\n\n'

    # for each full reaction definition, format every reaction equation in it
    for rxn in soup.findAll("modelrxnfull"):
        for eqn in rxn.findAll("modelrxnrule"):
            rule_number = rule_number + 1
            reaction_rules_section = (reaction_rules_section
                                      + _format_reaction_rule(eqn, rxn))

    ## Now make a static HTML copy of the current page

    # open the rendered page
    f = opener.open(url_root + "/index.php?title=" + page_name)
    try:
        doc = f.read()
    finally:
        f.close()

    # Parse the HTML from string
    soup = BeautifulSoup(doc)

    # Remove un-needed sections of the page, in this order.
    for unwanted_name, unwanted_attrs in (
            ('head', {}),
            ('div', {"id": "jump-to-nav"}),
            ('div', {"class": "printfooter"}),
            ('div', {"id": "column-one"}),
            ('div', {"class": "editsection"}),
            ('script', {"type": "text/javascript"}),
    ):
        for junk_item in soup.findAll(unwanted_name, unwanted_attrs):
            junk_item.extract()

    # Fix up ordinary links.
    _rewrite_links(soup, 'a', extracted_pages, False)

    # So that imagemaps will work properly, do the same for area tags.
    # Parentheses appear literally in imagemap hrefs, so they have to be
    # encoded before comparing with the extracted page names.
    _rewrite_links(soup, 'area', extracted_pages, True)

    # We also want to download any images from the wiki.  Images without a
    # src attribute are skipped.
    for img in soup.findAll('img', src=True):
        # if the image is hosted on the wiki
        if img['src'].startswith('/images'):
            # get the filename (remove everything before the last slash)
            new_src = img['src'][img['src'].rfind('/') + 1:]
            # save the image to disk
            urlretrieve(url_root + img['src'], docPath + docFolder + new_src)
            # update the link on the page
            img['src'] = new_src

    # file name with illegal characters removed, plus the HTML extension
    save_file_name = _static_file_name(page_name) + ".html"

    outputFile = open(docPath + docFolder + save_file_name, 'w')
    try:
        outputFile.write("%s" % soup)
    finally:
        outputFile.close()




# Close each BNGL section with its matching "end ..." line.
# NOTE(review): 'end  molecule types' has two spaces after 'end', unlike the
# 'begin molecule types' header; kept exactly as it was.
parameters_section += 'end parameters\n'
molecule_types_section += 'end  molecule types\n'
seed_species_section += 'end seed species\n'
reaction_rules_section += 'end reaction rules\n'



# Pieces of the BNGL file, in the order they are written: a header comment,
# the four model sections, then the BNG actions that generate the network
# and create an SBML file.
bngl_chunks = (
    '#Model extracted on ' + date + '\n',
    '#For model documentation, see ' + url_root + '/ModelDocs/' + docFolder + '\n\n\n\n',
    parameters_section,
    molecule_types_section,
    seed_species_section,
    reaction_rules_section,
    '\n\n\n\n\n',
    '# Generation of the species and reactions\n',
    '# with pheromone concentration set to zero.\n',
    'generate_network({overwrite=>1});\n',
    '# Write unequilibrated model to xml (model.xml)\n',
    'writeSBML({prefix=>"model"});\n',
)

# The output file is named after the extraction date; close it even if one of
# the writes fails.
outputFile = open(docPath + docFolder + "model-" + date + ".bngl", 'w')
try:
    for chunk in bngl_chunks:
        outputFile.write(chunk)
finally:
    outputFile.close()



# Zip this run's document folder into <docPath><docFolder><date>.zip.
archive_path = docPath + docFolder + date + ".zip"
print(archive_path)
print(docPath + docFolder)

# Run zip from docPath so the archive holds paths relative to it.  The command
# is passed as an argument list (no shell), so a date containing spaces or
# shell metacharacters cannot alter the command, and zip is never run from
# the wrong directory.
try:
    retcode = subprocess.call(["zip", "-r", archive_path, docFolder], cwd=docPath)
except OSError:
    # zip could not be started (not installed, or docPath is missing); use
    # the status a shell reports for "command not found".
    retcode = 127
if retcode != 0:
    sys.stderr.write("zip exited with status %d\n" % retcode)


