Difference between revisions of "User:Philaltist"

From PyMOLWiki
Jump to navigation Jump to search
 
 
Line 19: Line 19:
 
     import time
 
     import time
 
     import os
 
     import os
    import string
 
  
  
Line 27: Line 26:
 
         """
 
         """
 
         date=time.localtime()
 
         date=time.localtime()
         fyear="%i" %(date[0])
+
         return "%d%02d%02d" % date[0:2]
        fmonth="%i" %(date[1])
 
        if date[1]<10:
 
            fmonth="0"+"%i" %(date[1])
 
        fday="%i" %(date[2])
 
        if date[2]<10:
 
            fday="0"+"%i" %(date[2])
 
        dateS=fyear+fmonth+fday
 
        return dateS
 
  
 
     def file2list(filename):
 
     def file2list(filename):
Line 73: Line 64:
  
 
         #actual stuff
 
         #actual stuff
         dayI=dayI-1
+
         dayI-=1
 
         if dayI==0:
 
         if dayI==0:
 
             dayI=31
 
             dayI=31
             monthI=monthI-1
+
             monthI-=1
 
             if monthI==0:
 
             if monthI==0:
 
                 monthI=12
 
                 monthI=12
                 yearI=yearI-1
+
                 yearI-=1
        dayS="%i" %(dayI)
 
        monthS="%i" %(monthI)
 
        yearS="%i" %(yearI)
 
        if dayI<10:
 
            dayS="0"+dayS
 
        if monthI<10:
 
            monthS="0"+monthS
 
 
         #and finally...
 
         #and finally...
        dateS=yearS+monthS+dayS
+
         return "%d%02d%02d" % (yearI, monthI, dayI)
         return dateS
 
  
 
     def findlastdir(dateS,f,days):
 
     def findlastdir(dateS,f,days):
Line 95: Line 78:
 
         list named "directoriesL".
 
         list named "directoriesL".
 
         """
 
         """
         directoriesL=['']
+
         directoriesL=[]
         j=p=0
+
         for p in range(days):
        while p<days:
 
 
             dateS=decrementdate(dateS)
 
             dateS=decrementdate(dateS)
 
             attempt="/pub/pdb/data/status/"+dateS
 
             attempt="/pub/pdb/data/status/"+dateS
 
             try :
 
             try :
 
                 f.cwd(attempt)
 
                 f.cwd(attempt)
                 directoriesL[j:j]=[attempt]
+
                 directoriesL.append(attempt)
                j=j+1
 
 
             except :
 
             except :
 
                 pass
 
                 pass
            p=p+1
 
        directoriesL.pop()
 
 
         return directoriesL
 
         return directoriesL
  
Line 132: Line 111:
 
             pdbidsL.append(iter[57:61])
 
             pdbidsL.append(iter[57:61])
 
         for iter in pdbidsL:
 
         for iter in pdbidsL:
             iter=string.lower(iter)
+
             iter=iter.lower()
 
         pdbidsL.sort()
 
         pdbidsL.sort()
 
         return pdbidsL
 
         return pdbidsL
Line 158: Line 137:
 
         extractL=[dictionnaryD,problemL]
 
         extractL=[dictionnaryD,problemL]
 
         for i in dictL:
 
         for i in dictL:
             tempS=string.lower(i[0:4])
+
             tempS=i[0:4].lower()
 
             for ii in pdbidsL:
 
             for ii in pdbidsL:
 
                 if ii == tempS:
 
                 if ii == tempS:
 
                     title=i[14:216]
 
                     title=i[14:216]
 
                     extractL[0][ii]=title
 
                     extractL[0][ii]=title
         if len(extractL[0].keys()) != len(pdbidsL):
+
         if len(extractL[0]) != len(pdbidsL):
 
             print "Dimension mismatch, seeking troublemaker..."
 
             print "Dimension mismatch, seeking troublemaker..."
 
             for i in pdbidsL:
 
             for i in pdbidsL:
 
                 equiv=0
 
                 equiv=0
                 for ii in extractL[0].keys():
+
                 for ii in extractL[0]:
 
                     if i==ii:
 
                     if i==ii:
                         equiv=equiv+1
+
                         equiv+=1
 
                 if equiv==0:
 
                 if equiv==0:
 
                     extractL[1].append(i)
 
                     extractL[1].append(i)
Line 189: Line 168:
 
         relevdicD={}
 
         relevdicD={}
 
         for i in keywL:
 
         for i in keywL:
             for elem in dictionnaryD.keys():
+
             for elem,temp in dictionnaryD.items():
                temp=dictionnaryD[elem]
+
                 if i in temp:
                 if temp.find(i) != -1:
 
 
                     relevdicD[elem]=temp
 
                     relevdicD[elem]=temp
 
         return relevdicD
 
         return relevdicD
Line 259: Line 237:
 
     line=diclengthS+" are relevant to you :\n"
 
     line=diclengthS+" are relevant to you :\n"
 
     reportL.append(line)
 
     reportL.append(line)
     for i in relevdicD.keys():
+
     for i,x in relevdicD.items():
         entry=i+" : "+relevdicD[i]+"\n"
+
         entry=i+" : "+x+"\n"
 
         reportL.append(entry)
 
         reportL.append(entry)
 
     problemS=""
 
     problemS=""
Line 276: Line 254:
 
     for elem in reportL:
 
     for elem in reportL:
 
         print elem
 
         print elem
         elem=elem+'\n'
+
         elem+='\n'
 
         report.writelines(elem)
 
         report.writelines(elem)
 
     report.close()
 
     report.close()

Latest revision as of 03:24, 23 February 2010

Hi everyone !

I have created a script because I was tired of browsing the PDB in endless searches for structures relevant to my field. This script takes as input a text file into which you have copied your favourite keywords, and the number of days you want to search back (by default, it looks at the structures added in the last 50 days). It generates a report text file that contains the PDB id and name of the relevant structures that have been recently added. All you need to do is add 'run pdbsurvey.py' to your startup file, and create a text file called 'keywords.txt' with your keywords, one per line. Then you're ready to go. Just type 'pdbsurvey' from within your PyMOL instance, and the program returns the report file. The PDB ftp server is updated every Tuesday.

from pymol import cmd

def pdbsurvey(days=50):

    """USAGE : pdbsurvey (<days>)
    Surveys the updates added to the PDB (ftp.rcsb.org) in the last
    50 days (or otherwise specified when calling this function) for
    entries that contain the words specified in the file
    keywords.txt.

    days -- how many days to look back. An integer or a string of
            digits is accepted (arguments typed at the PyMOL prompt
            reach the function as strings).
    Raises ValueError when days cannot be converted to an integer.
    """
    # The day counter is later handed to range(), which needs a real
    # integer, so normalise it once before anything else happens.
    days=int(days)
    print(days)

    import ftplib
    import time
    import os



    def todaymerge():
        """Returns today's local date as a pdb format string (yyyymmdd).
        """
        # strftime zero-pads month and day itself; formatting a slice of
        # the time tuple by hand needs exactly three values (year, month,
        # day) and fails with TypeError when given fewer.
        return time.strftime("%Y%m%d", time.localtime())

    def file2list(filename):
        """Low-level routine: reads a whole file and returns its
        content as a list of lines.

        The file is opened in binary mode and cut with splitlines(),
        so the returned lines carry no end-of-line characters.

        filename -- path of the file to read
        """
        fq=open(filename,'rb')
        try:
            linesS=fq.read()
        finally:
            # release the handle even when the read itself fails
            fq.close()
        return linesS.splitlines()

    def connect2pdb():
        """Opens an anonymous socket to ftp://ftp.rcsb.org
        """
        f=ftplib.FTP()
        f.connect ('ftp.rcsb.org')
        f.login ()
        print "Remote connection established","\n"
        return f

    def decrementdate(dateS):
        """given a string date (pdb format yyyymmdd)
        this routine returns a string of the day before.

        The real calendar is used, so month lengths and leap years are
        honoured (e.g. '20100301' gives '20100228').

        dateS -- string whose first eight characters are yyyymmdd
        Raises ValueError when dateS is not a valid calendar date.
        """
        import datetime   # local import: only this helper needs it

        #decompose dateS into integer components
        yearI=int(dateS[0:4])
        monthI=int(dateS[4:6])
        dayI=int(dateS[6:8])

        #actual stuff
        previous=datetime.date(yearI,monthI,dayI)-datetime.timedelta(days=1)

        #and finally...
        return "%d%02d%02d" % (previous.year, previous.month, previous.day)

    def findlastdir(dateS,f,days):
        """Puts the names of the "recent" directories in the
        list named "directoriesL".

        Starting from the day before dateS, the routine walks back one
        day at a time, `days` times, and tries to enter
        /pub/pdb/data/status/<date> on the server. Every directory that
        can be entered is kept.

        dateS -- starting date, pdb format (yyyymmdd)
        f     -- connected ftplib.FTP session (its working directory is
                 changed as a side effect)
        days  -- number of days to walk back (int, or string of digits)
        Returns directoriesL, most recent directory first.
        """
        directoriesL=[]
        for _ in range(int(days)):
            dateS=decrementdate(dateS)
            attempt="/pub/pdb/data/status/"+dateS
            try :
                f.cwd(attempt)
            except (ftplib.error_perm, ftplib.error_temp):
                # The server refused this directory (presumably nothing
                # was released on that day): skip it. Any other failure
                # is deliberately not hidden.
                continue
            directoriesL.append(attempt)
        return directoriesL

    def compilinfile(directoriesL,f):
        """lists all structures in the added.pdb files
        contained in the directories specified in directoriesL
        """
        command="RETR added.pdb"
        handle=open("donotedit.dat","wrb")
        for k in directoriesL:
            f.cwd(k)
            print "Currently in directory ",f.pwd()
            f.retrbinary(command,handle.write)
        handle.close()
        return len(directoriesL)

    def listparser():
        """Extracts the pdbids from donotedit.dat file,
        and stacks them into the list pdbidsL

        Each id is read from characters 57-60 of a line and converted
        to lower case. Lines too short to hold an id are skipped.
        Returns pdbidsL sorted alphabetically.
        """
        pdbidsL=[]
        for line in file2list("donotedit.dat"):
            # lower() returns a new string, so its result has to be the
            # value that is stored
            pdbid=line[57:61].lower()
            if pdbid:
                pdbidsL.append(pdbid)
        pdbidsL.sort()
        return pdbidsL

    def currentrelease(f):
        """Stores the content of cmpd_res.idx file
        This file contains the equivalencies pdbid<->title
        for all current entries of the PDB.
        """
        command="RETR cmpd_res.idx"
        f.cwd("/pub/pdb/derived_data/index/")
        print "Currently in directory ",f.pwd()
        fq=open("dictionnary.dat",'wrb')
        f.retrbinary(command,fq.write)
        fq.close()
        dictL=file2list("dictionnary.dat")
        return dictL

    def extract(pdbidsL,dictL):
        """Populates dictionnaryD with pdb entries found in the
        latest releases.

        pdbidsL -- pdb ids of the latest releases; they are compared with
                   the lower-cased ids of dictL, so they must be lower case
        dictL   -- lines of the index file: characters 0-3 hold the id,
                   characters 14-215 the title
        Returns the list [dictionnaryD, problemL] where dictionnaryD maps
        id -> title and problemL holds the ids of pdbidsL for which no
        title was found.
        """
        wanted=set(pdbidsL)   # constant-time membership test per index line
        dictionnaryD={}
        for line in dictL:
            pdbid=line[0:4].lower()
            if pdbid in wanted:
                dictionnaryD[pdbid]=line[14:216]
        # ids that never showed up in the index, in pdbidsL order
        problemL=[pdbid for pdbid in pdbidsL if pdbid not in dictionnaryD]
        if len(dictionnaryD) != len(pdbidsL):
            # single parenthesised argument: prints the same text on
            # Python 2 and is also valid on Python 3
            print("Dimension mismatch, seeking troublemaker...")
        return [dictionnaryD,problemL]

    def disconnectpdb(f):
        """Diconnects the current ftp session
        """
        f.quit()
        print "Remote connection terminated","\n"
        return f

    def releventries(dictionnaryD):
        """Generates a cleaned dictionnary with only entries
        that have one or more keywords specified in the local
        user-defined keywords.txt file

        dictionnaryD -- dictionnary pdbid -> title
        Returns a new dictionnary holding the entries whose title
        contains at least one keyword (case-sensitive substring test).
        """
        # A blank keyword would match every title (the empty string is
        # contained in any string), so blank lines are ignored.
        keywL=[k for k in file2list("keywords.txt") if k.strip()]
        relevdicD={}
        for elem,temp in dictionnaryD.items():
            for i in keywL:
                if i in temp:
                    relevdicD[elem]=temp
                    break   # one matching keyword is enough
        return relevdicD

    def diskcleanup(filelist=("donotedit.dat","dictionnary.dat")):
        """Lo-level disk cleanup: deletes the temporary files generated
        by the program, without the user.

        filelist -- names of the files to delete
        Returns the string "clean".
        """
        for filename in filelist:
            try:
                # os.remove works on every platform and needs no shell
                os.remove(filename)
            except OSError:
                # Best effort: a file that is already gone or cannot be
                # removed must not abort the survey.
                pass
        return "clean"




    print "Welcome in the auto-PDB updater !"

    print "Survey of updates made since",days,"days ago."
    
    print "Acquisition of local time..."
    dateS=todaymerge()                                                 #Initializes dateS
    print "today is ",dateS
    print "Connecting to remote ftp server..."
    f=connect2pdb()                                                    #Connect anonymously to ftp.rcsb.org

    print "Acquisition of latest added remote directories..."
    directoriesL=findlastdir(dateS,f,days)                             #Lists recent directories in directoriesL
    if len(directoriesL)==0:
        print "No updates have been found since",days,"ago. Starting over with 50 days ago."
        directoriesL=findlastdir(dateS,f,50)

    print "Acquisition of latest addedremote files..."
    updatesnumberI=compilinfile(directoriesL,f)                        #Concatenates the corresponding added.pdb into donotedit.dat

    print "Parsing of latest entries..."
    pdbidsL=listparser()                                               #Recent names now present in the pdbidsL list (one name per element)

    print "Acquisition of the current pdb distribution..."
    dictL=currentrelease(f)                                            #Populates dictL with the current entries of the PDB

    print "Parsing of the current pdb distribution into [code,title] tuples..."
    extractL=extract(pdbidsL,dictL)                                    #generates the dictionnary of latest releases key:PDBid ; definition:pdbtitle

    print "Disconnection from the remote ftp server..."
    f=disconnectpdb(f)                                                 #Closes the ftp instance

    print "Extraction of the relevant entries..."
    relevdicD=releventries(extractL[0])                               #Generates a subset of dictionnary D with criterion being "has keywords contained in keywords.txt in its title"

    print "Cleaning program-generated temporary files..."
    clean=diskcleanup()                                                #Cleans the mess generated by the program

    reportL=[]
    reportL.append("\n")
    reportL.append("###############REPORT########################################\n")
    reportL.append("\n")
    lendictS="%i" %(len(dictL))
    chmilblik = 'The current pdb version (as of '+dateS+") has "+lendictS+" entries.\n"
    reportL.append(chmilblik)
    line="The most recent directory is : "+directoriesL[0]+".\n"
    reportL.append(line)
    updatesnumberS="%i" %(updatesnumberI)
    entriesnumber="%i" %(len(extractL[0].keys()))
    line="The "+updatesnumberS+" last updates ("+entriesnumber+" entries) have been examined.\n"
    reportL.append(line)
    diclengthS="%i" %(len(relevdicD.keys()))
    line=diclengthS+" are relevant to you :\n"
    reportL.append(line)
    for i,x in relevdicD.items():
        entry=i+" : "+x+"\n"
        reportL.append(entry)
    problemS=""
    for i in extractL[1]:
        problemS=i+";"+problemS
    problemS="["+problemS
    problemS=problemS.strip(";")
    problemS=problemS+"]"
    lineS="The entries "+problemS+" raised problems,"
    reportL.append(lineS)
    reportL.append("they should be examined manually.")
    reportL.append("\n")
    reportL.append("###############END OF REPORT#################################\n")
    report=open("report.aut","w")
    for elem in reportL:
        print elem
        elem+='\n'
        report.writelines(elem)
    report.close()
    command2='start keywords.txt'
    command3='start report.aut'
    os.system(command2)
    os.system(command3)

# Register pdbsurvey as a PyMOL command so that it can be called by name
# ("pdbsurvey") from the PyMOL prompt.
cmd.extend("pdbsurvey",pdbsurvey)

Thank you for any feedback or suggestions for improving it.