Sunday, April 12, 2009

Scraping Amsat Satellite Status Data

I recently became interested in scraping the satellite summary and band information from the Amsat website. As usual, Python and BeautifulSoup to the rescue! Unfortunately, the HTML is not structured in a very friendly way when it comes to parsing out the tags of interest. However, after a brief analysis, I figured out most of the rules to apply and was able to dump the data of interest. If Amsat decides to change the structure of their satellite status pages, this code will need to be adjusted.
import types
import urllib
from BeautifulSoup import BeautifulSoup

def GetSatelliteSummaryTitles(soup):
    """Collect the satellite summary titles (oscar designation, oscar number, ...).

    soup -- parsed page object exposing a BeautifulSoup-style ``findAll``.

    Looks up every ``td`` matching align='right', valign='top' and
    nowrap=None, takes the first child of the ``<b>`` tag inside each one,
    and returns those labels as a list with spaces and colons removed.
    """
    title_cells = soup.findAll(
        'td', {'align': 'right', 'valign': 'top', 'nowrap': None})
    # NOTE(review): the ' ' literal may have been '&nbsp;' before this listing
    # was rendered as HTML -- confirm against the original script.
    return [
        cell.find('b').contents[0].replace(' ', '').replace(':', '')
        for cell in title_cells
    ]

def GetSatelliteSummaryValues(soup):
    """Collect the satellite summary values (AO-51, 11.140 Kg, ...).

    soup -- parsed page object exposing a BeautifulSoup-style ``findAll``.

    Looks up every ``td`` matching valign='top' and nowrap=None. Cells that
    contain a ``<b>`` tag are skipped; for every other cell the first child
    is appended, untouched, to the returned list.
    """
    values = []
    for cell in soup.findAll('td', {'valign': 'top', 'nowrap': None}):
        if cell.find('b') is not None:
            # Cells with a bold child do not contribute a value.
            continue
        values.append(cell.contents[0])
    return values

# Return a dictionary containing mode data: each key is a mode heading read
# from the page and each value is the list of strings collected under it.
#
# NOTE(review): this listing has lost its indentation, so the nesting of the
# statements below (in particular which `if` the first `else:` belongs to)
# cannot be recovered from the text alone -- confirm against the original
# script before re-indenting.
def GetModeData(soup):
# Take every <td> inside the first table whose width attribute is '75%'.
tds = soup.findAll('table',{'width':'75%'})
tds = tds[0].findAll('td')
# NOTE(review): this counter is incremented in the loop below but is never
# read before the clean-up loop rebinds i.
i=0

# Heading under which subsequent cells are filed; None until one is seen.
currentmode = None
modedata = {}

# The first cell of the table is skipped.
for td in tds[1:]:
# NOTE(review): the ' ' literal may have been '&nbsp;' before this listing
# was rendered as HTML -- confirm.
if not ' ' in td:
if not ('valign','top') in td.attrs:
b = td.find('b')
span = td.find('span')
# A cell holding both a <b> and a <span> is examined as a heading cell.
if (not b is None) and (not span is None):
# 'Broadcast:' and 'BBS:' headings never become the current mode.
if (not 'Broadcast:' == b.contents[0]) and (not 'BBS:' == b.contents[0]):
currentmode = b.contents[0]
# First time this heading is seen: start an empty list for it.
if not modedata.has_key(currentmode):
modedata[currentmode] = []
# NOTE(review): presumably pairs with the <b>/<span> check above, i.e. it
# handles the non-heading cells -- confirm.
else:
# Cells whose first child contains 'Callsign(s)' are skipped.
if 'Callsign(s)' in td.contents[0]:
continue
else:
# Strip colons from the cell's first child and file it under the current mode.
cleanmode = td.contents[0].replace(':','')
modedata[currentmode].append(cleanmode)
i+=1

# Clean-up pass: cut each list short at the first entry whose type is
# types.InstanceType (presumably a BeautifulSoup tag object rather than
# text -- confirm), keeping only the entries before it.
for key in modedata:
for i in xrange(len(modedata[key])):
item = modedata[key][i]
if type(item) == types.InstanceType:
modedata[key] = modedata[key][0:i]
break

return modedata

def AppendStatusToFile(outfile, summaryMap, modeMap):
    """Append one satellite's summary and mode data to an open text file.

    outfile    -- writable file-like object (needs ``write`` and ``flush``).
    summaryMap -- dict mapping a summary title to its value string.
    modeMap    -- dict mapping a mode heading to a list of strings.

    Each key is written on its own line, followed by its value(s) on lines
    prefixed with a single space. The file is flushed before returning.
    """
    # write all the summary data
    for key in summaryMap:
        outfile.write(key + '\n')
        outfile.write(' ' + summaryMap[key] + '\n')

    # write all the mode data; the mapping supplied by the caller is used
    # as-is instead of being recomputed from a module-level `soup`, so the
    # function no longer depends on global state.
    for key in modeMap:
        outfile.write(key + '\n')
        for mode in modeMap[key]:
            outfile.write(' ' + mode + '\n')

    outfile.flush()


if __name__ == '__main__':
    # Walk satID 1..199 on the Amsat site. For every page: keep a raw copy as
    # '<satID>.html' and append the parsed summary and mode data to
    # allstatus.txt. A failure on one page is reported and the loop moves on.

    output = open('allstatus.txt', 'w')
    try:
        for i in range(1, 200):

            # Built outside the try so the error report below can always use it.
            urlToProcess = ('http://www.amsat.org/amsat-new/satellites/satInfo.php?satID='
                            + str(i) + '&retURL=/satellites/status.php')

            try:
                output.write('=== STATUS URL:' + urlToProcess + '\n')

                # Always release the connection, even if the read fails.
                url = urllib.urlopen(urlToProcess)
                try:
                    pagedata = url.read()
                finally:
                    url.close()
                # Single-argument print(...) prints the same text on Python 2.
                print('processing url: ' + urlToProcess)

                # Keep a raw copy of the page, closing the file deterministically.
                filename = str(i) + '.html'
                pagefile = open(filename, 'w')
                try:
                    pagefile.write(pagedata)
                finally:
                    pagefile.close()

                soup = BeautifulSoup(pagedata)

                # get all the summary data
                titles = GetSatelliteSummaryTitles(soup)
                values = GetSatelliteSummaryValues(soup)
                summaryMap = dict(zip(titles, values))

                # get all the mode data
                modeMap = GetModeData(soup)

                AppendStatusToFile(output, summaryMap, modeMap)

            except Exception as e:  # 'as' form needs Python 2.6 or newer
                # Best effort: report the page that failed and continue.
                print('ERROR processing:  ' + urlToProcess)
                print('ERROR details:  ' + str(e))
    finally:
        # Close the combined report even if the loop is interrupted.
        output.close()


For the time being, I plan on using a template engine to format this data into an XML file that will be used as an RSS feed. I will then provide this to the Amsat-bb list to see what they think.

If you have any comments or suggestions, let me know.

Cheers,
Joseph Armbruster
KJ4JIO

No comments: