i have to find out what is happeining - and why the script fails to load the data into the db....
see the follwoing
Code: Select all
import urllib
import urlparse
import re
url = "http://search.cpan.org/author/?W"
html = urllib.urlopen(url).read()
for lk, capname, name in re.findall('<a href="(/~.*?/)"><b>(.*?)</b></a><br/><small>(.*?)</small>', html):
alk = urlparse.urljoin(url, lk)
data = { 'url':alk, 'name':name, 'cname':capname }
phtml = urllib.urlopen(alk).read()
memail = re.search('<a href="mailto:(.*?)">', phtml)
if memail:
data['email'] = memail.group(1)
print data
$ python printer.py
{'url': 'http://search.cpan.org/~wac/', 'cname': 'WAC', 'name': 'Wang Aocheng', 'email': 'wangaocheng%40hotmail.com'}
{'url': 'http://search.cpan.org/~wade/', 'cname': 'WADE', 'name': 'James Wade', 'email': 'CENSORED'}
note: the database on the opensuse 13.1 mysql db shows the following struckure:
Code: Select all
--
-- Tabellenstruktur für Tabelle `cname`
--
CREATE TABLE IF NOT EXISTS `cname` (
`cname` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- --------------------------------------------------------
--
-- Tabellenstruktur für Tabelle `name`
--
CREATE TABLE IF NOT EXISTS `name` (
`url` int(11) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- --------------------------------------------------------
--
-- Tabellenstruktur für Tabelle `url`
--
CREATE TABLE IF NOT EXISTS `url` (
`url` int(11) DEFAULT NULL,
`name` int(11) DEFAULT NULL,
`cname` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTIO
and the script is like that:
Code: Select all
import urllib
import urlparse
import re
import MySQLdb
db = MySQLdb.connect(host="localhost", # your host, usually localhost
user="root", # your username
passwd="rimbaud", # your password
db="cpan") # name of the data base
# you must create a Cursor object. It will let
# you execute all the queries you need
cur = db.cursor()
url = "http://search.cpan.org/author/?W"
html = urllib.urlopen(url).read()
for lk, capname, name in re.findall('<a href="(/~.*?/)"><b>(.*?)</b></a><br/><small>(.*?)</small>', html):
alk = urlparse.urljoin(url, lk)
data = { 'url':alk, 'name':name, 'cname':capname }
phtml = urllib.urlopen(alk).read()
memail = re.search('<a href="mailto:(.*?)">', phtml)
if memail:
data['email'] = memail.group(1)
# Use all the SQL you like
cur.execute("SELECT * FROM YOUR_TABLE_NAME")
# print all the first cell of all the rows
for row in cur.fetchall() :
print row[0]
but unfortunatly i get the following errors:
that let me think that ihave some database-errors:
< but wait: the tables and the database exist - (see above)
Code: Select all
martin@linux-70ce:~/perl> python cpan2.py
Traceback (most recent call last):
File "cpan2.py", line 34, in <module>
cur.execute("SELECT * FROM YOUR_TABLE_NAME")
File "/usr/lib/python2.7/site-packages/MySQLdb/cursors.py", line 174, in execute
self.errorhandler(self, exc, value)
File "/usr/lib/python2.7/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
_mysql_exceptions.ProgrammingError: (1146, "Table 'cpan.YOUR_TABLE_NAME' doesn't exist")
martin@linux-70ce:~/perl> python cpan2.py
Traceback (most recent call last):
File "cpan2.py", line 34, in <module>
cur.execute("SELECT * FROM YOUR_TABLE_NAME")
File "/usr/lib/python2.7/site-packages/MySQLdb/cursors.py", line 174, in execute
self.errorhandler(self, exc, value)
File "/usr/lib/python2.7/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
_mysql_exceptions.ProgrammingError: (1146, "Table 'cpan.YOUR_TABLE_NAME' doesn't exist")
martin@linux-70ce:~/perl> python cpan2.py
Traceback (most recent call last):
File "cpan2.py", line 34, in <module>
cur.execute("SELECT * FROM YOUR_TABLE_NAME")
File "/usr/lib/python2.7/site-packages/MySQLdb/cursors.py", line 174, in execute
self.errorhandler(self, exc, value)
File "/usr/lib/python2.7/site-packages/MySQLdb/connections.py", line 36, in defaulterrorhandler
raise errorclass, errorvalue
_mysql_exceptions.ProgrammingError: (1146, "Table 'cpan.YOUR_TABLE_NAME' doesn't exist")
martin@linux-70ce:~/perl>