extract contact information from html with python -


Here is a sample of the HTML:

<div class="yui3-u-5-6" id="browse-products"> <div id="kazbah-contact">   <span class="contact-info-title">contact 00nothing:</span>   <a href="mailto:info@00nothing.com">info@00nothing.com</a> | 800-410-2074    | c/o score x score     &nbsp;8118-d statesville rd     ,   charlotte,   nc   28269 </div> <div class="clearfix"></div> 

I want to extract the contact information here: the email, phone number, and address. How should I do this with Python? Thanks.

I use this code to extract the information:

# _*_ coding:utf-8 _*_
# NOTE: this is a Python 2 script (urllib2, unicode(), reload(sys)).
import urllib2
import urllib
import re
from bs4 import BeautifulSoup
import sys

# Python 2 workaround: make implicit str/unicode conversions use UTF-8 so the
# unicode text returned by get_text() can be written to a byte-mode file.
reload(sys)
sys.setdefaultencoding('utf-8')


def grabhref(url, localfile):
    """Collect the contact e-mail shown on every brand page linked from *url*.

    The index page at *url* is fetched, every ``div > a`` link whose href
    starts with the kazbah browse prefix is followed, and for each
    ``mailto:`` anchor found on the linked page the page URL and the anchor
    text are printed and appended to *localfile* (one per line, CRLF
    terminated).

    url       -- address of the index page listing the brand links.
    localfile -- path of the output file; it is overwritten.
    """
    html = urllib2.urlopen(url).read()
    html = unicode(html, 'gb2312', 'ignore').encode('utf-8', 'ignore')
    soup = BeautifulSoup(html)

    # The attribute value contains ':' and '/', so it is quoted; unquoted
    # values like this are rejected by current Beautiful Soup releases.
    brand_selector = 'div > a[href^="http://www.karmaloop.com/kazbah/browse"]'

    # 'with' guarantees the file is closed even if a request or parse fails.
    with open(localfile, 'wb') as myfile:
        for link in soup.select(brand_selector):
            brand_page = BeautifulSoup(urllib2.urlopen(link['href']).read())
            for item in brand_page.select("div > a[href^=mailto]"):
                contactinfo = item.get_text()
                print(link['href'])
                print(contactinfo)
                # Write inside the inner loop: every address is recorded and
                # contactinfo is never used before it has been assigned on
                # pages that have no mailto link.
                myfile.write(link['href'])
                myfile.write('\r\n')
                myfile.write(contactinfo)
                myfile.write('\r\n')


def main():
    """Scrape the karmaloop brand index into contact.txt."""
    url = "http://www.karmaloop.com/brands"
    localfile = 'contact.txt'
    grabhref(url, localfile)


if __name__ == "__main__":
    main()

But I can still only get the email address here; how can I get the phone number and address?


Comments

Popular posts from this blog

css - SVG using textPath a symbol not rendering in Firefox -

Java 8 + Maven Javadoc plugin: Error fetching URL -

datatable - Matlab struct computations -