getImagesData.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. #!/usr/bin/python
  2. import sys
  3. import urllib
  4. #import http.client
  5. import urllib2
  6. import re
  7. from PIL import Image
  8. from io import BytesIO
  9. import cStringIO
  10. #import URLError, HTTPError
  11. urls = ["http://cnn.com"]
  12. i = 0
  13. regex1 = '<title>(.+?)</title>'
  14. pattern1 = re.compile(regex1)
  15. regex2 = '<img src="(.+?)"'
  16. pattern2 = re.compile(regex2)
  17. while i < len(urls):
  18. htmlfile = urllib.urlopen(urls[i])
  19. htmltext = htmlfile.read()
  20. titles = re.findall(pattern1,htmltext)
  21. images = re.findall(pattern2,htmltext)
  22. size1 = htmlfile.headers.get("content-length")
  23. # size2 = len(htmlfile.read())
  24. print titles
  25. # print images
  26. print "content-length of header: "
  27. print size1
  28. # print size2
  29. print "-------"
  30. i+=1
  31. totalImageSize = 0
  32. print "Output:"
  33. #i = 0
  34. #while i < len(images):
  35. # url = '\''+images[i]+'\''
  36. # print images[i]
  37. # i+=1
  38. # try:
  39. i = 0
  40. while i < len(images):
  41. response = urllib.urlopen(images[i])
  42. # except URLError as e:
  43. # print "ERROR: ", e.code()
  44. # else:
  45. headers = response.info()
  46. data = response.read()
  47. print 'URL: ', response.geturl()
  48. print 'DATE:', headers['date']
  49. print 'LENGTH:', len(data)
  50. totalImageSize+=len(data)
  51. i+=1
  52. print "---------------------"
  53. print "Total bandwidth to download images from %s is %d KB" % (urls, totalImageSize)