Nash's world

Reddit URL Harvester

by Nash on Mar.31, 2011, under cool, Programming

I was browsing Reddit and stumbled upon a small subreddit at /r/EarthPorn. This small corner of Reddit has high-res pictures of earth scenery. From there I was directed to /r/CityPorn, /r/SpacePorn, /r/MachinePorn, /r/AnimalPorn and /r/BotanicalPorn, all full of HD pictures. This got me thinking: since I am lazy as fuck, I did not want to spend every day going through six subreddits downloading and saving pictures, so the best thing to do was to automate the process.

In order to download the images, the first thing that needed to be done was to harvest the URLs from each subreddit's front page. I could have screen-scraped the pages with Beautiful Soup, but Reddit provides this nifty feature where, if you append .json to the end of a URL, e.g. http://www.reddit.com/r/earthporn/.json, it returns a JSON document for the corresponding page with the posts, URLs and other data (appending .xml instead returns the same data as XML). This lets me skip all the dirty cruft of parsing the HTML of a constantly changing page.
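To give an idea of what the harvester is working with, here is a minimal sketch (Python 2) of that .json listing; the 'data', 'children', 'url' and 'title' keys are the ones the full script below digs out:

import json
import urllib2

# fetch the listing for one subreddit and print each post's title and link
listing = json.load(urllib2.urlopen('http://www.reddit.com/r/EarthPorn/.json'))
for child in listing['data']['children']:
        post = child['data']                    #each child wraps its post under a 'data' key
        print post['title'], '->', post['url']

With that structure in mind, below is version 0.0.1 of the URL harvester code.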

#!/usr/bin/python
#
# Copyright (C) 2011  Nashath Rafeeq.
#   Reddit Url Harvester V.0.0.1
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__author__ = 'nash.rafeeg@gmail.com (Nash Rafeeg)'

import os.path
import os
import urllib
from urlparse import urlparse

def dir_list2(dir_name, *args):
        #look in the given directory, append every matching file to a list and return it
        fileList = []
        for file in os.listdir(dir_name):
                dirfile = os.path.join(dir_name, file)
                if os.path.isfile(dirfile):
                        if len(args) == 0:
                                fileList.append(dirfile)
                        else:
                                if os.path.splitext(dirfile)[1][1:] in args:
                                        fileList.append(dirfile)

        """
        elif os.path.isdir(dirfile):
        print "Accessing directory:", dirfile
        fileList += dir_list2(dirfile, *args)
        """
        return fileList
def graburlsfiles(fileList):
        #method to pick out the files that end in .urls
        urlfileList = []
        for file in fileList:
            s = os.path.splitext(file)
            if s[1] == ".urls":
                    urlfileList.append(file)
        return urlfileList

def validurls(dirtyurl):
        #try to see if the url that was grabbed is a valid url
        try:
                urllib.urlopen(dirtyurl)        #this is a very inefficient way of checking for a valid url; a reddit 404 or 502 will show up as a bad url, in which case re-run the script
                print "validated %s" % dirtyurl
                return True
        except IOError:
                print "%s seems to be a bad url" % dirtyurl
                return False

def graburls(urlfiles):
        #grab the urls from the .urls files, validate them and return the good ones
        cleanurls = []
        for ufiles in urlfiles:
                file = open(ufiles, "r")
                dirty_urls = file.readlines()
                for url in dirty_urls:
                        url = url.strip()               #drop the trailing newline before validating
                        print "validating"
                        if validurls(url):
                                cleanurls.append(url)
        return cleanurls

def findeimagelinks(url):
        import urllib2
        import json
        import csv
        import hashlib
        import datetime

        try: 

                downloadfile = csv.writer(open('files.csv', 'ab'), delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)     #append to the csv file; delete files.csv if you want to start fresh
                u = urllib2.urlopen(url)                                                                #open the url and grab the .json file

                objects = json.load(u)['data']                                                          #look for the element 'data'
                children = objects['children']                                                          #look for the element 'children' inside 'data'
                for c in children:
                        keys = c.get("data")                                                            #each child keeps its post under a 'data' key
                        imgsrc = keys.get("url")                                                        #the link
                        imgname = keys.get("title")                                                     #the post title
                        urlparts = urlparse(keys.get("url"))
                        ext = os.path.splitext(urlparts.path)[1]                                        #try to get the extension of the file
                        hash = hashlib.sha224(imgsrc.encode('utf-8')).hexdigest()                       #future use: hash the url to generate a unique key
                        adddate = datetime.datetime.now()                                               #the date and time added
                        downloaddate = ""                                                               #future use
                        flag = "tbd"                                                                    #future use
                        csvobj = [imgsrc, imgname.encode('ascii', 'ignore'), keys.get("domain"), ext, hash, adddate, downloaddate, flag]        #create the csv row
                        downloadfile.writerow(csvobj)                                                   #write the csv row to the file
        except IOError:
                print "could not harvest %s" % url

if __name__ == '__main__':
        fileList = dir_list2('/home/nash/picdownloader')                                                #path to the dir containing the .urls files; change this
        urlfiles = graburlsfiles(fileList)
        urls = graburls(urlfiles)
        for ur in urls:
                findeimagelinks(ur)
                print "Harvested urls from %s" % ur

The program looks for files with the extension .urls in the specified directory and reads the URLs from them. My .urls file looks as follows:

http://www.reddit.com/r/EarthPorn/.json
http://www.reddit.com/r/cityPorn/.json
http://www.reddit.com/r/SpacePorn/.json
http://www.reddit.com/r/BotanicalPorn/.json
http://www.reddit.com/r/AnimalPorn/.json

Once the URLs are read, the JSON file for each of them is downloaded and parsed. The URL, the post title and the extension, along with some other information, are stuffed into a CSV file. CSV was chosen because of how easy it is to use from Python; later this will be updated to use desktopcouch.
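For illustration, each row in files.csv carries the fields url;title;domain;extension;hash;date added;download date;flag, separated by semicolons, so one entry looks roughly like this (the values here are made up):

http://i.imgur.com/abc123.jpg;Sunrise over the mountains;i.imgur.com;.jpg;4f3a9c...;2011-03-31 21:14:07.123456;;tbd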

Now, with the URLs stuffed into a CSV file, we can go about downloading the image files and saving them to a specified folder. I wrote a rudimentary image downloader that checks whether the downloaded file really is an image before saving it to the correct location; the code for that is below.

#!/usr/bin/python
#
# Copyright (C) 2011  Nashath Rafeeq.
#   Reddit URL Harvester V.0.0.1
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import urllib2
import urllib
import random

__author__ = 'nash.rafeeg@gmail.com (Nash Rafeeg)'

'''
TODO: Read configuration from a config file

'''

class imggrab:
        '''
                This class contains the main methods that handle downloading and saving of image files
                TODO: integrate the imgur and Flickr APIs so that non-direct links can be downloaded
        '''
        def __init__(self,url, name, ext,  path, retry):
                self.path = path
                self.retry = retry
                #self.checkhd = checkhd          future use
                self.url = url
                self.name = name
                self.ext = ext
        def useragent(self):
                #future use: pick a random user agent string
                uagen = [
                        'Mozilla/4.0 (compatible; MSIE 2.0; Windows NT 5.0; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
                        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)',
                        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.25 (KHTML, like Gecko) Chrome/12.0.706.0 Safari/534.25',
                        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110324 Firefox/4.2a1pre'
                ]
                return random.choice(uagen)     #pick any of the user agent strings at random

        def progress_callback(self, blocks, block_size, total_size):
                #callback for urllib.urlretrieve in get_image
                #TODO change to a proper progress tracker

                downloaded = float(blocks) * float(block_size)
                percent = (downloaded / float(total_size)) * 100
                size = float(total_size) / 1024                         #sizes reported by urlretrieve are in bytes, so divide by 1024 for KB
                block_sizes = float(block_size) / 1024
                print "Downloaded %i%% of a %.1fKB file (%s blocks of %.1fKB each)" % (int(percent), size, blocks, block_sizes)

        def imagevarify(self, file):
                #verify that the downloaded file really is an image and save it to the path provided
                #TODO add clean up functions
                #TODO change naming conventions so that [ gets escaped

                from PIL import Image

                try:
                        im = Image.open(file)
                        im.verify()                             #verify() must be called on a freshly opened image
                        print "verified", im.format, "%dx%d" % im.size, im.mode
                        size = "[%dx%d]" % im.size
                        filename = self.name + size + self.ext
                        fullpath = self.path + filename
                        print fullpath
                        s = Image.open(file)                    #re-open, since verify() leaves the image unusable
                        s.save(fullpath)
                        print filename
                except Exception as e:
                        print e
                        raise

        def get_image(self):
                #function to download the file
                #TODO add useragents

                while self.retry > 0:
                        try:
                                #headers = { 'User-Agent' : self.useragent()}
                                #req = urllib2.Request(self.url, None, headers)
                                #print "headers set %s" % headers
                                #response = urllib2.urlopen(req)
                                #print "Response set"
                                (file, headers) = urllib.urlretrieve(self.url, self.name, self.progress_callback)
                                print "downloaded binary object %s passing to PIL for verification" % headers
                                self.imagevarify(file)
                                break
                        except Exception as e:
                                print e
                                print "Retrying [%i]" % self.retry
                                self.retry -= 1
def getcsv(csvfilename, path, retry):
        #read the harvested urls back out of the csv and download each one
        #TODO handle urls without extensions
        import csv
        csvreader = csv.reader(open(csvfilename, 'rb'), delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for row in csvreader:
                url = row[0]
                name = row[1]
                ext = row[3]
                try:
                        img = imggrab(url, name, ext, path, retry)
                        img.get_image()
                        print "saved file"
                except Exception as e:
                        print e

if __name__ == '__main__':
        csvfile = 'files.csv'
        path = "/home/nash/Pictures/"
        retry = 1
        getcsv(csvfile, path, retry)

You can get the source code from https://github.com/nashrafeeg/Reddit-URL-Harvester.
Below is a sample of the images that were grabbed from last night's run of the code.



