"""Download the best-matching MP3 for every track in a tab-separated list.

Usage: script LIST URLS [START]

LIST is a tab-separated file with the columns
``url_track, url_artist, track, artist``.  URLS is a tab-separated file with
the columns ``url_track, name, url_mp3`` listing download candidates per
track URL.  START (optional) is the zero-based row of LIST to resume from.

For each track the candidates are ranked by their original position and by
the edit distance between their name and "artist - track"; they are then
fetched with ``wget`` in ranked order until one produces a file of at least
MIN_MP3_SIZE bytes.
"""
import csv
import os
import re
import stat
import subprocess
import sys

agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10"

# Downloads smaller than this are treated as failures and deleted.
MIN_MP3_SIZE = 1024 * 1024

# Directory the downloaded files are written to.
OUTPUT_DIR = "data"

# Characters outside this set are replaced by a space in output file names.
UNSAFE_CHARS = re.compile("[^a-zA-Z0-9-_ ]")

# Scheme plus host of a URL, sent as the Referer header.
REFERER_RE = re.compile("https?://[^/]+")


def dist(a, b):
    """Return the Levenshtein edit distance between sequences *a* and *b*.

    Empty inputs are supported: the distance to an empty sequence is the
    length of the other one.
    """
    if not a:
        return len(b)
    if not b:
        return len(a)
    # previous[j] is the distance between the prefix of `a` handled so far
    # and the first j items of `b`; row 0 is the empty prefix of `a`.
    previous = list(range(len(b) + 1))
    for i, item_a in enumerate(a, 1):
        current = [i]
        for j, item_b in enumerate(b, 1):
            substitution = previous[j - 1] + (1 if item_a != item_b else 0)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def cleanup(s):
    """Return *s* lower-cased with all spaces removed."""
    return s.lower().replace(" ", "")


def load_urls(path):
    """Read the tab-separated URLS file at *path*.

    Returns a dict mapping each track URL to a list of ``[name, url_mp3]``
    candidates, in file order.
    """
    urls = {}
    with open(path) as handle:
        for url_track, name, url_mp3 in csv.reader(handle, delimiter='\t'):
            urls.setdefault(url_track, []).append([name, url_mp3])
    return urls


def rank_results(list_name, results):
    """Rank download candidates for the track called *list_name*.

    *results* is a sequence of ``[name, url_mp3]`` pairs.  Returns a new list
    of ``(name, url_mp3, position, score)`` tuples sorted by ascending score,
    where ``score = position * 2 + dist(cleanup(list_name), cleanup(name))``.
    The input is left unmodified, so ranking the same candidates again does
    not pile up stale fields.
    """
    wanted = cleanup(list_name)
    ranked = [
        (name, url_mp3, position, position * 2 + dist(wanted, cleanup(name)))
        for position, (name, url_mp3) in enumerate(results)
    ]
    # list.sort is stable, so equal scores keep their original order.
    ranked.sort(key=lambda entry: entry[3])
    return ranked


def download(url, outfile):
    """Fetch *url* into *outfile* with wget; return the resulting file size.

    Returns 0 when the file could not be stat'ed afterwards.
    """
    command = ["wget", "--user-agent=%s" % agent, "-O", outfile]
    match = REFERER_RE.search(url)
    if match:
        command.append("--referer=%s" % match.group(0))
    # An argument list (no shell) keeps quotes or shell metacharacters in
    # the URL from being interpreted; "--" stops a URL that begins with "-"
    # from being parsed as an option.
    command.extend(["--", url])
    subprocess.call(command)
    try:
        return os.stat(outfile)[stat.ST_SIZE]
    except OSError:
        return 0


def main(argv):
    """Run the downloader with command-line arguments *argv*."""
    if len(argv) < 3:
        print("syntax: %s list urls" % argv[0])
        return

    start = int(argv[3]) if len(argv) >= 4 else 0

    with open(argv[1]) as handle:
        rows = list(csv.reader(handle, delimiter='\t'))
    urls = load_urls(argv[2])

    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for pos, row in enumerate(rows[start:], start):
        url_track, _url_artist, track, artist = row
        list_name = "%s - %s" % (artist, track)
        print("saving %d (%s)" % (pos, list_name))

        ranked = rank_results(list_name, urls.get(url_track, []))
        for name, _url_mp3, position, score in ranked:
            print([name, position, score])

        for name, url_mp3, _position, _score in ranked:
            safename = UNSAFE_CHARS.sub(" ", name)
            outfile = "%s/%s.mp3" % (OUTPUT_DIR, safename)
            size = download(url_mp3, outfile)
            if size < MIN_MP3_SIZE:
                # Too small to be a real track; discard and try the next one.
                if os.path.exists(outfile):
                    os.remove(outfile)
            else:
                print("satisfied")
                break


if __name__ == "__main__":
    main(sys.argv)