Converts all .html files in the folder to .txt files
# Convert every .html file found under a user-supplied folder (subfolders
# included) into a .txt file holding the page's visible text. Each .txt file
# is written next to the .html file it was produced from.
#
# The script is written so that it runs on both Python 2 and Python 3:
# ``print`` is always called with a single pre-formatted string, and the
# line-reading builtin is resolved once below.

from os import chdir, getcwd, listdir, path
from time import strftime
import os
import sys
import io
import codecs
import re

from bs4 import BeautifulSoup

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

# Text chunks of this many characters or fewer are dropped from the output.
SHORT_CHUNK_LIMIT = 6


def find_files(directory, filetype):
    '''
    (str, str) -> list of str

    Returns the full path of every file under directory (subfolders
    included) whose name ends with filetype.
    '''
    matches = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if filename.endswith(filetype):
                # Join with ``root`` (the folder currently being walked), not
                # with the top-level directory, so nested files get the path
                # at which they actually live.
                matches.append(os.path.join(root, filename))
    return matches


def count_files(filetype, directory=None):
    '''
    (str[, str]) -> int

    Returns the number of files under directory whose name ends with
    filetype. When directory is omitted, the module-level ``folder`` chosen
    by the user is searched. Subfolders are included so that the count
    agrees with the set of files the conversion loop processes.
    '''
    if directory is None:
        directory = folder
    return len(find_files(directory, filetype))


def check_path(prompt):
    '''
    (str) -> str

    Asks the user for a path, repeating the prompt until the path entered
    exists, and returns that path exactly as it was typed.
    '''
    abs_path = _read_line(prompt)
    while not path.exists(abs_path):
        print("\nThe specified path does not exist.\n")
        abs_path = _read_line(prompt)
    return abs_path


def extract_text(markup):
    '''
    (str or bytes) -> unicode text

    Returns the visible text of an HTML document: <script> and <style>
    elements are removed, the remaining text is split into chunks, and only
    chunks longer than SHORT_CHUNK_LIMIT characters are kept, one per line.
    '''
    # Name the parser explicitly so every machine parses the same way,
    # instead of depending on which optional parsers happen to be installed.
    soup = BeautifulSoup(markup, "html.parser")
    for element in soup(["script", "style"]):
        element.extract()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # NOTE(review): splitting on a single space makes every word its own
    # chunk, so the length filter below keeps only long words. Confirm that
    # this separator is what was intended.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # A unicode separator keeps the result unicode even when nothing is left.
    return u"\n".join(
        chunk for chunk in chunks if len(chunk) > SHORT_CHUNK_LIMIT
    )


def convert_file(html_path):
    '''
    (str) -> str

    Writes the visible text of the HTML file at html_path to a UTF-8 encoded
    file with the same name but a .txt extension, and returns the path of
    the file written.
    '''
    # Swap only the extension; the rest of the name is left untouched and no
    # platform-specific separator is hard-coded.
    txt_path = os.path.splitext(html_path)[0] + ".txt"

    # Read raw bytes and leave the decoding to Beautiful Soup.
    with open(html_path, "rb") as source:
        content = extract_text(source.read())

    # The ``with`` blocks guarantee both files are closed, even on error.
    with io.open(txt_path, "w", encoding="utf-8") as target:
        target.write(content)
    return txt_path


print("\n")

# Resolve to an absolute path before changing directory, so a relative path
# typed by the user still points at the same folder afterwards.
folder = os.path.abspath(check_path("Provide absolute path for the folder: "))
chdir(folder)

html_files = find_files(folder, ".html")
num_html = len(html_files)

if num_html == 0:
    print("\nThe specified folder does not contain Html files.\n")
    print("%s There are no files to convert!." % strftime("%H:%M:%S"))
    sys.exit()

print("\nNumber of Html files:  %d \n" % num_html)
print("%s Starting to convert files ...\n" % strftime("%H:%M:%S"))

for html_path in html_files:
    print(html_path)
    convert_file(html_path)
    print("%s  Html -> txt " % strftime("%H:%M:%S"))

print("\n%s Finished converting .html files." % strftime("%H:%M:%S"))

# Count the number of txt files.
num_txt = count_files(".txt")
print("\nNumber of txt files:  %d" % num_txt)