a

Thursday, 9 March 2017

Converts every .html file in a user-specified folder to a .txt file containing the page's visible text (Python 2 script; requires BeautifulSoup 4).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from os import chdir, getcwd, listdir, path
from time import strftime
import os
import codecs
import re
from bs4 import BeautifulSoup



def count_files(filetype, directory=None):
    ''' (str[, str]) -> int
    Return the number of entries in a directory whose names end with
    filetype (for example ".html").

    directory is the folder to inspect. When it is omitted, the
    module-level `folder` chosen by the user is used, which is what the
    existing callers rely on. Only the directory itself is listed;
    sub-folders are not searched.
    '''
    if directory is None:
        # Fall back to the script-wide folder so old call sites keep working.
        directory = folder
    return sum(1 for entry in listdir(directory) if entry.endswith(filetype))


def check_path(prompt):
    ''' (str) -> str
    Verifies if the provided absolute path does exist.
    '''
    abs_path = raw_input(prompt)
    while path.exists(abs_path) != True:
        print "\nThe specified path does not exist.\n"
        abs_path = raw_input(prompt)
    return abs_path    
    
print "\n"

folder = check_path("Provide absolute path for the folder: ")



chdir(folder)



num_html = count_files(".html")

if num_html  == 0:
    print "\nThe specified folder does not contain Html files.\n"
    print strftime("%H:%M:%S"), "There are no files to convert!."
    exit()
else:
    print "\nNumber of Html files: ", num_html, "\n"
    print strftime("%H:%M:%S"), "Starting to convert files ...\n"


list=[]
directory=folder
for root,dirs,files in os.walk(directory):
    for filename in files:
        if filename.endswith('.html'):
            found=os.path.join(directory,filename)
            list.append(found)

m=len(list)
for item in list:
    path=item
    head,tail=os.path.split(path)
    var="\\"
    
    tail=tail.replace(".html",".txt")
    name=head+var+tail
    print item
    html=codecs.open(item,'r')
    soup = BeautifulSoup(html)

    
    for script in soup(["script", "style"]):
        script.extract()    
    
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    extractedText = '\n'.join(chunk for chunk in chunks if len(chunk)>6)
    
    content=extractedText
    print strftime("%H:%M:%S"), " Html  -> txt "
    f=open(name,'w')
    f.write(content.encode("UTF-8"))
    f.close

# Report the time at which the conversion loop finished.
print "\n", strftime("%H:%M:%S"), "Finished converting .html files."




# Count the number of txt files.
# count_files lists only the top level of `folder`, so this total includes
# any .txt files that were already there before the conversion ran.

num_txt = count_files(".txt")   

print "\nNumber of txt files: ", num_txt

No comments:

Post a Comment