資源簡介
該數據集由一系列郵件組成,適用於測試垃圾郵件過濾系統,請勿用作商業目的。
代碼片段和文件信息
#!/usr/bin/python
# FileName: Subsampling.py
# Version 1.0 by Tao Ban, 2010.5.26
# This script extracts the contents (i.e. the subject and the first part of the
# payload) from each .eml file and stores them in a new file with the same name in the dst dir.
import email.parser
import os
import shutil
import stat
import sys
def ExtractSubPayload(filename):
    """Return the subject and the first payload part of an .eml file.

    Args:
        filename: Path of the .eml file to read.

    Returns:
        A single string: the value of the ``Subject`` header immediately
        followed by the payload. For a multipart message only the first part
        is used, converted with ``str()``. The subject lookup result is also
        passed through ``str()``, so a message without a ``Subject`` header
        contributes the text ``'None'``.

    Raises:
        SystemExit: With status 1 when ``filename`` does not exist.
    """
    if not os.path.exists(filename):  # input path does not exist
        print("ERROR: input file does not exist: %s" % filename)
        # The original called os.exit(), which does not exist; sys.exit() is
        # the intended way to abort with a non-zero status.
        sys.exit(1)

    # Parse inside a context manager so the handle is always closed.
    with open(filename) as fp:
        msg = email.message_from_file(fp)

    payload = msg.get_payload()
    if isinstance(payload, list):
        # Multipart message: only use the first part of the payload.
        # An empty part list yields an empty payload instead of an IndexError.
        payload = payload[0] if payload else ''

    sub = str(msg.get('subject'))
    if not isinstance(payload, str):
        payload = str(payload)
    return sub + payload
def ExtractBodyFromDir(srcdir, dstdir):
    """Extract subject and body from every file under ``srcdir`` into ``dstdir``.

    Each regular entry in ``srcdir`` is passed to ``ExtractSubPayload`` and the
    returned text is written to a file with the same name in ``dstdir``.
    Sub-directories are processed recursively into a sub-directory of the
    same name.

    Args:
        srcdir: Directory holding the source .eml files.
        dstdir: Directory that receives the extracted text files; it is
            created when it does not exist yet.
    """
    if not os.path.exists(dstdir):  # destination path does not exist
        os.makedirs(dstdir)

    # 'name' instead of the original 'file', which shadowed a builtin.
    for name in os.listdir(srcdir):
        srcpath = os.path.join(srcdir, name)
        dstpath = os.path.join(dstdir, name)
        src_info = os.stat(srcpath)
        if stat.S_ISDIR(src_info.st_mode):  # for subfolders, recurse
            ExtractBodyFromDir(srcpath, dstpath)
        else:  # extract the content and write it out
            body = ExtractSubPayload(srcpath)
            # Context manager closes the output file even if the write fails.
            with open(dstpath, 'w') as dstfile:
                dstfile.write(body)
###################################################################
# main function starts here
def main():
    """Ask for the source and destination directories and run the extraction.

    Reads two lines from standard input: the directory where the .eml files
    are stored and the directory that receives the extracted content.
    Exits with status 1 when the source directory does not exist, and creates
    the destination directory when it is missing.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # srcdir is the directory where the .eml files are stored
    print('Input source directory: ')  # ask for the source dir
    srcdir = read_line()
    if not os.path.exists(srcdir):
        print('The source directory %s does not exist, exit...' % (srcdir))
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)

    # dstdir is the directory where the extracted contents are stored
    print('Input destination directory: ')  # ask for the destination dir
    dstdir = read_line()
    if not os.path.exists(dstdir):
        print('The destination directory is newly created.')
        os.makedirs(dstdir)

    ###################################################################
    ExtractBodyFromDir(srcdir, dstdir)


# Only prompt when executed as a script, not when the module is imported.
if __name__ == '__main__':
    main()
 屬性            大小     日期    時間   名稱
----------- ---------  ---------- -----  ----
     目錄           0  2010-05-28 16:46  CSDMC2010_SPAM\
     目錄           0  2010-05-28 16:46  CSDMC2010_SPAM\CSDMC2010_SPAM\
     文件        2177  2010-05-27 09:28  CSDMC2010_SPAM\CSDMC2010_SPAM\ExtractContent.py
     文件        3411  2010-05-27 09:29  CSDMC2010_SPAM\CSDMC2010_SPAM\readme.txt
     文件       77886  2010-05-27 06:27  CSDMC2010_SPAM\CSDMC2010_SPAM\SPAMTrain.label
     目錄           0  2010-05-28 16:47  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\
     文件        6215  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00000.eml
     文件        6484  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00001.eml
     文件        7705  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00002.eml
     文件        6260  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00003.eml
     文件       33094  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00004.eml
     文件       49320  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00005.eml
     文件        3163  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00006.eml
     文件        2519  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00007.eml
     文件       30295  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00008.eml
     文件        2514  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00009.eml
     文件       13698  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00010.eml
     文件        5639  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00011.eml
     文件        1098  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00012.eml
     文件        5555  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00013.eml
     文件        6049  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00014.eml
     文件        4667  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00015.eml
     文件        3945  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00016.eml
     文件        7610  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00017.eml
     文件        3487  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00018.eml
     文件        5110  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00019.eml
     文件        5037  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00020.eml
     文件        6634  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00021.eml
     文件        6406  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00022.eml
     文件        2297  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00023.eml
     文件        3867  2010-05-27 06:01  CSDMC2010_SPAM\CSDMC2010_SPAM\TESTING\TEST_00024.eml
............此處省略8595個文件信息
評論
共有 條評論