python脚本批量清除文件bom头

Continue Read..

今天天气很好,我们来清除bom头吧!

修改了网上的2个脚本,原版是 https://gist.github.com/yhben/5169561 ,是 py 2.x 版本的;本机环境是 3.x,做完 3.x 的兼容之后还是不太好用

原版无法清除utf-8的bom头,翻了几篇文章,改掉了它的清理函数,效果棒棒哒

#coding=utf-8

'''
* 去除指定类型文件的bom头
* 版权所有
* @author      t6760915<t6760915@gmail.com>
* @version     $Id: trim_bom.py $
* 原版         https://gist.github.com/yhben/5169561
* 原版不能检测所有的UTF-8 + bom的文件
'''

import os
import sys
import codecs

class TrimBom:
    """Strip the UTF-8 byte-order mark (BOM) from files with selected extensions.

    ``run(path)`` walks ``path`` recursively (or handles a single file) and
    rewrites, in place, every file whose extension is listed in
    ``trimExtList`` and whose content starts with ``codecs.BOM_UTF8``.
    """

    # Normalized root path passed to run(); stripped from progress messages.
    basePath = ''
    # Not used by any method of this class; kept for backward compatibility.
    fileList = []
    # Number of bytes read per chunk while shifting the file content.
    BUFSIZE = 4096
    # Length in bytes of the UTF-8 BOM.
    BOMLEN = len(codecs.BOM_UTF8)
    # Extensions (without the dot) that are processed,
    # e.g. ['php', 'css', 'js', 'py', 'pl', 'html', 'htm'].
    trimExtList = ['php']

    def remove_bom(self, filepath):
        """Remove a leading UTF-8 BOM from ``filepath`` in place.

        The content is shifted towards the start of the file chunk by chunk,
        then the file is truncated by the BOM length.

        Returns:
            True when a BOM was found and removed, False when the file does
            not start with a UTF-8 BOM (the file is then left untouched).
        """
        with open(filepath, 'r+b') as fp:
            chunk = fp.read(self.BUFSIZE)
            if not chunk.startswith(codecs.BOM_UTF8):
                return False

            write_pos = 0
            chunk = chunk[self.BOMLEN:]
            while chunk:
                fp.seek(write_pos)
                fp.write(chunk)
                write_pos += len(chunk)
                # The unread data always sits BOMLEN bytes past the data
                # that has already been moved.
                fp.seek(write_pos + self.BOMLEN)
                chunk = fp.read(self.BUFSIZE)
            # Drop the now-duplicated tail left behind by the shift.
            fp.truncate(write_pos)

        print('Converted: ' + filepath)
        return True

    def getFileListByExt(self, path):
        """Process ``path``: strip the BOM of a matching file or recurse into a directory.

        Files are selected by their extension (text after the last dot of the
        file name) being listed in ``trimExtList``.

        Returns:
            Always False; the return value carries no information and is kept
            only for backward compatibility.
        """
        if not path:
            return False

        path = os.path.normpath(path)

        if not os.path.exists(path):
            return False

        if os.path.isfile(path):
            # splitext only looks at the file name, so dots in directory
            # names or an extension-less file called "php" do not match.
            ext = os.path.splitext(path)[1][1:]
            if ext in self.trimExtList and self.remove_bom(path):
                shown = path.replace(self.basePath, '').replace('\\', '/')
                print('process %s success...' % shown)
        elif os.path.isdir(path):
            for fileName in os.listdir(path):
                self.getFileListByExt(os.path.join(path, fileName))

        return False

    def run(self, path):
        """Entry point: remember the root path and process everything below it."""
        self.basePath = os.path.normpath(path)
        self.getFileListByExt(path)

if __name__ == '__main__':
    # The directory (or single file) to process is the first CLI argument.
    if len(sys.argv) < 2:
        # %-format the script name into the message; passing it as a second
        # print() argument would print the literal '%s' placeholder.
        print('USEAGE:python %s dirName' % __file__)
        sys.exit(0)

    tObj = TrimBom()
    tObj.run(sys.argv[1])

声明:此文系舞林cuzn(www.wulinlw.org)原创稿件,转载请保留版权

python脚本批量清除文件bom头

Continue Read..

今天天气很好,我们来清除bom头吧!

修改了网上的2个脚本,原版是 https://gist.github.com/yhben/5169561 ,是 py 2.x 版本的;本机环境是 3.x,做完 3.x 的兼容之后还是不太好用

原版无法清除utf-8的bom头,翻了几篇文章,改掉了它的清理函数,效果棒棒哒

#coding=utf-8

'''
* 去除指定类型文件的bom头
* 版权所有
* @author      t6760915<t6760915@gmail.com>
* @version     $Id: trim_bom.py $
* 原版         https://gist.github.com/yhben/5169561
* 原版不能检测所有的UTF-8 + bom的文件
'''

import os
import sys
import codecs

class TrimBom:
    """Strip the UTF-8 byte-order mark (BOM) from files with selected extensions.

    ``run(path)`` walks ``path`` recursively (or handles a single file) and
    rewrites, in place, every file whose extension is listed in
    ``trimExtList`` and whose content starts with ``codecs.BOM_UTF8``.
    """

    # Normalized root path passed to run(); stripped from progress messages.
    basePath = ''
    # Not used by any method of this class; kept for backward compatibility.
    fileList = []
    # Number of bytes read per chunk while shifting the file content.
    BUFSIZE = 4096
    # Length in bytes of the UTF-8 BOM.
    BOMLEN = len(codecs.BOM_UTF8)
    # Extensions (without the dot) that are processed,
    # e.g. ['php', 'css', 'js', 'py', 'pl', 'html', 'htm'].
    trimExtList = ['php']

    def remove_bom(self, filepath):
        """Remove a leading UTF-8 BOM from ``filepath`` in place.

        The content is shifted towards the start of the file chunk by chunk,
        then the file is truncated by the BOM length.

        Returns:
            True when a BOM was found and removed, False when the file does
            not start with a UTF-8 BOM (the file is then left untouched).
        """
        with open(filepath, 'r+b') as fp:
            chunk = fp.read(self.BUFSIZE)
            if not chunk.startswith(codecs.BOM_UTF8):
                return False

            write_pos = 0
            chunk = chunk[self.BOMLEN:]
            while chunk:
                fp.seek(write_pos)
                fp.write(chunk)
                write_pos += len(chunk)
                # The unread data always sits BOMLEN bytes past the data
                # that has already been moved.
                fp.seek(write_pos + self.BOMLEN)
                chunk = fp.read(self.BUFSIZE)
            # Drop the now-duplicated tail left behind by the shift.
            fp.truncate(write_pos)

        print('Converted: ' + filepath)
        return True

    def getFileListByExt(self, path):
        """Process ``path``: strip the BOM of a matching file or recurse into a directory.

        Files are selected by their extension (text after the last dot of the
        file name) being listed in ``trimExtList``.

        Returns:
            Always False; the return value carries no information and is kept
            only for backward compatibility.
        """
        if not path:
            return False

        path = os.path.normpath(path)

        if not os.path.exists(path):
            return False

        if os.path.isfile(path):
            # splitext only looks at the file name, so dots in directory
            # names or an extension-less file called "php" do not match.
            ext = os.path.splitext(path)[1][1:]
            if ext in self.trimExtList and self.remove_bom(path):
                shown = path.replace(self.basePath, '').replace('\\', '/')
                print('process %s success...' % shown)
        elif os.path.isdir(path):
            for fileName in os.listdir(path):
                self.getFileListByExt(os.path.join(path, fileName))

        return False

    def run(self, path):
        """Entry point: remember the root path and process everything below it."""
        self.basePath = os.path.normpath(path)
        self.getFileListByExt(path)

if __name__ == '__main__':
    # The directory (or single file) to process is the first CLI argument.
    if len(sys.argv) < 2:
        # %-format the script name into the message; passing it as a second
        # print() argument would print the literal '%s' placeholder.
        print('USEAGE:python %s dirName' % __file__)
        sys.exit(0)

    tObj = TrimBom()
    tObj.run(sys.argv[1])

声明:此文系舞林cuzn(www.wulinlw.org)原创稿件,转载请保留版权

python和php处理手机通讯录vcf文件

Continue Read..

朋友小米手机掉了,幸好有备份通讯录,于是导到以前的诺基亚里面,但文件太大,卡得导不进去

所以写个小脚本分割成一条记录一个的小备份文件

vcf备份文件里面是6行一个联系人信息,所以小脚本要做的就是每6行取出来,单独存一个文件里

这里写了php和python的,嘿嘿

 

php版

// Load the whole backup; every array element is one line of the file.
$lines = file('data.vcf');
$total = count($lines);
echo $total;

// Each contact spans 6 consecutive lines. Walk the file in strides of 6 and
// store every complete record in a file named after its first line index.
for ($start = 0; $start + 6 <= $total; $start += 6) {
	$record = implode('', array_slice($lines, $start, 6));
	echo $record;
	writeFile($start . '.vcf', $record);
}

/**
 * Write $str to $fileName, creating the file or truncating existing content.
 *
 * @param string $fileName Path of the file to write.
 * @param string $str      Content to store.
 * @return bool True when the content was written, false when the file could
 *              not be opened or the write failed.
 */
function writeFile($fileName,$str){
	$f = fopen($fileName,'w+');
	if ($f === false) {
		// fopen() has already raised a warning; never hand false to
		// fwrite()/fclose().
		return false;
	}
	$written = fwrite($f, $str);
	fclose($f);
	return $written !== false;
}

python版

#-*- coding:UTF-8 -*-

def writeFile(fileName,str):
    """Write the text ``str`` to ``fileName``, replacing any existing content.

    The file is opened in text mode ('w') and is closed even when the write
    raises.
    """
    # NOTE: the parameter name shadows the builtin ``str`` inside this
    # function; it is kept unchanged so existing callers keep working.
    with open(fileName, 'w') as def_f:
        def_f.write(str)
    
# Read the whole backup; ``text`` is a list with one entry per line.
with open('data.vcf', 'r') as f:
    text = f.readlines()

# Each contact spans 6 consecutive lines. ``end`` is the index just past a
# record, so the range runs up to and including len(text); stopping one short
# would silently drop the last contact of the file.
for end in range(6, len(text) + 1, 6):
    print(end)
    string = ''.join(text[end - 6:end])
    print(string)
    # Files are named after the end index of the record: 6.vcf, 12.vcf, ...
    fileName = str(end) + '.vcf'
    writeFile(fileName, string)

声明:此文系舞林cuzn(www.wulinlw.org)原创稿件,转载请保留版权

python 3 抓取网页保存为html

Continue Read..
# -*- coding: utf-8 -*-
import urllib.request
# Download the page; the context manager closes the connection afterwards.
with urllib.request.urlopen('http://www.baidu.com') as response:
    # read() returns bytes, which must be decoded before use as text.
    html = response.read()
    # Prefer the charset declared in the Content-Type header and fall back to
    # gbk only when the server does not declare one.
    charset = response.headers.get_content_charset() or 'gbk'

htmlStr = str(html, charset)
print(htmlStr)

# Write with the page's own charset instead of the locale default, so the
# saved bytes match the encoding the HTML was served in.
with open('baidu.html', 'w', encoding=charset) as f:
    f.write(htmlStr)

声明:此文系舞林cuzn(www.wulinlw.org)原创稿件,转载请保留版权