python3+PyQt5实现支持多线程的页面索引器应用程序
作者:basisworker 发布时间:2022-02-17 02:02:11
标签:python3,PyQt5,页面索引器
本文使用 Python3 + PyQt5 实现了《Python Qt GUI 快速编程》第 19 章中的页面索引器(Page Indexer)应用程序示例,该示例支持多线程索引。
/home/yrd/eric_workspace/chap19/walker_ans.py
#!/usr/bin/env python3
import codecs
import html.entities
import re
import sys
from PyQt5.QtCore import (QMutex, QThread,pyqtSignal,Qt)
class Walker(QThread):
    """Worker thread that indexes the words of a batch of HTML files.

    Each walker reads the files it was given, strips markup, splits the
    text into words and records, for every acceptable word, the names of
    the files containing it in the shared ``filenamesForWords`` mapping.
    Words that appear in more than ``COMMON_WORDS_THRESHOLD`` files are
    moved out of the mapping and into the shared ``commonWords`` set.

    Access to the two shared containers is guarded by the read/write
    ``lock`` passed to the constructor; the ``stopped`` flag is guarded by
    the walker's own mutex.
    """

    # Emitted once at the end of run() with (completed, index).
    # NOTE: this name shadows the ``finished`` signal inherited from QThread.
    finished = pyqtSignal(bool,int)
    # Emitted after each file has been processed with (filename, index).
    indexed = pyqtSignal(str,int)

    # A word found in more than this many files is treated as "common".
    COMMON_WORDS_THRESHOLD = 250
    # Only words whose length lies in [MIN_WORD_LEN, MAX_WORD_LEN] are indexed.
    MIN_WORD_LEN = 3
    MAX_WORD_LEN = 25
    # Words starting or ending with one of these characters are skipped.
    INVALID_FIRST_OR_LAST = frozenset("0123456789_")
    STRIPHTML_RE = re.compile(r"<[^>]*?>", re.IGNORECASE|re.MULTILINE)
    ENTITY_RE = re.compile(r"&(\w+?);|&#(\d+?);")
    SPLIT_RE = re.compile(r"\W+", re.IGNORECASE|re.MULTILINE)

    def __init__(self, index, lock, files, filenamesForWords,
                 commonWords, parent=None):
        """Store the batch description and the shared index structures.

        index -- identifier of this walker, echoed back in both signals.
        lock -- read/write lock guarding ``filenamesForWords`` and
            ``commonWords``.
        files -- iterable of file names to index.
        filenamesForWords -- shared mapping of word -> set of file names;
            a missing word must yield an empty set on lookup (the code
            relies on ``mapping[word]`` for words not yet present).
        commonWords -- shared set of words excluded from the index.
        parent -- optional parent object passed to QThread.
        """
        super(Walker, self).__init__(parent)
        self.index = index
        self.lock = lock
        self.files = files
        self.filenamesForWords = filenamesForWords
        self.commonWords = commonWords
        self.stopped = False
        self.mutex = QMutex()
        self.completed = False

    def stop(self):
        """Ask the walker to stop; processFiles() polls this flag."""
        self.mutex.lock()
        try:
            self.stopped = True
        finally:
            self.mutex.unlock()

    def isStopped(self):
        """Return True once stop() has been called."""
        self.mutex.lock()
        try:
            return self.stopped
        finally:
            self.mutex.unlock()

    def run(self):
        """Thread body: index the files, then emit ``finished``."""
        self.processFiles()
        self.stop()
        self.finished.emit(self.completed,self.index)

    def processFiles(self):
        """Index every file in ``self.files``.

        Returns early, leaving ``self.completed`` False, as soon as a stop
        request is seen. Files that cannot be read are reported on stderr
        and skipped. ``indexed`` is emitted after each processed file.
        """
        def unichrFromEntity(match):
            # Replace a named or numeric HTML entity by its character, or
            # by "" when it is unknown.
            text = match.group(match.lastindex)
            if text.isdigit():
                try:
                    return chr(int(text))
                except (ValueError, OverflowError):
                    # Code point outside the Unicode range (or a digit-like
                    # character int() rejects): drop it rather than let the
                    # exception terminate the thread.
                    return ""
            u = html.entities.name2codepoint.get(text)
            return chr(u) if u is not None else ""

        for fname in self.files:
            if self.isStopped():
                return
            try:
                with codecs.open(fname, "r", "UTF8", "ignore") as fh:
                    text = fh.read()
            except EnvironmentError as e:
                sys.stderr.write("Error: {0}\n".format(e))
                continue
            if self.isStopped():
                return
            text = self.STRIPHTML_RE.sub("", text)
            text = self.ENTITY_RE.sub(unichrFromEntity, text)
            text = text.lower()

            # Collect the distinct acceptable words first, without locking.
            candidates = set()
            for word in self.SPLIT_RE.split(text):
                if (self.MIN_WORD_LEN <= len(word) <= self.MAX_WORD_LEN and
                        word[0] not in self.INVALID_FIRST_OR_LAST and
                        word[-1] not in self.INVALID_FIRST_OR_LAST):
                    candidates.add(word)

            # One read-locked pass drops the words already known as common.
            self.lock.lockForRead()
            try:
                words = {word for word in candidates
                         if word not in self.commonWords}
            finally:
                self.lock.unlock()

            if self.isStopped():
                return
            for word in words:
                self.lock.lockForWrite()
                try:
                    # Re-check under the write lock: another walker may have
                    # moved the word to commonWords since the read above, and
                    # looking it up again would re-create its index entry.
                    if word in self.commonWords:
                        continue
                    files = self.filenamesForWords[word]
                    if len(files) > self.COMMON_WORDS_THRESHOLD:
                        del self.filenamesForWords[word]
                        self.commonWords.add(word)
                    else:
                        files.add(str(fname))
                finally:
                    self.lock.unlock()
            self.indexed.emit(fname,self.index)
        self.completed = True
/home/yrd/eric_workspace/chap19/pageindexer_ans.pyw
#!/usr/bin/env python3
import collections
import os
import sys
from PyQt5.QtCore import (QDir, QReadWriteLock, QMutex,Qt)
from PyQt5.QtWidgets import (QApplication, QDialog, QFileDialog, QFrame,
QHBoxLayout, QLCDNumber, QLabel, QLineEdit, QListWidget,
QPushButton, QVBoxLayout)
import walker_ans as walker
def isAlive(qobj):
    """Return True if the wrapped object behind *qobj* still exists.

    ``sip.unwrapinstance`` raises RuntimeError for a wrapper whose
    underlying object has been deleted; that case is reported as False.
    """
    # PyQt5 >= 5.11 ships its sip module as ``PyQt5.sip``; older releases
    # expose a top-level ``sip`` module instead.
    try:
        from PyQt5 import sip
    except ImportError:
        import sip
    try:
        sip.unwrapinstance(qobj)
    except RuntimeError:
        return False
    return True
class Form(QDialog):
    """Dialog that indexes the HTML files under a directory and lets the
    user look up which files contain a given word.

    The files are split into batches of up to 1000 and each batch is handed
    to its own ``walker.Walker`` thread. The word index
    (``filenamesForWords``) and the set of common words (``commonWords``)
    are shared with the walkers and guarded by ``self.lock``.
    """

    def __init__(self, parent=None):
        """Build the widgets, lay them out and connect the signals."""
        super(Form, self).__init__(parent)

        self.mutex = QMutex()
        self.fileCount = 0
        self.filenamesForWords = collections.defaultdict(set)
        self.commonWords = set()
        self.lock = QReadWriteLock()
        self.path = QDir.homePath()

        pathLabel = QLabel("Indexing path:")
        self.pathLabel = QLabel()
        self.pathLabel.setFrameStyle(QFrame.StyledPanel|QFrame.Sunken)
        self.pathButton = QPushButton("Set &Path...")
        self.pathButton.setAutoDefault(False)
        findLabel = QLabel("&Find word:")
        self.findEdit = QLineEdit()
        findLabel.setBuddy(self.findEdit)
        commonWordsLabel = QLabel("&Common words:")
        self.commonWordsListWidget = QListWidget()
        commonWordsLabel.setBuddy(self.commonWordsListWidget)
        filesLabel = QLabel("Files containing the &word:")
        self.filesListWidget = QListWidget()
        filesLabel.setBuddy(self.filesListWidget)
        filesIndexedLabel = QLabel("Files indexed")
        self.filesIndexedLCD = QLCDNumber()
        self.filesIndexedLCD.setSegmentStyle(QLCDNumber.Flat)
        wordsIndexedLabel = QLabel("Words indexed")
        self.wordsIndexedLCD = QLCDNumber()
        self.wordsIndexedLCD.setSegmentStyle(QLCDNumber.Flat)
        commonWordsLCDLabel = QLabel("Common words")
        self.commonWordsLCD = QLCDNumber()
        self.commonWordsLCD.setSegmentStyle(QLCDNumber.Flat)
        self.statusLabel = QLabel("Click the 'Set Path' "
                                  "button to start indexing")
        self.statusLabel.setFrameStyle(QFrame.StyledPanel|QFrame.Sunken)

        topLayout = QHBoxLayout()
        topLayout.addWidget(pathLabel)
        topLayout.addWidget(self.pathLabel, 1)
        topLayout.addWidget(self.pathButton)
        topLayout.addWidget(findLabel)
        topLayout.addWidget(self.findEdit, 1)
        leftLayout = QVBoxLayout()
        leftLayout.addWidget(filesLabel)
        leftLayout.addWidget(self.filesListWidget)
        rightLayout = QVBoxLayout()
        rightLayout.addWidget(commonWordsLabel)
        rightLayout.addWidget(self.commonWordsListWidget)
        middleLayout = QHBoxLayout()
        middleLayout.addLayout(leftLayout, 1)
        middleLayout.addLayout(rightLayout)
        bottomLayout = QHBoxLayout()
        bottomLayout.addWidget(filesIndexedLabel)
        bottomLayout.addWidget(self.filesIndexedLCD)
        bottomLayout.addWidget(wordsIndexedLabel)
        bottomLayout.addWidget(self.wordsIndexedLCD)
        bottomLayout.addWidget(commonWordsLCDLabel)
        bottomLayout.addWidget(self.commonWordsLCD)
        bottomLayout.addStretch()
        layout = QVBoxLayout()
        layout.addLayout(topLayout)
        layout.addLayout(middleLayout)
        layout.addLayout(bottomLayout)
        layout.addWidget(self.statusLabel)
        self.setLayout(layout)

        # One entry per started walker; completed[i] belongs to walker i.
        self.walkers = []
        self.completed = []
        self.pathButton.clicked.connect(self.setPath)
        self.findEdit.returnPressed.connect(self.find)
        self.setWindowTitle("Page Indexer")

    def stopWalkers(self):
        """Ask every running walker to stop, wait for them, and forget them."""
        # First signal all of them, then wait, so they wind down in parallel.
        for thread in self.walkers:
            if isAlive(thread) and thread.isRunning():
                thread.stop()
        for thread in self.walkers:
            if isAlive(thread) and thread.isRunning():
                thread.wait()
        self.walkers = []
        self.completed = []

    def setPath(self):
        """Let the user pick a directory and start indexing its HTML files.

        Any indexing in progress is stopped first. The files ending in
        ".htm" or ".html" found under the chosen path are dispatched to
        walkers in batches of up to 1000.
        """
        self.stopWalkers()
        self.pathButton.setEnabled(False)
        path = QFileDialog.getExistingDirectory(self,
                    "Choose a Path to Index", self.path)
        if not path:
            self.statusLabel.setText("Click the 'Set Path' "
                                     "button to start indexing")
            self.pathButton.setEnabled(True)
            return
        self.statusLabel.setText("Scanning directories...")
        QApplication.processEvents() # Needed for Windows
        self.path = QDir.toNativeSeparators(path)
        self.findEdit.setFocus()
        self.pathLabel.setText(self.path)
        self.statusLabel.clear()
        self.filesListWidget.clear()
        # The common words of a previous run do not apply to the new path.
        self.commonWordsListWidget.clear()
        self.fileCount = 0
        self.filenamesForWords = collections.defaultdict(set)
        self.commonWords = set()
        nofilesfound = True
        files = []
        index = 0
        for root, _dirs, fnames in os.walk(str(self.path)):
            for name in fnames:
                if not name.endswith((".htm", ".html")):
                    continue
                files.append(os.path.join(root, name))
                if len(files) == 1000:
                    self.processFiles(index, files[:])
                    files = []
                    index += 1
                    nofilesfound = False
        if files:
            # Remaining partial batch.
            self.processFiles(index, files[:])
            nofilesfound = False
        if nofilesfound:
            self.finishedIndexing()
            self.statusLabel.setText(
                    "No HTML files found in the given path")

    def processFiles(self, index, files):
        """Create, register and start a walker for one batch of files.

        index -- position of this walker in ``self.walkers`` /
            ``self.completed``; the walker echoes it in its signals.
        files -- list of file names for the walker to index.
        """
        thread = walker.Walker(index, self.lock, files,
                self.filenamesForWords, self.commonWords, self)
        thread.indexed[str,int].connect(self.indexed)
        thread.finished[bool,int].connect(self.finished)
        thread.finished.connect(thread.deleteLater)
        self.walkers.append(thread)
        self.completed.append(False)
        thread.start()
        thread.wait(300) # Needed for Windows

    def find(self):
        """Look up the word typed in the find box and list matching files.

        Only the first whitespace-separated token is used, lower-cased.
        The outcome is reported in the status label.
        """
        word = str(self.findEdit.text())
        if not word:
            try:
                self.mutex.lock()
                self.statusLabel.setText("Enter a word to find in files")
            finally:
                self.mutex.unlock()
            return
        try:
            self.mutex.lock()
            self.statusLabel.clear()
            self.filesListWidget.clear()
        finally:
            self.mutex.unlock()
        word = word.lower()
        if " " in word:
            word = word.split()[0]
        try:
            self.lock.lockForRead()
            found = word in self.commonWords
        finally:
            self.lock.unlock()
        if found:
            try:
                self.mutex.lock()
                self.statusLabel.setText("Common words like '{0}' "
                        "are not indexed".format(word))
            finally:
                self.mutex.unlock()
            return
        try:
            self.lock.lockForRead()
            # .get() avoids creating an entry for an unknown word; the copy
            # lets the lock be released before the result is used.
            files = self.filenamesForWords.get(word, set()).copy()
        finally:
            self.lock.unlock()
        if not files:
            try:
                self.mutex.lock()
                self.statusLabel.setText("No indexed file contains "
                        "the word '{0}'".format(word))
            finally:
                self.mutex.unlock()
            return
        files = [QDir.toNativeSeparators(name) for name in
                 sorted(files, key=str.lower)]
        try:
            self.mutex.lock()
            self.filesListWidget.addItems(files)
            self.statusLabel.setText(
                    "{0} indexed files contain the word '{1}'".format(
                    len(files), word))
        finally:
            self.mutex.unlock()

    def indexed(self, fname, index):
        """Slot for Walker.indexed: count the file and refresh the display.

        The LCD counters are refreshed every 25 files; otherwise the common
        words list is refreshed every 101 files. ``index`` is not used.
        """
        try:
            self.mutex.lock()
            self.statusLabel.setText(fname)
            self.fileCount += 1
            count = self.fileCount
        finally:
            self.mutex.unlock()
        if count % 25 == 0:
            try:
                self.lock.lockForRead()
                indexedWordCount = len(self.filenamesForWords)
                commonWordCount = len(self.commonWords)
            finally:
                self.lock.unlock()
            try:
                self.mutex.lock()
                self.filesIndexedLCD.display(count)
                self.wordsIndexedLCD.display(indexedWordCount)
                self.commonWordsLCD.display(commonWordCount)
            finally:
                self.mutex.unlock()
        elif count % 101 == 0:
            try:
                self.lock.lockForRead()
                words = self.commonWords.copy()
            finally:
                self.lock.unlock()
            try:
                self.mutex.lock()
                self.commonWordsListWidget.clear()
                self.commonWordsListWidget.addItems(sorted(words))
            finally:
                self.mutex.unlock()

    def finished(self, completed, index):
        """Slot for Walker.finished: mark walker ``index`` as done.

        When every registered walker is done (or none is registered), the
        status label shows "Finished" and the final counts are displayed.
        ``completed`` is not used.
        """
        done = False
        if self.walkers:
            # Ignore an index that does not belong to the current batch
            # list (e.g. a late signal from walkers that were discarded)
            # instead of raising IndexError.
            # NOTE(review): a late signal whose index is in range would
            # still mark the wrong walker -- confirm whether that can occur.
            if 0 <= index < len(self.completed):
                self.completed[index] = True
            if all(self.completed):
                try:
                    self.mutex.lock()
                    self.statusLabel.setText("Finished")
                    done = True
                finally:
                    self.mutex.unlock()
        else:
            try:
                self.mutex.lock()
                self.statusLabel.setText("Finished")
                done = True
            finally:
                self.mutex.unlock()
        if done:
            self.finishedIndexing()

    def reject(self):
        """Stop indexing if it is still running; otherwise close the dialog."""
        if not all(self.completed):
            self.stopWalkers()
            self.finishedIndexing()
        else:
            self.accept()

    def closeEvent(self, event=None):
        """Stop the walkers when the dialog is closed."""
        self.stopWalkers()

    def finishedIndexing(self):
        """Show the final counts and common words; re-enable 'Set Path'."""
        try:
            self.lock.lockForRead()
            indexedWordCount = len(self.filenamesForWords)
            commonWords = sorted(self.commonWords)
        finally:
            self.lock.unlock()
        self.filesIndexedLCD.display(self.fileCount)
        self.wordsIndexedLCD.display(indexedWordCount)
        self.commonWordsLCD.display(len(commonWords))
        # Keep the list in step with the LCD: during indexing it is only
        # refreshed at certain file counts, so it may be out of date here.
        self.commonWordsListWidget.clear()
        self.commonWordsListWidget.addItems(commonWords)
        self.pathButton.setEnabled(True)
        QApplication.processEvents() # Needed for Windows
# Script entry: create the Qt application, show the indexer dialog and
# run the event loop until the dialog is closed.
app = QApplication(sys.argv)
form = Form()
form.show()
app.exec_()
运行结果:
来源:https://blog.csdn.net/xiaoyangyang20/article/details/71375466
0
投稿
猜你喜欢
- 描述的意思是HTML为中心的前端开发也差不多是web标准的意思。1.HTML是基础2.CSS依靠选择符提供视觉;3.Javascript 依
- 首先:文章用到的解析库介绍BeautifulSoup:Beautiful Soup提供一些简单的、python式的函数用来处理导航、搜索、修
- 本章是前一章的延续,我们使用RSA算法逐步实现加密,并详细讨论它.用于解密密文的函数是as跟随 :def decrypt(ciph
- INI是微软Windows操作系统中的文件扩展名。这些字母表示初始化。正如该术语所表示的,INI文件被用来对操作系统或特定程序初始化或进行参
- 一、安装SSL证书的环境Apache安装目录:E:phpStudyPHPTutorialApache以上为windows下测试SSL证书安装
- 手写数字识别算法import pandas as pdimport numpy as npfrom sklearn.neural_netwo
- 1、使用mysqldump工具将MySql数据库备份mysqldump -u root -p -c --default-character-
- 根据GB11643-1999公民身份证号码是特征组合码,由十七位数字本体码和一位数字校验码组成,排列顺序从左至右依次为:六位数字地址码八位数
- 在开发的过程中,几乎不可能一次性就能写出毫无破绽的程序,断点调试代码是一个普遍的需求。作为前端开发工程师,以往我们开发的JavaScript
- 下面看下python调用函数加括号和不加括号的区别,具体代码如下所示; def bracket(data):return dat
- 本文实例讲述了Python面向对象实现一个对象调用另一个对象操作。分享给大家供大家参考,具体如下:我先总结一下python中的类的特点:1.
- 理解 CPU 工作原理,重要的是理解 pc 不停地自增地址,顺序执行程序指令。当遇到跳转指令时,会将 pc 重置为新地址。在顺序执行程序指令
- 问题创建一个二叉树二叉树有限多个节点的集合,这个集合可能是:空集由一个根节点,和两棵互不相交的,分别称作左子树和右子树的二叉树组成创建二叉树
- 当然可以,我们使用强大的fso对象来获取文件夹的大小请敲入如下代码即可:<%Set MyFileSize =&nb
- 引言经过函数学习之后我们会发现函数不被调用是不会直接执行的,我们在之前的函数调用之后发现运行的结果都是函数体内print()打印出来的结果,
- asp如何将RGB颜色转化成到16进制的?在R G B中输入小于255的数字点击观看即可转换成#开通的16进制。代码如下:<%R_RG
- # encoding:utf-8import re # 使用正则 匹配想要的数据import requests # 使用requests得到
- 本文实例为大家分享了python实现图像拼接的具体代码,供大家参考,具体内容如下一、效果 二、代码1、单张图片拼接# 图片拼接fr
- 大家一定使用过 phpmyadmin 里面的数据库导入,导出功能,非常方便。但是在实际应用中,我发现如下几个问题:1、数据库超过一定尺寸,比
- ThinkPHP CURD方法的limit方法也是模型类的连贯操作方法之一,主要用于指定查询和操作的数量,特别在分页查询的时候使用较多。并且