Skip to content

Commit 8cd7078

Browse files
committed
A lot of bugs fixed and two new features: Dou Tu and GaTextHook.
1 parent 3a8503b commit 8cd7078

15 files changed

+377
-10
lines changed

ActivityInfo.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from utilities import *
33
from itchat.content import *
44
from ProcessInterface import ProcessInterface
5-
from pymongo import MongoClient, DESCENDING
5+
from pymongo import DESCENDING
66
import matplotlib
77
matplotlib.use('Agg')
88
import matplotlib.pyplot as pp
@@ -26,7 +26,7 @@ class ActivityInfo(ProcessInterface):
2626
def __init__(self, fontPath):
2727
if not os.path.exists(self.imgDir):
2828
os.mkdir(self.imgDir)
29-
self.client = MongoClient()
29+
self.client = client
3030
self.coll = self.client[dbName][collName]
3131
self.prop = FontProperties(fname=fontPath)
3232
logging.info('ActivityInfo initialized.')
@@ -83,4 +83,4 @@ def generateTmpFileName(self):
8383

8484
if __name__ == '__main__':
8585
ai = ActivityInfo('/usr/share/fonts/truetype/wqy/wqy-microhei.ttc')
86-
ai.generateActivityInfoForGroup('💦人美三观正之嘴炮无下限')
86+
ai.generateActivityInfoForGroup('TestGroup')

DoutuProcessor.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from time import time
2+
from datetime import datetime
3+
from ProcessInterface import ProcessInterface
4+
from utilities import *
5+
from itchat.content import *
6+
from subprocess import call
7+
from ImageSearcher import ImageSearcher
8+
from threading import Thread
9+
from time import time, sleep
10+
import logging
11+
import itchat
12+
import re
13+
import os
14+
15+
def DoutuEnd(destinationChatroomId):
    """Block for one Doutu interval, then announce the end of the session.

    destinationChatroomId -- id of the chat that receives the closing notice.

    Intended to run on its own thread, since it sleeps for the whole
    DoutuProcessor.doutuTimeInterval before sending anything.
    """
    sessionLength = DoutuProcessor.doutuTimeInterval
    sleep(sessionLength)
    closingNotice = '时间到, 斗图结束。'
    itchat.send(closingNotice, destinationChatroomId)
18+
19+
class DoutuProcessor(ProcessInterface):
    """Answer pictures posted in a group chat with a similar-looking image.

    A group is in "Doutu mode" for ``doutuTimeInterval`` seconds after someone
    sends the text ``/doutu``. Groups whose name is in the whitelist are
    always in that mode.
    """

    doutuTimeInterval = 5 * 60  # seconds

    def __init__(self, doutuFeatureFn, whitelist=None):
        """Prepare the working folder and the image searcher.

        doutuFeatureFn -- path of the feature file, handed to ImageSearcher.
        whitelist      -- optional iterable of group names that never need
                          the '/doutu' activation.
        """
        # Folder that holds the candidate reply images. The directory shipped
        # with this project is spelled 'DoutuRobot', which matters on
        # case-sensitive filesystems.
        self.imgFolder = 'DoutuRobot/dat/gifs/'
        # Folder where incoming pictures are stored before searching.
        self.doutuFolder = 'DoutuImages'
        # None instead of a shared mutable default; always stored as a set.
        self.whitelist = set(whitelist) if whitelist is not None else set()
        # groupName -> timestamp until which Doutu mode stays active.
        self.activationTime = {}
        if not os.path.exists(self.doutuFolder):
            os.mkdir(self.doutuFolder)
        self.imageSearcher = ImageSearcher(doutuFeatureFn)
        logging.info('DoutuProcessor initialized.')

    def process(self, msg, type):
        """Handle one incoming message.

        msg  -- message dict; reads 'User', 'FromUserName', 'ToUserName',
                'Content', 'FileName' and 'Text'.
        type -- content type constant (compared against TEXT and PICTURE).

        Returns None. Replies are sent through itchat as a side effect.
        """
        groupName = msg['User']['NickName']
        # Reply to whichever side of the message carries the '@@' marker
        # (presumably the chatroom id -- confirm against itchat).
        sender = msg['FromUserName']
        destinationChatroomId = sender if '@@' in sender else msg['ToUserName']

        # Mode management
        if type == TEXT and msg['Content'] == '/doutu':
            self.activationTime[groupName] = time() + self.doutuTimeInterval
            itchat.send('鸭哥进入斗图模式! {0}分钟内群里所有照片和表情(除了商城表情),鸭哥都会回复斗图!'.format(int(self.doutuTimeInterval / 60)), destinationChatroomId)
            # Announce the end of the session from a background thread.
            Thread(target=DoutuEnd, args=[destinationChatroomId]).start()
            return
        if type != PICTURE:
            return
        # Whitelisted groups skip the mode check; others need a live activation.
        if groupName not in self.whitelist:
            if self.activationTime.get(groupName, 0) <= time():
                return

        logging.info('[Doutu] Begin processing...')
        fn = msg['FileName']
        newfn = os.path.join(self.doutuFolder, fn)
        # msg['Text'] is called with the target file name (presumably itchat's
        # download helper); the file is then moved into the working folder.
        msg['Text'](fn)
        os.rename(fn, newfn)
        newfnjpg = newfn + '.jpg'
        # '[0]' asks ImageMagick for the first frame only.
        call(['convert', '{0}[0]'.format(newfn), newfnjpg])
        if not os.path.exists(newfnjpg):
            itchat.send('鸭哥没办法和腾讯表情商城的表情斗图。。', destinationChatroomId)
            logging.info('[Doutu] imagemagick failed.')
            return
        logging.info('[Doutu] imagemagick succeeded.')

        doutufn = self.imageSearcher.search(newfnjpg)
        doutufn = os.path.join(self.imgFolder, doutufn)
        itchat.send('@img@{0}'.format(doutufn), destinationChatroomId)
        logging.info('Doutu! {0} => {1}.'.format(newfn, doutufn))
67+
68+
if __name__ == '__main__':
    # Manual smoke test: building the processor loads the feature file.
    featureFile = './DoutuFeatures.txt'
    processor = DoutuProcessor(featureFile)

DoutuRobot/README.md

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
## 斗图功能
2+
3+
目前对这个功能并没有官方支持。
4+
这个文档只是为感兴趣的读者做一个参考。
5+
要想部署使用这个系统,需要一些深度学习的知识和经验,并且需要读一下代码。
6+
7+
系统的基本框架是,用Caffe把所有图片的feature抽出来,构成一个数据库。
8+
新的图片进来以后,抽feature,在这个数据库里面进行检索。
9+
最接近的几个图里面随机挑选一个返回。
10+
11+
### 训练
12+
13+
* 安装Caffe。`installCaffe.sh`可以作为一个参考。几个要点:不要忘了`make pycaffe`;用OpenBLAS启用多线程可以减小Latency;如果有GPU的话可以大幅加速。
14+
* 执行`dedupAndCopy.sh`转换文件格式。
15+
* 执行`incrementalExtractFeatures.sh`抽取feature。
16+
* 把生成的feature文件`featuresall.txt`拷贝到父目录,并且在main.py里面指定`DoutuProcessor`的文件路径。

DoutuRobot/dedupAndCopy.sh

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copy every GIF found in directory $1 into dat/gifs, keeping a single file
# per distinct MD5 checksum.
DedupAndCopyFile() {
    # rsync only creates the last component of a missing destination path.
    mkdir -p dat/gifs
    # "sort -k1,1 -u" keeps one line per checksum; awk and sed then reduce
    # each line to the bare file name.
    # NOTE: file names containing whitespace are truncated by the awk step.
    LC_ALL=C md5sum "$1"/*.gif | sort -k1,1 -u | awk '{print $2;}' | sed 's/^.*\///' > files.txt
    rsync -av --files-from=files.txt "$1" dat/gifs
    rm files.txt
}

DedupAndCopyFile '../HistoryImages'

# Convert to jpg for training use
mkdir -p dat/jpgs
# Names are passed NUL-separated and reach the inner shell as "$1", so they
# are never interpreted as shell source. "[0]" selects the first frame.
find dat/gifs -mindepth 1 -maxdepth 1 -printf '%f\0' | xargs -0 -P4 -I{} bash -c '
    name="$1"
    echo "$name"
    if [ ! -e "dat/jpgs/$name.jpg" ]; then
        convert "dat/gifs/$name[0]" "dat/jpgs/$name.jpg"
    fi
' _ {}

# Generate files list for Caffe use (regular files only, not the directory)
find dat/jpgs -type f > files.txt

DoutuRobot/extractFeatures.py

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import numpy as np
2+
import os, sys, getopt
3+
4+
# Root of the Caffe checkout; every path below is resolved against it.
caffe_root = './caffe/'

# GoogLeNet network definition (prototxt).
model_prototxt = os.path.join(caffe_root, 'models/bvlc_googlenet/deploy.prototxt')

# GoogLeNet trained weights (caffemodel).
model_trained = os.path.join(caffe_root, 'models/bvlc_googlenet/bvlc_googlenet.caffemodel')

# Text file with the class labels, one per line.
imagenet_labels = os.path.join(caffe_root, 'data/ilsvrc12/synset_words.txt')

# Mean image used when preprocessing the input.
mean_path = os.path.join(caffe_root, 'python/caffe/imagenet/ilsvrc_2012_mean.npy')

# Layer whose activations are written out as the feature vector.
layer_name = 'pool5/7x7_s1'
21+
22+
sys.path.insert(0, caffe_root + 'python')
23+
import caffe
24+
25+
def main(argv):
    """Write one feature vector per image listed in the input file.

    argv -- command-line arguments without the program name. Recognised
            options:
              -h              print usage and exit
              -i / --ifile    text file with one image path per line
              -o / --ofile    output file; one line per image holding the
                              image path, a TAB, and the space-separated
                              feature values

    Exits with status 2 on a bad option or when either file is missing from
    the command line. Images that fail to load or classify are reported and
    skipped.
    """
    usage = 'extractFeatures.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''

    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        # Tuples, so the long spellings accepted by getopt are honoured too.
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Both files are required; fail early instead of trying to open ''.
    if not inputfile or not outputfile:
        print(usage)
        sys.exit(2)

    print('Reading images from "{0}"'.format(inputfile))
    print('Writing vectors to "{0}"'.format(outputfile))

    # Setting this to CPU, but feel free to use GPU if you have CUDA installed
    caffe.set_mode_cpu()
    # Loading the Caffe model, setting preprocessing parameters
    net = caffe.Classifier(model_prototxt, model_trained,
                           mean=np.load(mean_path).mean(1).mean(1),
                           channel_swap=(2, 1, 0),
                           raw_scale=255,
                           image_dims=(256, 256))

    # Loading class labels
    with open(imagenet_labels) as f:
        labels = f.readlines()

    # To inspect the network layers (names and sizes) and pick another layer:
    #   print([(k, v.data.shape) for k, v in net.blobs.items()])

    # Process one image at a time: print the prediction and write the vector.
    # Mode 'w' already starts from an empty output file.
    with open(inputfile, 'r') as reader, open(outputfile, 'w') as writer:
        for image_path in reader:
            try:
                image_path = image_path.strip()
                input_image = caffe.io.load_image(image_path)
                prediction = net.predict([input_image], oversample=False)
                best = prediction[0].argmax()
                print(os.path.basename(image_path), ' : ', labels[best].strip(), ' (', prediction[0][best], ')')
                # Flatten the chosen layer's activations into one row.
                feature = net.blobs[layer_name].data[0].reshape(1, -1)
                featureTxt = ' '.join([str(x) for x in feature.tolist()[0]])
                writer.write('{0}\t{1}\n'.format(image_path, featureTxt))
            except Exception as e:
                # Deliberately best-effort: one bad image must not abort the run.
                print(e)
                print('ERROR: skip {0}.'.format(image_path))
81+
82+
if __name__ == "__main__":
    # Drop the program name; main() expects only the option arguments.
    cliArgs = sys.argv[1:]
    main(cliArgs)

DoutuRobot/incrementalExtractFeatures.sh

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Generate files.txt: the jpgs that have no entry in featuresall.txt yet.
# On the very first run there is no feature database; start from an empty one.
touch featuresall.txt
# The first tab-separated column of featuresall.txt is the image name with
# the "dat/jpgs/" prefix and ".jpg" suffix stripped (see the sed calls below).
awk -F'\t' '{ printf("dat/jpgs/%s.jpg\n", $1); }' < ./featuresall.txt | sort > extractedFiles.txt
find dat/jpgs -type f | sort > allFiles.txt
# Lines only present in allFiles.txt are the images still to be processed.
comm -23 allFiles.txt extractedFiles.txt > files.txt
rm allFiles.txt
rm extractedFiles.txt
# Read from stdin so wc prints the bare count without the file name.
lines=$(wc -l < files.txt)
echo "files.txt generated, with $lines lines."

# Invoke Caffe to extract features
python3 -u ./extractFeatures.py -i files.txt -o newFeatures.txt
# Reduce each path to the bare image name used as key in featuresall.txt.
sed -i 's/dat\/jpgs\///' newFeatures.txt
sed -i 's/\.jpg\t/\t/' newFeatures.txt
# Keep a backup before appending the new vectors.
cp featuresall.txt featuresall.txt.bak
cat newFeatures.txt >> featuresall.txt

DoutuRobot/installCaffe.sh

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Dependencies
sudo apt-get install -y libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler
sudo apt-get install -y --no-install-recommends libboost-all-dev
sudo apt-get install -y libatlas-base-dev
sudo apt-get install -y python3-dev
sudo apt-get install -y libgoogle-glog-dev liblmdb-dev

# Caffe
# Reuse an existing checkout so the script can be run again.
[ -d caffe ] || git clone https://github.com/BVLC/caffe
# Everything below must happen inside the checkout; stop if it is missing.
cd caffe || exit 1
cp Makefile.config.example Makefile.config
echo "ALSO NEED TO MODIFY THE FILE IF YOU WANT CPU_ONLY"
# Pause until Enter is pressed so Makefile.config can be edited first.
read -r _

# Debian only
echo 'INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial/' >> Makefile.config
echo 'LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config

make all -j4
# Python bindings; extractFeatures.py imports caffe from ./caffe/python.
make pycaffe -j4

GaTextHook.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from itchat.content import *
4+
from ProcessInterface import ProcessInterface
5+
from utilities import *
6+
from time import time, sleep
7+
from threading import Timer
8+
from datetime import datetime
9+
import itchat
10+
import re
11+
import logging
12+
13+
def clearGaNumDict():
    """Reset every group's Ga counter and arm the next reset.

    Replaces the in-memory GaTextHook.gaNumDict with an empty dict, deletes
    all documents of the Ga collection, logs the result, and schedules the
    following reset. The next reset is scheduled even when the database call
    raises, so one failure does not end the daily cycle; the exception still
    propagates afterwards.
    """
    try:
        GaTextHook.gaNumDict = {}
        # An empty filter matches every document, and remove() deletes all
        # matches by default, so no extra argument is needed.
        client[dbName][gaCollName].remove({})
        logging.info('GaNumDict cleared. GaNumDict = {0}.'.format(GaTextHook.gaNumDict))
    finally:
        scheduleTimerToClearGaNumDict()
18+
19+
def secondsUntilNextGaClear(now):
    """Return the delay in whole seconds from *now* to just after the next 09:00.

    now -- naive datetime on the same clock as datetime.today().

    The target is 09:00 of the current day when that is still ahead,
    otherwise 09:00 of the following day. One second is added so the timer
    fires after the boundary rather than a fraction before it.
    """
    # Local import: only ``datetime`` itself is imported at file level.
    from datetime import timedelta

    target = now.replace(hour=9, minute=0, second=0, microsecond=0)  # 0:00 in China
    if target <= now:
        # timedelta arithmetic rolls over month and year ends correctly,
        # unlike incrementing the day field by hand.
        target += timedelta(days=1)
    return int((target - now).total_seconds()) + 1


def scheduleTimerToClearGaNumDict():
    """Start a one-shot Timer that runs clearGaNumDict at the next 09:00."""
    Timer(secondsUntilNextGaClear(datetime.today()), clearGaNumDict).start()
25+
26+
# The logic is getting more complicated, so we make it a separate processor.
27+
class GaTextHook(ProcessInterface):
    """Reply with a counted 'Ga' whenever a text message mentions the trigger.

    Per-group counters live in the class-level ``gaNumDict`` and are mirrored
    to the Ga collection. Once a group's counter exceeds ``gaNumMax`` the hook
    stays silent for that group. A separate force trigger always answers, but
    at most once per ``forceTriggerInterval`` seconds per group.
    """

    # groupName -> number of trigger messages counted so far.
    gaNumDict = {}

    def __init__(self, blacklist=None):
        """Load the stored counters and arm the periodic counter reset.

        blacklist -- optional iterable of regular expressions; groups whose
                     name matches any of them are ignored.
        """
        # Copy into a fresh list so no default or caller-owned list is shared.
        self.blacklist = list(blacklist) if blacklist is not None else []
        self.client = client
        self.gaColl = self.client[dbName][gaCollName]
        GaTextHook.gaNumDict = {x['GroupName']: x['CurrentGaNum'] for x in self.gaColl.find()}
        self.gaNumMax = 100
        self.triggerText = '鸭哥'
        self.gaText = '嘎?'
        self.forceTriggerText = '鸭哥嘎一个'
        # groupName -> earliest timestamp at which the next force Ga is allowed.
        self.forceTriggerNextTimestamp = {}
        self.forceTriggerInterval = 5 * 60  # 5 minutes
        self.forceTriggerGaText = '强力嘎!'

        # Set up the clear timer
        scheduleTimerToClearGaNumDict()
        logging.info('GaTextHook initialized.')

    def process(self, msg, type):
        """Handle one incoming message.

        msg  -- message dict; reads 'User', 'Content' and 'FromUserName'.
        type -- content type constant; everything except TEXT is ignored.

        Returns None. Replies are sent through itchat as a side effect.
        """
        if type != TEXT:
            return
        groupName = msg['User']['NickName']
        if any(re.search(pattern, groupName) is not None for pattern in self.blacklist):
            return

        # The force trigger is checked first because its text also contains
        # the normal trigger text.
        if re.search(self.forceTriggerText, msg['Content']):
            currentTime = time()
            gaNextTime = self.forceTriggerNextTimestamp.get(groupName, 0)
            if currentTime < gaNextTime:
                logging.info("Don't force Ga because time {0} < NextTime {1} for group {2}.".format(currentTime, gaNextTime, groupName))
                return
            self.forceTriggerNextTimestamp[groupName] = currentTime + self.forceTriggerInterval
            toSend = self.forceTriggerGaText
            logging.info('{0} => {1}'.format(msg['Content'], toSend))
            itchat.send(toSend, msg['FromUserName'])
            return

        if re.search(self.triggerText, msg['Content']):
            # Count the trigger and persist the new value before deciding
            # whether to answer.
            GaTextHook.gaNumDict[groupName] = GaTextHook.gaNumDict.get(groupName, 0) + 1
            self.gaColl.update({'GroupName': groupName}, {'$set': {'CurrentGaNum': GaTextHook.gaNumDict[groupName]}}, upsert=True)
            if GaTextHook.gaNumDict[groupName] > self.gaNumMax:
                logging.info("Don't Ga because GaNum {0} exceeds max {1} for group {2}.".format(GaTextHook.gaNumDict[groupName], self.gaNumMax, groupName))
                return
            toSend = '{0} x{1}'.format(self.gaText, GaTextHook.gaNumDict[groupName])
            logging.info('{0} => {1}'.format(msg['Content'], toSend))
            itchat.send(toSend, msg['FromUserName'])
76+
77+
if __name__ == '__main__':
    # Manual smoke test: configure logging, then build the hook once.
    logFormat = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(level=logging.INFO, format=logFormat)
    hook = GaTextHook()

GroupTagCloud.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -96,5 +96,5 @@ def generateTmpFileName(self):
9696

9797
if __name__ == '__main__':
9898
groupTagCloud = GroupTagCloud('/usr/share/fonts/truetype/wqy/wqy-microhei.ttc')
99-
groupTagCloud.generateTagCloudForGroup('知乎万粉俱乐部', '鸭哥')
100-
groupTagCloud.generateTagCloudForGroupV2('知乎万粉俱乐部', '鸭哥')
99+
groupTagCloud.generateTagCloudForGroup('TestGroup', '鸭哥')
100+
groupTagCloud.generateTagCloudForGroupV2('TestGroup', '鸭哥')

HistoryRecorder.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
from time import time
22
from datetime import datetime
3-
from pymongo import MongoClient
43
from ProcessInterface import ProcessInterface
54
from utilities import *
65
from itchat.content import *
76
import os
87

98
class HistoryRecorder(ProcessInterface):
109
def __init__(self):
11-
self.client = MongoClient()
10+
self.client = client
1211
self.coll = self.client[dbName][collName]
1312
self.imgFolder = 'HistoryImages'
1413
if not os.path.exists(self.imgFolder):

0 commit comments

Comments
 (0)