Fixed a lot of bugs and added two new features: Dou Tu and GaTextHook.

grapeot · grapeot · commit 8cd7078e2059 · 2017-04-06T22:06:39.000-07:00
diff --git a/ActivityInfo.py b/ActivityInfo.py
@@ -2,7 +2,7 @@
 from utilities import *
 from itchat.content import *
 from ProcessInterface import ProcessInterface
-from pymongo import MongoClient, DESCENDING
+from pymongo import DESCENDING
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as pp
@@ -26,7 +26,7 @@ class ActivityInfo(ProcessInterface):
     def __init__(self, fontPath):
         if not os.path.exists(self.imgDir):
             os.mkdir(self.imgDir)
-        self.client = MongoClient()
+        self.client = client
         self.coll = self.client[dbName][collName]
         self.prop = FontProperties(fname=fontPath)
         logging.info('ActivityInfo initialized.')
@@ -83,4 +83,4 @@ def generateTmpFileName(self):
 
 if __name__ == '__main__':
     ai = ActivityInfo('/usr/share/fonts/truetype/wqy/wqy-microhei.ttc')
-    ai.generateActivityInfoForGroup('💦人美三观正之嘴炮无下限')
+    ai.generateActivityInfoForGroup('TestGroup')
diff --git a/DoutuProcessor.py b/DoutuProcessor.py
@@ -0,0 +1,69 @@
+from time import time
+from datetime import datetime
+from ProcessInterface import ProcessInterface
+from utilities import *
+from itchat.content import *
+from subprocess import call
+from ImageSearcher import ImageSearcher
+from threading import Thread
+from time import time, sleep
+import logging
+import itchat
+import re
+import os
+
+def DoutuEnd(destinationChatroomId):
+    sleep(DoutuProcessor.doutuTimeInterval)
+    itchat.send('时间到， 斗图结束。', destinationChatroomId)
+
+class DoutuProcessor(ProcessInterface):
+    doutuTimeInterval = 5 * 60   # seconds
+    
+    def __init__(self, doutuFeatureFn, whitelist=[]):
+        self.imgFolder = 'DouTuRobot/dat/gifs/'
+        self.doutuFolder = 'DoutuImages'
+        self.whitelist = set(whitelist)
+        self.activationTime = {}
+        if not os.path.exists(self.doutuFolder):
+            os.mkdir(self.doutuFolder)
+        self.imageSearcher = ImageSearcher(doutuFeatureFn)
+        logging.info('DoutuProcessor initialized.')
+
+    def process(self, msg, type):
+        # Mode management
+        groupName = msg['User']['NickName']
+        destinationChatroomId = msg['FromUserName'] if re.search('@@', msg['FromUserName']) else msg['ToUserName']
+        if type == TEXT and msg['Content'] == '/doutu':
+            # Control mode
+            self.activationTime[groupName] = time() + self.doutuTimeInterval
+            itchat.send('鸭哥进入斗图模式！ {0}分钟内群里所有照片和表情（除了商城表情），鸭哥都会回复斗图！'.format(int(self.doutuTimeInterval / 60)), destinationChatroomId)
+            Thread(target=DoutuEnd, args=[destinationChatroomId]).start()
+            return
+        if type != PICTURE:
+            return
+        # If in whitelist. skip the mode check. Otherwise check the activation time.
+        if groupName not in self.whitelist:
+            if groupName not in self.activationTime or self.activationTime [groupName] <= time():
+                return
+
+        logging.info('[Doutu] Begin processing...')
+        fn = msg['FileName']
+        newfn = os.path.join(self.doutuFolder, fn)
+        msg['Text'](fn)
+        os.rename(fn, newfn)
+        newfnjpg = newfn + '.jpg'
+        call(['convert', '{0}[0]'.format(newfn), newfnjpg])
+        if os.path.exists(newfnjpg):
+            logging.info('[Doutu] imagemagick succeeded.')
+        else:
+            itchat.send('鸭哥没办法和腾讯表情商城的表情斗图。。', destinationChatroomId)
+            logging.info('[Doutu] imagemagick failed.')
+            return
+
+        doutufn = self.imageSearcher.search(newfnjpg)
+        doutufn = os.path.join(self.imgFolder, doutufn)
+        itchat.send('@img@{0}'.format(doutufn), destinationChatroomId)
+        logging.info('Doutu! {0} => {1}.'.format(newfn, doutufn))
+
+if __name__ == '__main__':
+    processor = DoutuProcessor('./DoutuFeatures.txt')
diff --git a/DoutuRobot/README.md b/DoutuRobot/README.md
@@ -0,0 +1,16 @@
+## 斗图功能
+
+目前对这个功能并没有官方支持。
+这个文档只是为感兴趣的读者做一个参考。
+要想部署使用这个系统，需要一些深度学习的知识和经验，并且需要读一下代码。
+
+系统的基本框架是，用Caffe把所有图片的feature抽出来，构成一个数据库。
+新的图片进来以后，抽feature，在这个数据库里面进行检索。
+最接近的几个图里面随机挑选一个返回。
+
+### 训练
+
+* 安装Caffe。`installCaffe.sh`可以作为一个参考。几个要点：不要忘了`make pycaffe`；用OpenBLAS启用多线程可以减小Latency；如果有GPU的话可以大幅加速。
+* 执行`dedupAndCopy.sh`转换文件格式。
+* 执行`incrementalExtractFeatures.sh`抽取feature。
+* 把生成的feature文件`featuresall.txt`拷贝到父目录，并且在main.py里面指定`DoutuProcessor`的文件路径。
diff --git a/DoutuRobot/dedupAndCopy.sh b/DoutuRobot/dedupAndCopy.sh
@@ -0,0 +1,12 @@
+# Copy the *.gif files of directory $1 into dat/gifs, keeping a single file per
+# distinct MD5 checksum. Uses a temporary files.txt in the current directory.
+DedupAndCopyFile() {
+    # "$1" is quoted so a directory name with spaces or glob characters stays one word.
+    LC_ALL=C md5sum "$1"/*.gif | sort -k1,1 -u | awk '{print $2;}' | sed 's/^.*\///' > files.txt
+    rsync -av --files-from=files.txt "$1" dat/gifs
+    rm files.txt
+}
+
+DedupAndCopyFile '../HistoryImages'
+
+# Convert to jpg for training use
+# convert does not create missing directories, so make sure the target exists.
+mkdir -p dat/jpgs
+ls ./dat/gifs | xargs -n1 -I{} -P4 bash -c 'echo {}; if [ ! -e "dat/jpgs/{}.jpg" ]; then convert "dat/gifs/{}[0]" "dat/jpgs/{}.jpg"; fi'
+# Generate files list for Caffe use
+# -type f keeps the dat/jpgs directory entry itself out of the list.
+find dat/jpgs -type f > files.txt
diff --git a/DoutuRobot/extractFeatures.py b/DoutuRobot/extractFeatures.py
@@ -0,0 +1,83 @@
+import numpy as np
+import os, sys, getopt
+
+# Main path to your caffe installation
+caffe_root = './caffe/'
+
+# Model prototxt file
+model_prototxt = caffe_root + 'models/bvlc_googlenet/deploy.prototxt'
+
+# Model caffemodel file
+model_trained = caffe_root + 'models/bvlc_googlenet/bvlc_googlenet.caffemodel'
+
+# File containing the class labels
+imagenet_labels = caffe_root + 'data/ilsvrc12/synset_words.txt'
+
+# Path to the mean image (used for input processing)
+mean_path = caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy'
+
+# Name of the layer we want to extract
+layer_name = 'pool5/7x7_s1'
+
+sys.path.insert(0, caffe_root + 'python')
+import caffe
+
+def main(argv):
+    inputfile = ''
+    outputfile = ''
+
+    try:
+        opts, args = getopt.getopt(argv,"hi:o:",["ifile=","ofile="])
+    except getopt.GetoptError:
+        print('caffe_feature_extractor.py -i <inputfile> -o <outputfile>')
+        sys.exit(2)
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print('caffe_feature_extractor.py -i <inputfile> -o <outputfile>')
+            sys.exit()
+        elif opt in ("-i"):
+            inputfile = arg
+        elif opt in ("-o"):
+            outputfile = arg
+
+    print('Reading images from "', inputfile)
+    print('Writing vectors to "', outputfile)
+
+    # Setting this to CPU, but feel free to use GPU if you have CUDA installed
+    caffe.set_mode_cpu()
+    # Loading the Caffe model, setting preprocessing parameters
+    net = caffe.Classifier(model_prototxt, model_trained,
+                           mean=np.load(mean_path).mean(1).mean(1),
+                           channel_swap=(2,1,0),
+                           raw_scale=255,
+                           image_dims=(256, 256))
+
+    # Loading class labels
+    with open(imagenet_labels) as f:
+        labels = f.readlines()
+
+    # This prints information about the network layers (names and sizes)
+    # You can uncomment this, to have a look inside the network and choose which layer to print
+    #print [(k, v.data.shape) for k, v in net.blobs.items()]
+    #exit()
+
+    # Processing one image at a time, printint predictions and writing the vector to a file
+    with open(inputfile, 'r') as reader:
+        with open(outputfile, 'w') as writer:
+            writer.truncate()
+            for image_path in reader:
+                try:
+                    image_path = image_path.strip()
+                    input_image = caffe.io.load_image(image_path)
+                    prediction = net.predict([input_image], oversample=False)
+                    print(os.path.basename(image_path), ' : ' , labels[prediction[0].argmax()].strip() , ' (', prediction[0][prediction[0].argmax()] , ')')
+                    feature = net.blobs[layer_name].data[0].reshape(1,-1)
+                    featureTxt = ' '.join([ str(x) for x in feature.tolist()[0] ])
+                    writer.write('{0}\t{1}\n'.format(image_path, featureTxt))
+                except Exception as e:
+                    print(e)
+                    print('ERROR: skip {0}.'.format(image_path))
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/DoutuRobot/incrementalExtractFeatures.sh b/DoutuRobot/incrementalExtractFeatures.sh
@@ -0,0 +1,15 @@
+# Generate files.txt: the files under dat/jpgs that have no row in featuresall.txt yet.
+# Column 1 of featuresall.txt is turned back into a dat/jpgs/<name>.jpg path for the comparison.
+awk '{ printf("dat/jpgs/%s.jpg\n", $1); }' < ./featuresall.txt | sort > extractedFiles.txt
+find dat/jpgs -type f | sort > allFiles.txt
+comm -23 allFiles.txt extractedFiles.txt > files.txt
+rm allFiles.txt
+rm extractedFiles.txt
+# Read from stdin so wc prints only the count, not "<count> files.txt".
+lines=`wc -l < files.txt`
+echo files.txt generated, with $lines lines.
+
+# Invoke Caffe to extract features
+python3 -u ./extractFeatures.py -i files.txt -o newFeatures.txt
+# Strip the dat/jpgs/ prefix and the .jpg suffix from the path column before appending.
+sed -i 's/dat\/jpgs\///' newFeatures.txt
+sed -i 's/\.jpg\t/\t/' newFeatures.txt
+# Keep a backup of the previous feature file before it is extended.
+cp featuresall.txt featuresall.txt.bak
+cat newFeatures.txt >> featuresall.txt
diff --git a/DoutuRobot/installCaffe.sh b/DoutuRobot/installCaffe.sh
@@ -0,0 +1,19 @@
+#Dependencies
+sudo apt-get install -y libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler
+sudo apt-get install -y --no-install-recommends libboost-all-dev
+sudo apt-get install -y libatlas-base-dev
+sudo apt-get install -y python3-dev
+sudo apt-get install -y libgoogle-glog-dev liblmdb-dev
+
+# Caffe
+git clone https://github.com/BVLC/caffe
+cd caffe
+cp Makefile.config.example Makefile.config
+echo "ALSO NEED TO MODIFY THE FILE IF YOU WANT CPU_ONLY"
+read
+
+# Debian only
+echo 'INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial/' >> Makefile.config
+echo 'LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config
+
+make all -j4
diff --git a/GaTextHook.py b/GaTextHook.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+from itchat.content import *
+from ProcessInterface import ProcessInterface
+from utilities import *
+from time import time, sleep
+from threading import Timer
+from datetime import datetime
+import itchat
+import re
+import logging
+    
+def clearGaNumDict():
+    GaTextHook.gaNumDict = {}
+    client[dbName][gaCollName].remove({}, {'multi': True})
+    logging.info('GaNumDict cleared. GaNumDict = {0}.'.format(GaTextHook.gaNumDict))
+    scheduleTimerToClearGaNumDict()
+
+def scheduleTimerToClearGaNumDict():
+    t = datetime.today()
+    t2 = t.replace(day=t.day+1, hour=9, minute=0, second=0, microsecond=0) # 0:00 in China
+    deltaT = t2 - t
+    secs = deltaT.seconds + 1
+    Timer(secs, clearGaNumDict).start()
+
+# The logic is getting more complicated. We make it a separate processor
+class GaTextHook(ProcessInterface):
+    gaNumDict = {}
+    def __init__(self, blacklist=[]):
+        self.blacklist = blacklist
+        self.client = client
+        self.gaColl = self.client[dbName][gaCollName]
+        GaTextHook.gaNumDict = { x['GroupName']: x['CurrentGaNum'] for x in self.gaColl.find() }
+        self.gaNumMax = 100
+        self.triggerText = '鸭哥'
+        self.gaText = '嘎？'
+        self.forceTriggerText = '鸭哥嘎一个'
+        self.forceTriggerNextTimestamp = {}
+        self.forceTriggerInterval = 5 * 60 # 5 minutes
+        self.forceTriggerGaText = '强力嘎！'
+        scheduleTimerToClearGaNumDict()
+
+        # Set up the clear timer
+        logging.info('GaTextHook initialized.')
+
+    def process(self, msg, type):
+        if type != TEXT:
+            return
+        groupName = msg['User']['NickName']
+        toSend = None
+        if any([ re.search(x, groupName) is not None for x in self.blacklist ]):
+            return
+        if re.search(self.forceTriggerText, msg['Content']):
+            currentTime = time()
+            gaNextTime = self.forceTriggerNextTimestamp.get(groupName, 0)
+            if currentTime < gaNextTime:
+                logging.info("Don't force Ga because time {0} < NextTime {1} for group {2}.".format(currentTime, gaNextTime, groupName))
+                return;
+            self.forceTriggerNextTimestamp[groupName] = currentTime + self.forceTriggerInterval
+            toSend = self.forceTriggerGaText
+            logging.info('{0} => {1}'.format(msg['Content'], toSend))
+            itchat.send(toSend, msg['FromUserName'])
+            return
+        if re.search(self.triggerText, msg['Content']):
+            # Check the ga time
+            if groupName not in GaTextHook.gaNumDict:
+                GaTextHook.gaNumDict[groupName] = 0
+            GaTextHook.gaNumDict[groupName] += 1
+            self.gaColl.update({'GroupName': groupName}, {'$set': { 'CurrentGaNum': GaTextHook.gaNumDict[groupName] } }, upsert=True)
+            if GaTextHook.gaNumDict[groupName] > self.gaNumMax:
+                logging.info("Don't Ga because GaNum {0} exceeds max {1} for group {2}.".format(GaTextHook.gaNumDict[groupName], self.gaNumMax, groupName))
+                return
+            toSend = '{0} x{1}'.format(self.gaText, GaTextHook.gaNumDict[groupName])
+            logging.info('{0} => {1}'.format(msg['Content'], toSend))
+            itchat.send(toSend, msg['FromUserName'])
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+    hook = GaTextHook()
diff --git a/GroupTagCloud.py b/GroupTagCloud.py
@@ -96,5 +96,5 @@ def generateTmpFileName(self):
 
 if __name__ == '__main__':
     groupTagCloud = GroupTagCloud('/usr/share/fonts/truetype/wqy/wqy-microhei.ttc')
-    groupTagCloud.generateTagCloudForGroup('知乎万粉俱乐部', '鸭哥')
-    groupTagCloud.generateTagCloudForGroupV2('知乎万粉俱乐部', '鸭哥')
+    groupTagCloud.generateTagCloudForGroup('TestGroup', '鸭哥')
+    groupTagCloud.generateTagCloudForGroupV2('TestGroup', '鸭哥')
diff --git a/HistoryRecorder.py b/HistoryRecorder.py
@@ -1,14 +1,13 @@
 from time import time
 from datetime import datetime
-from pymongo import MongoClient
 from ProcessInterface import ProcessInterface
 from utilities import *
 from itchat.content import *
 import os
 
 class HistoryRecorder(ProcessInterface):
     def __init__(self):
-        self.client = MongoClient()
+        self.client = client
         self.coll = self.client[dbName][collName]
         self.imgFolder = 'HistoryImages'
         if not os.path.exists(self.imgFolder):
diff --git a/ImageSearcher.py b/ImageSearcher.py
diff --git a/README.md b/README.md
diff --git a/main.py b/main.py
diff --git a/requirements.txt b/requirements.txt
diff --git a/utilities.py b/utilities.py