update

2025-06-07 20:27:09 +00:00 · 2021-02-11 13:34:23 -05:00 · 2021-02-11 13:34:23 -05:00 · fa7961e00d
commit fa7961e00d
6 changed files with 1440 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+/misc/*
+*.csv
--- a/extract2.py
+++ b/extract2.py
@ -0,0 +1,215 @@
+import csv
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Relative paths of the recording folders to process.  Each entry is passed
+# to construct_food_no_food(), which expects the folder to contain
+# label.csv and clarify_result.csv.  Entries are grouped by recording
+# session; the group sizes match the `section` list in the __main__ block
+# (10, 10, 10, 6, 10, 9, 8, 9, 8 = 80 folders).
+# NOTE(review): the trailing number of each path is presumably the hour of
+# the recording -- confirm against the data layout.
+folder_list = [
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/7',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/8',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/9',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/10',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/11',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/12',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/13',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/14',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/15',
+    'PHASE3_HH01_T2_EButtom-402/eButton_Data/Camera/ID0402_Nov.27/18',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/8',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/9',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/10',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/11',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/12',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/13',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/14',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/15',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/16',
+    'PHASE3_HH01_T2_EButtom_411-Mother/eButton_Data/Camera/ID0411_Nov.27/17',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/8',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/9',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/10',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/12',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/13',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/14',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/15',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/16',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/17',
+    'PHASE3_HH02_T2_eButton-402_Mother/eButton_Data/Camera/ID0402_Nov.28/18',
+    'PHASE3_HH02_T2_eButton-411_Adolescent_child/eButton_Data/Camera/ID0411_Nov.28/10',
+    'PHASE3_HH02_T2_eButton-411_Adolescent_child/eButton_Data/Camera/ID0411_Nov.28/13',
+    'PHASE3_HH02_T2_eButton-411_Adolescent_child/eButton_Data/Camera/ID0411_Nov.28/15',
+    'PHASE3_HH02_T2_eButton-411_Adolescent_child/eButton_Data/Camera/ID0411_Nov.28/16',
+    'PHASE3_HH02_T2_eButton-411_Adolescent_child/eButton_Data/Camera/ID0411_Nov.28/17',
+    'PHASE3_HH02_T2_eButton-411_Adolescent_child/eButton_Data/Camera/ID0411_Nov.28/18',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/7',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/8',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/9',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/10',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/11',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/12',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/13',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/14',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/15',
+    'PHASE3_HH02_T4-eButton-411_Mother/eButton_Data/Camera/ID0411_Dec.02/16',
+
+    # new data
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/9',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/10',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/11',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/12',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/13',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/14',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/15',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/16',
+    'PHASE3_HH03_eButton-402_Father/eButton_Data/Camera/ID0402_Nov.30/17',
+
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/8',
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/9',
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/10',
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/13',
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/14',
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/15',
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/16',
+    'PHASE3_HH03_eButton-411_Mother/eButton_Data/Camera/ID0411_Nov.30/17',
+
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/8',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/9',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/10',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/11',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/12',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/13',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/14',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/15',
+    'PHASE3_HH03_T4_eBUTTON_402-ADOLESCENT_BOY/eButton_Data/Camera/ID0402_Dec.03/16',
+
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/7',
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/8',
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/9',
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/10',
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/11',
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/12',
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/13',
+    'PHASE3_HH05_eButton-402_Father/eButton_Data/Camera/ID0402_Dec.05/14',
+
+]
+
+
+from shutil import copyfile
+import time
+
+def construct_vector(folder_path):
+    label_file_name = "label.csv"
+    clarify_result_name = "clarify_result.csv"
+    label_filepath = os.path.join(folder_path, label_file_name)
+    clarify_result_filepath = os.path.join(folder_path, clarify_result_name)
+    if not os.path.exists(label_filepath) and not os.path.exists(
+            clarify_result_filepath):
+        print('no label file and clarify result file')
+    vector_x, vector_y, vector_time = [], [], []
+    label_list = []
+    clarify_list = []
+    with open(label_filepath) as f:
+        label_reader = csv.reader(f, delimiter=',')
+        for row in label_reader:
+            label_list.append(row)
+    with open(clarify_result_filepath) as f:
+        clarify_reader = csv.reader(f, delimiter=',')
+        for row in clarify_reader:
+            clarify_list.append(row)
+    for i in range(len(label_list)):
+        for j in range(len(label_list[i])):
+            label_list[i][j] = label_list[i][j].strip()
+    for i in range(len(clarify_list)):
+        for j in range(len(clarify_list[i])):
+            clarify_list[i][j] = clarify_list[i][j].strip()
+    food_name_list = []
+    no_food_name_list = []
+    
+    food_rectify = []
+    with open("./food_rectify.csv") as f:
+        for line in f:
+            food_rectify.append(line.strip()+'.jpg') 
+
+    for i in clarify_list:
+        for j in label_list:
+            if os.path.basename(i[0]) in j:
+                vector_time.append(j[0])
+                print(j[1])
+                tmp_with_name = [j[1]]
+                tmp_with_name += i[1:]
+                if tmp_with_name[0] in food_rectify:
+                    vector_y.append(1)
+                    # vector_x.append(i[1:])
+                    vector_x.append(tmp_with_name)
+                    food_name_list.append(j[1])
+                else:
+                    if int(j[2]) >= 3:  # 3 and 4 recognized as food
+                        vector_y.append(1)
+                        # vector_x.append(i[1:])
+                        vector_x.append(tmp_with_name)
+                        food_name_list.append(j[1])
+                    else:
+                        vector_y.append(0)
+                        # vector_x.append(i[1:])
+                        vector_x.append(tmp_with_name)
+                        no_food_name_list.append(j[1])
+
+    t = time.time()
+    for root, dirs, files in os.walk('./', topdown=False):
+        for name in files:
+            if name in food_name_list:
+                src = os.path.join(root,name)
+                dst = os.path.join('../food_detection_data/food', name)
+                # if os.path.isfile(dst):
+                #     dst = os.path.join('./food/' , str(int(t)) + name)
+                copyfile(src,dst)
+
+                print(src)
+                print(dst)
+            if name in no_food_name_list:
+                src = os.path.join(root,name)
+                dst = os.path.join('../food_detection_data/no_food', name)
+                # if os.path.isfile(dst):
+                #     dst = os.path.join('./no_food/' ,str(int(t)) + name)
+                print(src)
+                print(dst)
+                copyfile(src,dst)
+
+    return vector_x, vector_y, vector_time
+
+
+def construct_food_no_food(folder_name):
+    vector_x, vector_y, _ = construct_vector(folder_name)
+    # print(_)
+    food_csv = 'food.csv'
+    no_food_csv = 'no_food.csv'
+    food_file = open(food_csv, 'a')
+    no_food_file = open(no_food_csv, 'a')
+    for i in range(len(vector_y)):
+        if vector_y[i] == 1:
+            food_file.write(','.join(vector_x[i]))
+            food_file.write('\n')
+        else:
+            no_food_file.write(','.join(vector_x[i]))
+            no_food_file.write('\n')
+    food_file.close()
+    no_food_file.close()
+
+
+if __name__ == '__main__':
+    section = [10, 10, 10, 6, 10, 9, 8, 9, 8]
+    # for i in range(10):
+    #     vector_x, vector_y, _ = construct_vector(folder_list[i])
+    #     tmp_vector_y = [str(i) for i in vector_y]
+    #     print(' '.join((tmp_vector_y)))
+    #     plt.scatter(range(len(vector_y)), vector_y, s=0.5)
+    #     plt.show()
+    # for i in range(0, 5):
+    #     vector_x, vector_y, vector_time = construct_vector(folder_list[i])
+    #     vector_y = list(map(str, vector_y))
+    #     print(' '.join(vector_y))
+
+    # second = [i for i in range(40)] + [i for i in range(36,46)]
+    # print(second)
+    # for i in second:
+    for i in range(0, sum(section)):
+       construct_food_no_food(folder_list[i])
--- a/get_FN_FP.py
+++ b/get_FN_FP.py
@ -0,0 +1,31 @@
+import shutil
+import os
+
+FP, FN = [], []
+with open('FP.txt') as f:
+    for row in f:
+        FP.append(row.strip())
+with open('FN.txt') as f:
+    for row in f:
+        FN.append(row.strip())
+FP = list(set(FP))
+FN = list(set(FN))
+
+
+for root, dirs, files in os.walk('.'):
+    for f in files:
+        if f in FP:
+            src = os.path.join(root, f)
+            dst = './FP/' + f
+            if not os.path.isfile(dst):
+                shutil.copyfile(src, dst)
+
+        if f in FN:
+            src = os.path.join(root, f)
+            dst = os.path.join('./FN/', f)
+            if not os.path.isfile(dst):
+                shutil.copyfile(src, dst)
+
+
+# print(os.path.join(root, name))
+# shutil.copyfile(src, dst, *, follow_symlinks=True)
--- a/get_dataset.ipynb
+++ b/get_dataset.ipynb
@ -0,0 +1,113 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import csv\n",
+    "from sklearn.utils import shuffle\n",
+    "food_raw_data, non_food_raw_data, food_rectify = [], [], []\n",
+    "food_data, non_food_data = [], []\n",
+    "\n",
+    "with open(\"food_rectify.csv\") as f:\n",
+    "    reader = csv.reader(f)\n",
+    "    for row in reader:\n",
+    "        food_rectify.append(row)\n",
+    "\n",
+    "with open(\"food.csv\") as f:\n",
+    "    reader = csv.reader(f)\n",
+    "    for row in reader:\n",
+    "        food_raw_data.append(row)\n",
+    "        \n",
+    "with open(\"no_food.csv\") as f:\n",
+    "    reader = csv.reader(f)\n",
+    "    for row in reader:\n",
+    "        non_food_raw_data.append(row)\n",
+    "     \n",
+    "food_data = food_raw_data\n",
+    "\n",
+    "for i in non_food_raw_data:\n",
+    "    if i[0] not in food_rectify:\n",
+    "        non_food_data.append(i)\n",
+    "    else:\n",
+    "        food_data.append(i)\n",
+    "\n",
+    "food_data = shuffle(food_data)\n",
+    "non_food_data = shuffle(non_food_data)\n",
+    "\n",
+    "ratio = 0.75   \n",
+    "train_food_len = int(len(food_data) * ratio)\n",
+    "train_non_food_len = train_food_len\n",
+    "\n",
+    "test_food_len = len(food_data) - train_food_len\n",
+    "test_non_food_len = int(len(non_food_data) * (1 - ratio))\n",
+    "\n",
+    "\n",
+    "train_food = food_data[0:train_food_len]\n",
+    "test_food = food_data[train_food_len:train_food_len + test_food_len]\n",
+    "\n",
+    "train_non_food = non_food_data[0:train_non_food_len]\n",
+    "test_non_food = non_food_data[train_non_food_len:train_non_food_len + test_non_food_len]\n",
+    "\n",
+    "with open('train_food.csv', 'w') as f:\n",
+    "    write = csv.writer(f)\n",
+    "    write.writerows(train_food)\n",
+    "    \n",
+    "with open('train_non_food.csv', 'w') as f:\n",
+    "    write = csv.writer(f)\n",
+    "    write.writerows(train_non_food )\n",
+    "\n",
+    "with open('test_food.csv', 'w') as f:\n",
+    "    write = csv.writer(f)\n",
+    "    write.writerows(test_food )\n",
+    "with open('test_non_food.csv', 'w') as f:\n",
+    "    write = csv.writer(f)\n",
+    "    write.writerows(test_non_food)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "a = [i for i in range(10)]\n",
+    "print(a)\n",
+    "print(a[0:4])\n",
+    "print(a[4:7])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "non_food_raw_data"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/svm3_nb.ipynb
+++ b/svm3_nb.ipynb
--- a/svm_nb_funcs.py
+++ b/svm_nb_funcs.py
@ -0,0 +1,351 @@
+from sklearn import svm
+import csv
+from sklearn.utils import shuffle
+
+from sklearn.metrics import classification_report
+from sklearn.linear_model import LinearRegression
+from sklearn.linear_model import LogisticRegression
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Module-level state shared by the functions below (they access it through
+# `global` statements); init() runs the functions that fill it.
+
+# Input CSV files and the rows read from them by get_raw_data().
+food_file_path, non_food_file_path = 'food.csv', 'no_food.csv'
+# food_file_path2, non_food_file_path2 = 'food2.csv', 'no_food2.csv'
+food_raw_data, non_food_raw_data = [], []
+# food_raw_data2, non_food_raw_data2 = [], []
+
+# Train / test portions of the raw rows, set by div_train_test_raw_data().
+train_raw_food, train_raw_non_food = [], []
+test_raw_food, test_raw_non_food = [], []
+
+# Feature vectors and labels built by construct_train_test_set();
+# train_vector_num holds 1 for food rows and -1 for non-food rows.
+train_vector_x, train_vector_y, train_vector_num = [], [], []
+test_vector_x, test_vector_y = [], []
+
+# Tags selected by get_use_tag() and the tag -> vector index mapping.
+useful_tag_list, useful_dict = [], {}
+
+# Per-tag weight computed by get_correlation().
+correlation_dict = {}
+
+# Share of food rows in the training rows, computed in init().
+p_food = 0
+
+
+def get_raw_data():
+    global food_raw_data, non_food_raw_data
+
+    with open(food_file_path) as f:
+        csv_reader = csv.reader(f)
+        for row in csv_reader:
+            food_raw_data.append(row)
+    with open(non_food_file_path) as f:
+        csv_reader = csv.reader(f)
+        for row in csv_reader:
+            non_food_raw_data.append(row)
+
+    # with open(food_file_path2) as f:
+    #     csv_reader = csv.reader(f)
+    #     for row in csv_reader:
+    #         food_raw_data2.append(row)
+    # with open(non_food_file_path2) as f:
+    #     csv_reader = csv.reader(f)
+    #     for row in csv_reader:
+    #         non_food_raw_data2.append(row)
+
+
+def shuffle_raw_data():
+    global food_raw_data, non_food_raw_data
+    # non_food_raw_data = non_food_raw_data[:15000]
+    # non_food_raw_data = non_food_raw_data[:len(food_raw_data)]
+    food_raw_data = shuffle(food_raw_data)
+    non_food_raw_data = shuffle(non_food_raw_data)
+    # non_food_raw_data = non_food_raw_data[:15000]
+    non_food_raw_data = non_food_raw_data[:len(food_raw_data)]
+    # non_food_raw_data = non_food_raw_data[00000]
+
+    
+
+def div_train_test_raw_data(ratio=0.75):
+    global food_raw_data, non_food_raw_data, train_raw_food, \
+        train_raw_non_food, test_raw_food, test_raw_non_food, \
+        food_raw_data2, non_food_raw_data2
+    # remove some non_food_raw_data
+    # non_food_raw_data = non_food_raw_data[:10000]
+    train_food_len = int(len(food_raw_data) * ratio)
+    train_non_food_len = int(len(non_food_raw_data) * ratio)
+    train_raw_food = food_raw_data[0:train_food_len]
+    train_raw_non_food = non_food_raw_data[0:train_non_food_len]
+    test_raw_food = food_raw_data[train_food_len:]
+    test_raw_non_food = non_food_raw_data[train_non_food_len:]
+
+    # train_raw_food = food_raw_data
+    # test_raw_food = food_raw_data2
+
+    # train_raw_non_food = non_food_raw_data
+    # test_raw_non_food = non_food_raw_data2
+
+def save_raw_data_train_test():
+    global food_raw_data, non_food_raw_data, train_raw_food, \
+        train_raw_non_food, test_raw_food, test_raw_non_food
+    with open('train_food.csv', 'w') as f:
+        write = csv.writer(f)
+        write.writerows(train_raw_food)
+    with open('train_non_food.csv', 'w') as f:
+        write = csv.writer(f)
+        write.writerows(train_raw_non_food )
+
+    with open('test_food.csv', 'w') as f:
+        write = csv.writer(f)
+        write.writerows(test_raw_food )
+    with open('test_non_food.csv', 'w') as f:
+        write = csv.writer(f)
+        write.writerows(test_raw_non_food)
+
+    for i in train_raw_food:
+        i = i[1:]
+    for i in train_raw_non_food:
+        i = i[1:]
+    for i in test_raw_food:
+        i = i[1:]
+    for i in test_raw_non_food:
+        i = i[1:]
+    print(len(train_raw_food))
+
+
+    
+
+
+def count_dict(raw_data, threshold=0.5):
+    counter_dict = {}  # only collect from train data
+    for i in raw_data:
+        for j in range(0, len(i) - 1, 2):
+            tmp = str(i[j]).strip()
+            if float(i[j + 1]) > threshold:
+                if tmp not in counter_dict:
+                    counter_dict[tmp] = 1
+                else:
+                    counter_dict[tmp] += 1
+            else:
+                # if tmp not in counter_dict:
+                #     counter_dict[tmp] = 0
+                pass
+    return counter_dict
+
+
+def get_use_tag(use_all=False, threshold=0.5):
+    global useful_tag_list, food_raw_data, non_food_raw_data, useful_dict
+    useful_tag_list, useful_dict = [], {}
+    food_tag_dict = count_dict(train_raw_food)
+    non_food_tag_dict = count_dict(train_raw_non_food)
+
+    if use_all:
+        for i in non_food_tag_dict.keys():
+            if i not in food_tag_dict.keys():
+                food_tag_dict[i] = non_food_tag_dict[i]
+            else:
+                food_tag_dict[i] += non_food_tag_dict[i]
+        # food_tag_dict.update(non_food_tag_dict)
+
+    appear_times = 0
+    appear_list = []
+    for i in food_tag_dict.keys():
+        appear_times += food_tag_dict[i]
+        appear_list.append(food_tag_dict[i])
+    appear_list.sort(reverse=True)
+    useful_bound = int(appear_times * threshold)
+    bound = 0
+    pre_sum = 0
+    for i in range(len(appear_list)):
+        pre_sum += appear_list[i]
+        if pre_sum > useful_bound:
+            bound = appear_list[i]
+            break
+    for i in food_tag_dict.keys():
+        if food_tag_dict[i] > bound:
+            useful_tag_list.append(i)
+    counter = 0
+    for i in useful_tag_list:
+        useful_dict[i] = counter
+        counter += 1
+
+
+def get_correlation():
+    global train_raw_food, correlation_dict
+    food_tag_dict = count_dict(train_raw_food)
+    merged_dict = count_dict(train_raw_non_food)
+
+    for i in food_tag_dict.keys():
+        if i not in merged_dict.keys():
+            merged_dict[i] = food_tag_dict[i]
+        else:
+            merged_dict[i] += food_tag_dict[i]
+
+    for i in food_tag_dict.keys():
+        if i not in correlation_dict.keys():
+            correlation_dict[i] = food_tag_dict[i] / len(food_raw_data)
+            # correlation_dict[i] = food_tag_dict[i] / merged_dict[i]
+        else:
+            print("error in get correlation function")
+
+
+def construct_train_test_set():
+    global train_raw_food, train_raw_non_food, test_raw_food, \
+        test_raw_non_food, train_vector_x, train_vector_y, \
+        test_vector_x, test_vector_y, train_vector_num
+    train_vector_x, train_vector_y, train_vector_num = [], [], []
+    test_vector_x, test_vector_y = [], []
+    vector_x = []
+    vector_y = []
+    for i in train_raw_food:
+        tmp = [0 for i in range(len(useful_tag_list))]
+        for j in range(0, len(i) - 1, 2):
+            if i[j] in useful_dict.keys():
+                tmp[useful_dict[i[j]]] = float(
+                    i[j + 1]) * correlation_dict[i[j]] + p_food * (1 - float(i[j + 1]))
+            else:
+                # TODO: should be changed to random probability
+                pass
+        vector_x.append(tmp)
+        vector_y.append("food")
+        train_vector_num.append(1)
+
+    for i in train_raw_non_food:
+        tmp = [0 for i in range(len(useful_tag_list))]
+        for j in range(0, len(i) - 1, 2):
+            if i[j] in useful_dict.keys():
+                tmp[useful_dict[i[j]]] = float(
+                    i[j + 1]) * correlation_dict[i[j]] + p_food * (1 - float(i[j + 1]))
+            else:
+                # TODO: should be changed to random probability
+                pass
+        vector_x.append(tmp)
+        vector_y.append("no food")
+        train_vector_num.append(-1)
+
+    train_vector_x, train_vector_y = vector_x, vector_y
+
+    vector_x, vector_y = [], []
+
+    for i in test_raw_food:
+        tmp = [0 for i in range(len(useful_tag_list))]
+        for j in range(0, len(i) - 1, 2):
+            if i[j] in useful_dict.keys():
+                tmp[useful_dict[i[j]]] = float(
+                    i[j + 1]) * correlation_dict[i[j]] + p_food * (1 - float(i[j + 1]))
+            else:
+                # TODO: should be changed to random probability
+                pass
+        vector_x.append(tmp)
+        vector_y.append("food")
+
+    for i in test_raw_non_food:
+        tmp = [0 for i in range(len(useful_tag_list))]
+        for j in range(0, len(i) - 1, 2):
+            if i[j] in useful_dict.keys():
+                tmp[useful_dict[i[j]]] = float(
+                    i[j + 1]) * correlation_dict[i[j]] + p_food * (1 - float(i[j + 1]))
+            else:
+                # TODO: should be changed to random probability
+                pass
+        vector_x.append(tmp)
+        vector_y.append("no food")
+    test_vector_x, test_vector_y = vector_x, vector_y
+
+
+def confision_matrix(ground_true, predict, print_result=False):
+    TP, FP, FN, TN = 0, 0, 0, 0
+    for i in range(len(ground_true)):
+        if ground_true[i] == "food" and predict[i] == "food":
+            TP += 1
+        elif ground_true[i] == "no food" and predict[i] == "food":
+            FP += 1
+        elif ground_true[i] == "food" and predict[i] == "no food":
+            FN += 1
+        elif ground_true[i] == "no food" and predict[i] == "no food":
+            TN += 1
+    TPR = TP / (TP + FN)
+    FPR = FP / (FP + TN)
+    if print_result:
+        print("TP: ", TP, "FN: ", FN, "TN: ", TN, "FP :", FP)
+    # print("Sensitivity = ", TP/(TP+FN), end="   ")
+    # print("Specificity = ", TN/(TN+FP))
+    # print("Precision = ", TP/(TP+FP), end="   ")
+    # print("Accuracy = ", (TP + TN)/(TP+TN+FN+FP))
+    return TPR, FPR
+
+
+def clarifai_result():
+    global test_raw_food, test_raw_non_food
+    TPR_list = []
+    FPR_list = []
+    for k in range(10):
+        TP, FP, FN, TN = 0, 0, 0, 0
+        ratio = k / 10
+        # print(ratio)
+        for i in test_raw_food:
+            have = False
+            for j in range(len(i)):
+                if i[j] == "food" and float(i[j + 1]) > ratio:
+                    have = True
+            if not have:
+                FN += 1
+            else:
+                TP += 1
+
+        for i in test_raw_non_food:
+            have = False
+            for j in range(len(i)):
+                if i[j] == "food" and float(i[j + 1]) > ratio:
+                    have = True
+            if not have:
+                TN += 1
+            else:
+                FP += 1
+
+        TPR = TP / (TP + FN)
+        FPR = FP / (FP + TN)
+        TPR_list.append(TPR)
+        FPR_list.append(FPR)
+        if k == 9 or k == 8 or k == 7 or k ==6:
+        # if k == 7 or k == 6 or k == 5 or k == 4:
+            plt.scatter([FPR], [TPR], marker='o', c='green')
+        # print("TRP :", TPR)
+        # print("FPR :", FPR)
+    return TPR_list, FPR_list
+        #
+    #     print("clarify result**********")
+    #     print("TP: ", TP, "FN: ", FN, "TN: ", TN, "FP :", FP)
+    #     print("Sensitivity = ", TP / (TP + FN), end="   ")
+    #     print("Specificity = ", TN / (TN + FP))
+    #     print("Precision = ", TP / (TP + FP), end="   ")
+    #     print("Accuracy = ", (TP + TN) / (TP + TN + FN + FP))
+    #     print("burden = ", (TP + FP) / (TP +TN+FN+FP))
+    # plt.scatter([1 - 0.789866667, 1 - 0.684, 1 - 0.55786, 1-0.4512], [0.584493042, 0.666003976, 0.753479125, 0.833664679], marker='o', c='green')
+
+
+def get_p_food_before_balance():
+    # food_num_ori = 0
+    # with open("#food_ori.csv") as f:
+    #     csv_reader = csv.reader(f)
+    #     for row in csv_reader:
+    #         food_num_ori += 1
+
+    # no_food_num_ori = 0
+    # with open("#no food_ori.csv") as f:
+    #     csv_reader = csv.reader(f)
+    #     for row in csv_reader:
+    #         no_food_num_ori += 1
+    # p_food = food_num_ori / no_food_num_ori
+    # return p_food
+    # p_food = len(train_raw_food) / (len(train_raw_food) + len(train_raw_non_food))
+    # return p_food
+    pass
+
+
+def init(use_all=True):
+    global p_food, train_raw_food, train_raw_non_food
+    get_raw_data()
+    shuffle_raw_data()
+    div_train_test_raw_data(0.75)
+    save_raw_data_train_test()
+    clarifai_result()
+    p_food = len(train_raw_food) / (len(train_raw_food) + len(train_raw_non_food))
+    get_correlation()
+    get_use_tag(use_all)
+    construct_train_test_set()
+