-
Notifications
You must be signed in to change notification settings - Fork 3
/
NER_people.py
81 lines (60 loc) · 2.25 KB
/
NER_people.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import cPickle, numpy as np, pandas as pd , re
from collections import Counter
from polyglot.text import Text
# Load the tagged search terms; the CSV path is relative to the working directory.
tagged_search = pd.read_csv('processed_search_term_data/tagged_search_terms.csv')
# Pull the 'processed_search_term' column out as a plain Python list.
tagged_search_list = list(tagged_search['processed_search_term'])
def removePunctuation(text):
    '''
    Strip ASCII punctuation from a string and normalise it.

    Input: a string
    Output: the same string with every ASCII punctuation character removed,
    leading/trailing whitespace trimmed, and all letters lower-cased
    '''
    punctuation = frozenset('!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~\\')
    # Filter in a single pass, then trim and lower-case once; doing the
    # trim/lower-case per punctuation character would rescan the whole
    # string dozens of times without changing the result.
    kept = ''.join(ch for ch in text if ch not in punctuation)
    return kept.strip().lower()
# Flatten the whole term list into one string via the list's str() form and
# let removePunctuation drop the brackets, quotes and commas that form adds.
# NOTE(review): any escape sequences produced by that str() form lose their
# backslash here and stay in the text as stray letters/digits - confirm this
# is acceptable for the input data.
string_word = removePunctuation(str(tagged_search_list))
def people(string):
    '''
    Extract person names from text with polyglot Named Entity Recognition.

    Results are better when the search terms have already been cleaned
    and filtered for relevance.

    Input: A string of relevant search terms
    Output: a list with one cleaned (punctuation-free, lower-cased) name per
    entity tagged "I-PER"; repeated mentions are kept so callers can count them
    '''
    tagged = [
        removePunctuation(re.sub('I-PER', '', str(entity)))
        for entity in Text(string).entities
        if entity.tag == "I-PER"
    ]
    names = []
    for raw in tagged:
        # Drop repeated words within a name but keep first-seen order; a plain
        # set() would hand the words back in arbitrary order and scramble names.
        seen = set()
        words = []
        for token in raw.split(' '):
            # The first character of every token is discarded.
            # NOTE(review): presumably this removes the u prefix that Python 2
            # unicode reprs leave behind - confirm against str(entity) output.
            word = token[1:]
            if word not in seen:
                seen.add(word)
                words.append(word)
        names.append(' '.join(words))
    return names
# Tally every recognised person name, ordered from most to least frequent.
string = people(string_word)
S = Counter(string).most_common()
# Split the (name, count) pairs into two parallel lists.
name = [pair[0] for pair in S]
count = [pair[1] for pair in S]
def LOC(string):
    '''
    Extract location names from text with polyglot Named Entity Recognition.

    Results are better when the search terms have already been cleaned
    and filtered for relevance.

    Input: A string of relevant search terms
    Output: a list with one cleaned (punctuation-free, lower-cased) location
    per entity tagged "I-LOC"; repeated mentions are kept so callers can
    count them
    '''
    tagged = [
        removePunctuation(re.sub('I-LOC', '', str(entity)))
        for entity in Text(string).entities
        if entity.tag == "I-LOC"
    ]
    locations = []
    for raw in tagged:
        # Drop repeated words within a location but keep first-seen order; a
        # plain set() would hand the words back in arbitrary order.
        seen = set()
        words = []
        for token in raw.split(' '):
            # The first character of every token is discarded.
            # NOTE(review): presumably this removes the u prefix that Python 2
            # unicode reprs leave behind - confirm against str(entity) output.
            word = token[1:]
            if word not in seen:
                seen.add(word)
                words.append(word)
        locations.append(' '.join(words))
    return locations
# Tally every recognised location, ordered from most to least frequent.
Location = LOC(string_word)
Location_counter = Counter(Location).most_common()
# Split the (location, count) pairs into two parallel lists.
loc = [pair[0] for pair in Location_counter]
cnt = [pair[1] for pair in Location_counter]

# One two-column table per entity type.
table1 = pd.DataFrame(columns=['Name','Name_Count'])
table2 = pd.DataFrame(columns=['Location','Location_Count'])
table1['Name'] = name
# Fill the declared 'Name_Count' column. Writing the counts to a separate
# 'Count' column would leave 'Name_Count' empty in the exported CSV and make
# the people table inconsistent with the location table.
table1['Name_Count'] = count
table2['Location'] = loc
table2['Location_Count'] = cnt

# Export both tables to the working directory (the row index is written too).
table1.to_csv('people_count.csv')
table2.to_csv('location_count.csv')
# Reference: polyglot Named Entity Recognition documentation
# http://polyglot.readthedocs.io/en/latest/NamedEntityRecognition.html