From 675e9b31750b36eb738737143ab8662419e584d6 Mon Sep 17 00:00:00 2001 From: ianbstewart Date: Mon, 28 May 2018 18:46:43 +0200 Subject: [PATCH] testing basic interest comparison --- .../check_interest_id_valid.py | 5 +- .../compare_top_interests.ipynb | 3700 +++++++++++++++++ .../get_top_k_interest_query.py | 5 +- src/data_processing/mine_facebook_audience.py | 11 +- 4 files changed, 3712 insertions(+), 9 deletions(-) diff --git a/src/data_processing/check_interest_id_valid.py b/src/data_processing/check_interest_id_valid.py index faadaae..a545dac 100644 --- a/src/data_processing/check_interest_id_valid.py +++ b/src/data_processing/check_interest_id_valid.py @@ -13,6 +13,7 @@ import logging import os import math +import sys ## suppress request INFO messages logging.getLogger("requests").setLevel(logging.WARNING) @@ -142,6 +143,8 @@ def interest_name_query_batch(access_token, user_id, interest_ids): print('rate limit reached at id=%d, sleeping for %d seconds'%(interest_id, RATE_LIMIT_SLEEP_TIME)) sleep(RATE_LIMIT_SLEEP_TIME) success = True + ## try to restart program to dodge rate limit +# os.execl(sys.executable, sys.executable, *sys.argv) else: response_data = response_json['targetingsentencelines'] response_data_matches = filter(lambda x: x['content']=='People Who Match:' or x['content']=='And Must Also Match:', @@ -232,7 +235,7 @@ def main(): if(len(response_names_i) < interest_names_i): fixed_names_i = ['NA' if x not in set(response_names_i) else x for x in interest_names_i] else: - fixed_names_i = list(interest_names_i) + fixed_names_i = list(response_names_i) # print('%d/%d fixed names %s'%(len(response_names_i), len(interest_names_i), fixed_names_i)) ## check for missing names # if(len(response_names_i) != len(fixed_names_i) or any([name_i=='' for name_i in response_names_i])): diff --git a/src/data_processing/compare_top_interests.ipynb b/src/data_processing/compare_top_interests.ipynb index a295b6d..fda2bae 100644 --- 
a/src/data_processing/compare_top_interests.ipynb +++ b/src/data_processing/compare_top_interests.ipynb @@ -956,6 +956,3706 @@ " l_file = '../../data/query_results/%s_top_%d_%s.csv'%(l, top_k, audience_var)\n", " l_data_k.loc[:, ['interest_name', audience_var]].to_csv(l_file, sep=',', index=False, encoding='utf-8')" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare Ex-pat interests\n", + "We've now mined the top 3000 interests for Hispanic Mexican ex-pats living in the US, so let's see how those stack up against native US Americans." + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original data has 3000 rows\n", + "clean data has 2100 rows\n", + "2100 results total\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interest_idinterest_namelocationages_rangesbehaviordau_audiencemau_audience
06003349442621EntertainmentUS{u'max': 65, u'min': 18}{u'and': [6023676072183], u'or': [600313321237...2619359934000000
16003142505790FacebookUS{u'max': 65, u'min': 18}{u'and': [6023676072183], u'or': [600313321237...2296941130000000
26003342621987Social networkUS{u'max': 65, u'min': 18}{u'and': [6023676072183], u'or': [600313321237...2202839929000000
36003167425934Shopping and fashionUS{u'max': 65, u'min': 18}{u'and': [6023676072183], u'or': [600313321237...2585599932000000
46003985771306TechnologyUS{u'max': 65, u'min': 18}{u'and': [6023676072183], u'or': [600313321237...2585599932000000
\n", + "
" + ], + "text/plain": [ + " interest_id interest_name location ages_ranges \\\n", + "0 6003349442621 Entertainment US {u'max': 65, u'min': 18} \n", + "1 6003142505790 Facebook US {u'max': 65, u'min': 18} \n", + "2 6003342621987 Social network US {u'max': 65, u'min': 18} \n", + "3 6003167425934 Shopping and fashion US {u'max': 65, u'min': 18} \n", + "4 6003985771306 Technology US {u'max': 65, u'min': 18} \n", + "\n", + " behavior dau_audience \\\n", + "0 {u'and': [6023676072183], u'or': [600313321237... 26193599 \n", + "1 {u'and': [6023676072183], u'or': [600313321237... 22969411 \n", + "2 {u'and': [6023676072183], u'or': [600313321237... 22028399 \n", + "3 {u'and': [6023676072183], u'or': [600313321237... 25855999 \n", + "4 {u'and': [6023676072183], u'or': [600313321237... 25855999 \n", + "\n", + " mau_audience \n", + "0 34000000 \n", + "1 30000000 \n", + "2 29000000 \n", + "3 32000000 \n", + "4 32000000 " + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from ast import literal_eval\n", + "expat_interests = pd.read_csv('../../dataframe_collecting_1527418768.csv', sep=',', index_col=0)\n", + "expat_interests = clean_interest_data(expat_interests)\n", + "print('%d results total'%(expat_interests.shape[0]))\n", + "expat_interests.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interest_namemau_audience
524New Tang Dynasty Television38000000
468A.N.S.W.E.R.38000000
705Canadian Albums Chart38000000
1740Province38000000
1756Act-i-vate38000000
1833Indian people38000000
1845Message38000000
1866Suicide awareness38000000
562Allah38000000
558Dieting38000000
530Wide Open West38000000
1193Ton38000000
1919Conservatism38000000
441Realidade38000000
2082Lakh38000000
2005Lady38000000
2018Egyptians38000000
2054Hispanic and latino american culture38000000
331Lewis and Clark-class dry cargo ship38000000
325Entreprise38000000
305Islam38000000
279Fatigue (medical)38000000
273EveR38000000
266Christianity38000000
242Gyms38000000
230Muka38000000
1739Stop consonant38000000
709Mystery meat navigation38000000
718Zumba38000000
739Acne vulgaris38000000
1350family planning38000000
994Hotline38000000
1386Obesity awareness38000000
1391Mosque38000000
1402Communist Party USA38000000
1414Ampere-hour38000000
952Screenshot38000000
1420Securite38000000
1282Arabic language38000000
1458Entity38000000
1469Inflammation38000000
1265List of districts of Turkey38000000
892Canadian Hot 10038000000
1496Bharatiya Janata Party38000000
884Muhammad38000000
862Sin38000000
861Quran38000000
1091Hashtag38000000
841Ultra-prominent peak38000000
1581Infection38000000
809Hiking38000000
1637Ramadan38000000
751Addiction38000000
175Nutrition38000000
0Entertainment34000000
5Hobbies and activities33000000
10Business and industry32000000
3Shopping and fashion32000000
4Technology32000000
16Music31000000
8Sports and outdoors30000000
15Food and drink30000000
1Facebook30000000
9Sports30000000
37Entre Rios Province30000000
13Family and relationships30000000
2Social network29000000
11Consumer electronics29000000
18Food29000000
6Instant messaging28000000
7Facebook Messenger28000000
21Family27000000
12Shopping27000000
22Reading27000000
17Games27000000
28Arts and music26000000
23Love26000000
41Business26000000
19Movies26000000
20Travel26000000
24Televisions26000000
30Education24000000
40Time24000000
36Fitness and wellness24000000
34Vehicles24000000
45TV24000000
31Beauty24000000
39Automobiles23000000
26Video games23000000
29Clothing23000000
38Life23000000
32Friendship23000000
43Finance22000000
35Instagram22000000
94Product (business)21000000
51Sales21000000
97United States21000000
33Online shopping21000000
49Live events21000000
25Computers20000000
\n", + "
" + ], + "text/plain": [ + " interest_name mau_audience\n", + "524 New Tang Dynasty Television 38000000\n", + "468 A.N.S.W.E.R. 38000000\n", + "705 Canadian Albums Chart 38000000\n", + "1740 Province 38000000\n", + "1756 Act-i-vate 38000000\n", + "1833 Indian people 38000000\n", + "1845 Message 38000000\n", + "1866 Suicide awareness 38000000\n", + "562 Allah 38000000\n", + "558 Dieting 38000000\n", + "530 Wide Open West 38000000\n", + "1193 Ton 38000000\n", + "1919 Conservatism 38000000\n", + "441 Realidade 38000000\n", + "2082 Lakh 38000000\n", + "2005 Lady 38000000\n", + "2018 Egyptians 38000000\n", + "2054 Hispanic and latino american culture 38000000\n", + "331 Lewis and Clark-class dry cargo ship 38000000\n", + "325 Entreprise 38000000\n", + "305 Islam 38000000\n", + "279 Fatigue (medical) 38000000\n", + "273 EveR 38000000\n", + "266 Christianity 38000000\n", + "242 Gyms 38000000\n", + "230 Muka 38000000\n", + "1739 Stop consonant 38000000\n", + "709 Mystery meat navigation 38000000\n", + "718 Zumba 38000000\n", + "739 Acne vulgaris 38000000\n", + "1350 family planning 38000000\n", + "994 Hotline 38000000\n", + "1386 Obesity awareness 38000000\n", + "1391 Mosque 38000000\n", + "1402 Communist Party USA 38000000\n", + "1414 Ampere-hour 38000000\n", + "952 Screenshot 38000000\n", + "1420 Securite 38000000\n", + "1282 Arabic language 38000000\n", + "1458 Entity 38000000\n", + "1469 Inflammation 38000000\n", + "1265 List of districts of Turkey 38000000\n", + "892 Canadian Hot 100 38000000\n", + "1496 Bharatiya Janata Party 38000000\n", + "884 Muhammad 38000000\n", + "862 Sin 38000000\n", + "861 Quran 38000000\n", + "1091 Hashtag 38000000\n", + "841 Ultra-prominent peak 38000000\n", + "1581 Infection 38000000\n", + "809 Hiking 38000000\n", + "1637 Ramadan 38000000\n", + "751 Addiction 38000000\n", + "175 Nutrition 38000000\n", + "0 Entertainment 34000000\n", + "5 Hobbies and activities 33000000\n", + "10 Business and industry 32000000\n", + "3 Shopping and fashion 
32000000\n", + "4 Technology 32000000\n", + "16 Music 31000000\n", + "8 Sports and outdoors 30000000\n", + "15 Food and drink 30000000\n", + "1 Facebook 30000000\n", + "9 Sports 30000000\n", + "37 Entre Rios Province 30000000\n", + "13 Family and relationships 30000000\n", + "2 Social network 29000000\n", + "11 Consumer electronics 29000000\n", + "18 Food 29000000\n", + "6 Instant messaging 28000000\n", + "7 Facebook Messenger 28000000\n", + "21 Family 27000000\n", + "12 Shopping 27000000\n", + "22 Reading 27000000\n", + "17 Games 27000000\n", + "28 Arts and music 26000000\n", + "23 Love 26000000\n", + "41 Business 26000000\n", + "19 Movies 26000000\n", + "20 Travel 26000000\n", + "24 Televisions 26000000\n", + "30 Education 24000000\n", + "40 Time 24000000\n", + "36 Fitness and wellness 24000000\n", + "34 Vehicles 24000000\n", + "45 TV 24000000\n", + "31 Beauty 24000000\n", + "39 Automobiles 23000000\n", + "26 Video games 23000000\n", + "29 Clothing 23000000\n", + "38 Life 23000000\n", + "32 Friendship 23000000\n", + "43 Finance 22000000\n", + "35 Instagram 22000000\n", + "94 Product (business) 21000000\n", + "51 Sales 21000000\n", + "97 United States 21000000\n", + "33 Online shopping 21000000\n", + "49 Live events 21000000\n", + "25 Computers 20000000" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "audience_var = 'mau_audience'\n", + "expat_interests.sort_values(audience_var, inplace=True, ascending=False)\n", + "expat_interests.loc[:, ['interest_name', audience_var]].head(n=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interest_namemau_audience
2013Tamil cinema270000
972Hacker (computer security)250000
1585Export250000
1640Call centre250000
1737China Central Television240000
2083Departments of France240000
1289Man (Middle-earth)240000
2092Storey230000
1351Bangkok230000
1731Zara (retailer)220000
1822Hard drives210000
1659Sale, Greater Manchester210000
1296Lenovo210000
1688Qatar200000
2093Ultras180000
17454G180000
924Huawei170000
2077Reseller170000
1808Delhi150000
1951Turkish language150000
1660IPhone 5150000
1656Middle Eastern cuisine150000
1055Million140000
1937Multi-core processor140000
460URL shortening130000
1704Bangladesh120000
791truecaller120000
2035Chinese New Year120000
2072Prophets and messengers in Islam120000
2028Nescafe120000
1190Istanbul120000
1793Cairo120000
1987Holi120000
2008Indonesian language120000
1403Hijab100000
1761Jakarta100000
1468Arab world93000
1722Condominio87000
617Academia85000
1980African Union84000
357Government83000
987Cod83000
1387Sari82000
1593Wire transfer81000
1705Limited company80000
1008Indian Premier League79000
2086Urdu74000
2097Nokia72000
552Facebook for Every Phone71000
1478Heel (shoe)55000
1297Samsung Galaxy S III52000
1698My Talking Tom41000
1644Tamil language37000
2041Salman Khan34000
1943Legal personality32000
950Synthpop17000
2053Indo pop16000
1911Telugu language14000
1861Virat Kohli13000
1558Narendra Modi11000
1915BlackBerry Messenger9300
1368Vodafone9000
1451Flipkart4700
1550Supporters of FC Barcelona4400
2003Indian pop4000
1948CCTV News3600
1527British rock3500
859UC Browser2900
1520Value-added tax2400
1729Oppo Electronics2100
810Gender1000
795Leaf1000
65Books1000
1847Motor vehicle1000
801Card games1000
1851Mining1000
789People's Liberation Army Navy1000
458LG Optimus L4 II1000
1854Sydney1000
1842Songwriter1000
1841Gucci1000
75Woman1000
79Newspapers1000
2055India News1000
87WhatsApp1000
1850Venezuela1000
792Skiing1000
91Nature1000
803ITunes Store1000
806Source code1000
903Cinema of India1000
908Mumbai1000
1629Types of business entity1000
174Dogs1000
784Vegetarianism1000
807Musician1000
1397Franchising1000
798Hollywood1000
811Carnival1000
913Pakistan1000
\n", + "
" + ], + "text/plain": [ + " interest_name mau_audience\n", + "2013 Tamil cinema 270000\n", + "972 Hacker (computer security) 250000\n", + "1585 Export 250000\n", + "1640 Call centre 250000\n", + "1737 China Central Television 240000\n", + "2083 Departments of France 240000\n", + "1289 Man (Middle-earth) 240000\n", + "2092 Storey 230000\n", + "1351 Bangkok 230000\n", + "1731 Zara (retailer) 220000\n", + "1822 Hard drives 210000\n", + "1659 Sale, Greater Manchester 210000\n", + "1296 Lenovo 210000\n", + "1688 Qatar 200000\n", + "2093 Ultras 180000\n", + "1745 4G 180000\n", + "924 Huawei 170000\n", + "2077 Reseller 170000\n", + "1808 Delhi 150000\n", + "1951 Turkish language 150000\n", + "1660 IPhone 5 150000\n", + "1656 Middle Eastern cuisine 150000\n", + "1055 Million 140000\n", + "1937 Multi-core processor 140000\n", + "460 URL shortening 130000\n", + "1704 Bangladesh 120000\n", + "791 truecaller 120000\n", + "2035 Chinese New Year 120000\n", + "2072 Prophets and messengers in Islam 120000\n", + "2028 Nescafe 120000\n", + "1190 Istanbul 120000\n", + "1793 Cairo 120000\n", + "1987 Holi 120000\n", + "2008 Indonesian language 120000\n", + "1403 Hijab 100000\n", + "1761 Jakarta 100000\n", + "1468 Arab world 93000\n", + "1722 Condominio 87000\n", + "617 Academia 85000\n", + "1980 African Union 84000\n", + "357 Government 83000\n", + "987 Cod 83000\n", + "1387 Sari 82000\n", + "1593 Wire transfer 81000\n", + "1705 Limited company 80000\n", + "1008 Indian Premier League 79000\n", + "2086 Urdu 74000\n", + "2097 Nokia 72000\n", + "552 Facebook for Every Phone 71000\n", + "1478 Heel (shoe) 55000\n", + "1297 Samsung Galaxy S III 52000\n", + "1698 My Talking Tom 41000\n", + "1644 Tamil language 37000\n", + "2041 Salman Khan 34000\n", + "1943 Legal personality 32000\n", + "950 Synthpop 17000\n", + "2053 Indo pop 16000\n", + "1911 Telugu language 14000\n", + "1861 Virat Kohli 13000\n", + "1558 Narendra Modi 11000\n", + "1915 BlackBerry Messenger 9300\n", + "1368 Vodafone 
9000\n", + "1451 Flipkart 4700\n", + "1550 Supporters of FC Barcelona 4400\n", + "2003 Indian pop 4000\n", + "1948 CCTV News 3600\n", + "1527 British rock 3500\n", + "859 UC Browser 2900\n", + "1520 Value-added tax 2400\n", + "1729 Oppo Electronics 2100\n", + "810 Gender 1000\n", + "795 Leaf 1000\n", + "65 Books 1000\n", + "1847 Motor vehicle 1000\n", + "801 Card games 1000\n", + "1851 Mining 1000\n", + "789 People's Liberation Army Navy 1000\n", + "458 LG Optimus L4 II 1000\n", + "1854 Sydney 1000\n", + "1842 Songwriter 1000\n", + "1841 Gucci 1000\n", + "75 Woman 1000\n", + "79 Newspapers 1000\n", + "2055 India News 1000\n", + "87 WhatsApp 1000\n", + "1850 Venezuela 1000\n", + "792 Skiing 1000\n", + "91 Nature 1000\n", + "803 ITunes Store 1000\n", + "806 Source code 1000\n", + "903 Cinema of India 1000\n", + "908 Mumbai 1000\n", + "1629 Types of business entity 1000\n", + "174 Dogs 1000\n", + "784 Vegetarianism 1000\n", + "807 Musician 1000\n", + "1397 Franchising 1000\n", + "798 Hollywood 1000\n", + "811 Carnival 1000\n", + "913 Pakistan 1000" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "expat_interests.loc[:, ['interest_name', audience_var]].tail(n=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like we'll need to filter out the max_pop and min_pop values." + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interest_namemau_audience
0Entertainment34000000
5Hobbies and activities33000000
10Business and industry32000000
3Shopping and fashion32000000
4Technology32000000
16Music31000000
8Sports and outdoors30000000
15Food and drink30000000
1Facebook30000000
9Sports30000000
37Entre Rios Province30000000
13Family and relationships30000000
2Social network29000000
11Consumer electronics29000000
18Food29000000
6Instant messaging28000000
7Facebook Messenger28000000
21Family27000000
12Shopping27000000
22Reading27000000
17Games27000000
28Arts and music26000000
23Love26000000
41Business26000000
19Movies26000000
20Travel26000000
24Televisions26000000
30Education24000000
40Time24000000
36Fitness and wellness24000000
34Vehicles24000000
45TV24000000
31Beauty24000000
39Automobiles23000000
26Video games23000000
29Clothing23000000
38Life23000000
32Friendship23000000
43Finance22000000
35Instagram22000000
94Product (business)21000000
51Sales21000000
97United States21000000
33Online shopping21000000
49Live events21000000
25Computers20000000
61Pets19000000
54Politics and social issues19000000
46Design19000000
44World19000000
47Beverages19000000
52Online18000000
68Photograph18000000
60Child18000000
53Fashion accessories18000000
42Association football (Soccer)18000000
50Photography17000000
154Victory17000000
14Facebook for Android17000000
77Dance17000000
64Price17000000
130Sales promotion16000000
151IPhone16000000
99Home16000000
95Motherhood16000000
92Home and garden16000000
165Twitter16000000
69Mobile app16000000
66Video16000000
63Human16000000
90Restaurants15000000
56Cosmetics15000000
88Cooking15000000
81Personal finance15000000
78Image15000000
67Current events15000000
55Happiness14000000
27Mobile phones14000000
412Mexico14000000
59Shoes14000000
70Pop music14000000
354Viral video14000000
82Brand14000000
89Gratitude13000000
103Coupons13000000
135Rings of Saturn13000000
62Free software13000000
119Outdoor recreation13000000
73Rock music13000000
93House13000000
148Facebook for Iphone13000000
86People13000000
158Physical exercise12000000
201Freight transport12000000
84Website12000000
80Country12000000
133Alcoholic beverages12000000
58Music videos12000000
410Wish12000000
116Learning12000000
\n", + "
" + ], + "text/plain": [ + " interest_name mau_audience\n", + "0 Entertainment 34000000\n", + "5 Hobbies and activities 33000000\n", + "10 Business and industry 32000000\n", + "3 Shopping and fashion 32000000\n", + "4 Technology 32000000\n", + "16 Music 31000000\n", + "8 Sports and outdoors 30000000\n", + "15 Food and drink 30000000\n", + "1 Facebook 30000000\n", + "9 Sports 30000000\n", + "37 Entre Rios Province 30000000\n", + "13 Family and relationships 30000000\n", + "2 Social network 29000000\n", + "11 Consumer electronics 29000000\n", + "18 Food 29000000\n", + "6 Instant messaging 28000000\n", + "7 Facebook Messenger 28000000\n", + "21 Family 27000000\n", + "12 Shopping 27000000\n", + "22 Reading 27000000\n", + "17 Games 27000000\n", + "28 Arts and music 26000000\n", + "23 Love 26000000\n", + "41 Business 26000000\n", + "19 Movies 26000000\n", + "20 Travel 26000000\n", + "24 Televisions 26000000\n", + "30 Education 24000000\n", + "40 Time 24000000\n", + "36 Fitness and wellness 24000000\n", + "34 Vehicles 24000000\n", + "45 TV 24000000\n", + "31 Beauty 24000000\n", + "39 Automobiles 23000000\n", + "26 Video games 23000000\n", + "29 Clothing 23000000\n", + "38 Life 23000000\n", + "32 Friendship 23000000\n", + "43 Finance 22000000\n", + "35 Instagram 22000000\n", + "94 Product (business) 21000000\n", + "51 Sales 21000000\n", + "97 United States 21000000\n", + "33 Online shopping 21000000\n", + "49 Live events 21000000\n", + "25 Computers 20000000\n", + "61 Pets 19000000\n", + "54 Politics and social issues 19000000\n", + "46 Design 19000000\n", + "44 World 19000000\n", + "47 Beverages 19000000\n", + "52 Online 18000000\n", + "68 Photograph 18000000\n", + "60 Child 18000000\n", + "53 Fashion accessories 18000000\n", + "42 Association football (Soccer) 18000000\n", + "50 Photography 17000000\n", + "154 Victory 17000000\n", + "14 Facebook for Android 17000000\n", + "77 Dance 17000000\n", + "64 Price 17000000\n", + "130 Sales promotion 16000000\n", + "151 IPhone 
16000000\n", + "99 Home 16000000\n", + "95 Motherhood 16000000\n", + "92 Home and garden 16000000\n", + "165 Twitter 16000000\n", + "69 Mobile app 16000000\n", + "66 Video 16000000\n", + "63 Human 16000000\n", + "90 Restaurants 15000000\n", + "56 Cosmetics 15000000\n", + "88 Cooking 15000000\n", + "81 Personal finance 15000000\n", + "78 Image 15000000\n", + "67 Current events 15000000\n", + "55 Happiness 14000000\n", + "27 Mobile phones 14000000\n", + "412 Mexico 14000000\n", + "59 Shoes 14000000\n", + "70 Pop music 14000000\n", + "354 Viral video 14000000\n", + "82 Brand 14000000\n", + "89 Gratitude 13000000\n", + "103 Coupons 13000000\n", + "135 Rings of Saturn 13000000\n", + "62 Free software 13000000\n", + "119 Outdoor recreation 13000000\n", + "73 Rock music 13000000\n", + "93 House 13000000\n", + "148 Facebook for Iphone 13000000\n", + "86 People 13000000\n", + "158 Physical exercise 12000000\n", + "201 Freight transport 12000000\n", + "84 Website 12000000\n", + "80 Country 12000000\n", + "133 Alcoholic beverages 12000000\n", + "58 Music videos 12000000\n", + "410 Wish 12000000\n", + "116 Learning 12000000" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option('display.max_rows', 100)\n", + "max_expat_audience = expat_interests.loc[:, audience_var].max()\n", + "min_expat_audience = expat_interests.loc[:, audience_var].min()\n", + "expat_interests_clean = expat_interests[(expat_interests.loc[:, audience_var] < max_expat_audience) &\n", + " (expat_interests.loc[:, audience_var] > min_expat_audience)]\n", + "expat_interests_clean.loc[:, ['interest_name', audience_var]].head(n=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These all look pretty reasonable! Who doesn't like `Coupons`?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's match the distribution with the American native interests, normalize for population size and then compare the distributions (overlapping histogram?? yes)." + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "original data has 4834 rows\n", + "clean data has 4834 rows\n" + ] + } + ], + "source": [ + "US_MX_interests = pd.read_csv('../../data/query_results/US_MX_native_interests_top_3000_interest_new_tmp.tsv', sep='\\t', index_col=False)\n", + "US_MX_interests = clean_interest_data(US_MX_interests)\n", + "US_interests = US_MX_interests[US_MX_interests.loc[:, 'location'] == 'US']\n", + "MX_interests = US_MX_interests[US_MX_interests.loc[:, 'location'] == 'MX']" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "# get rid of max values\n", + "max_US_audience = US_interests.loc[:, audience_var].max()\n", + "max_MX_audience = MX_interests.loc[:, audience_var].max()\n", + "min_US_audience = US_interests.loc[:, audience_var].min()\n", + "min_MX_audience = MX_interests.loc[:, audience_var].min()\n", + "US_interests = US_interests[(US_interests.loc[:, audience_var] < max_US_audience) &\n", + " (US_interests.loc[:, audience_var] > min_US_audience)]\n", + "MX_interests = MX_interests[(MX_interests.loc[:, audience_var] < max_MX_audience) &\n", + " (MX_interests.loc[:, audience_var] > min_MX_audience)]" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interest_namemau_audience
2Sports and outdoors103000000
12Automobiles77000000
16Politics and social issues70000000
14Live events68000000
20Home and garden57000000
24Li Ke48000000
28Homo sapiens45000000
26Imagem41000000
1368Walmart39000000
58Pinterest37000000
904Hu Ge35000000
38Real estate30000000
684People (magazine)30000000
604U.S. state29000000
22Women's clothing29000000
126Meme28000000
1314Republican Party (United States)28000000
1374Mother's Day28000000
3362Pandora Radio27000000
560Donald Trump27000000
498Hollywood27000000
18Software27000000
548Symptom26000000
98Netflix26000000
1302Popular music26000000
62Wealth26000000
52Streaming media25000000
36Spotify25000000
630Patient25000000
894Democratic Party (United States)25000000
82Exhibition game25000000
688BuzzFeed24000000
474TV game shows24000000
1404Florida24000000
1634OMG (song)23000000
862National Football League23000000
208Virus22000000
146Trucks22000000
1494Real estate broker22000000
158High school22000000
48Vacations21000000
1364Texas21000000
106Military21000000
218North America21000000
1058College football20000000
462Grandparent20000000
3478The Weather Channel20000000
2386Variety show20000000
290President of the United States20000000
782Tasty19000000
152Hunting19000000
102Barbecue19000000
1502National Football League on television19000000
312Home (2009 film)19000000
316Truth19000000
1628County (United States)19000000
276Job19000000
348Fishing19000000
1962Limited liability company19000000
370Beyin18000000
528New York City18000000
588Nursing18000000
762California18000000
124Health care18000000
230Cancer awareness18000000
92Dinner18000000
68Renting18000000
2056County seat18000000
50Short Message Service17000000
3148NBC17000000
56Music download17000000
1338Quiz17000000
278Farm17000000
758Swimsuit17000000
2404Groupon16000000
532National Basketball Association16000000
3076American folk music16000000
222Acting16000000
1136Boyfriend16000000
1142Food Network16000000
3280Genius16000000
710Internet meme16000000
380Camping16000000
2650Rugby league16000000
994Window16000000
280Bathing16000000
972Character (arts)16000000
2756America (band)16000000
1198Popular culture15000000
734Grilling15000000
324Day school15000000
1694Try15000000
472Lawyer15000000
302Sense15000000
314Performing arts15000000
86Phil Spector15000000
4592Old age15000000
30Men's clothing15000000
70Chef15000000
3842Medical sign15000000
\n", + "
" + ], + "text/plain": [ + " interest_name mau_audience\n", + "2 Sports and outdoors 103000000\n", + "12 Automobiles 77000000\n", + "16 Politics and social issues 70000000\n", + "14 Live events 68000000\n", + "20 Home and garden 57000000\n", + "24 Li Ke 48000000\n", + "28 Homo sapiens 45000000\n", + "26 Imagem 41000000\n", + "1368 Walmart 39000000\n", + "58 Pinterest 37000000\n", + "904 Hu Ge 35000000\n", + "38 Real estate 30000000\n", + "684 People (magazine) 30000000\n", + "604 U.S. state 29000000\n", + "22 Women's clothing 29000000\n", + "126 Meme 28000000\n", + "1314 Republican Party (United States) 28000000\n", + "1374 Mother's Day 28000000\n", + "3362 Pandora Radio 27000000\n", + "560 Donald Trump 27000000\n", + "498 Hollywood 27000000\n", + "18 Software 27000000\n", + "548 Symptom 26000000\n", + "98 Netflix 26000000\n", + "1302 Popular music 26000000\n", + "62 Wealth 26000000\n", + "52 Streaming media 25000000\n", + "36 Spotify 25000000\n", + "630 Patient 25000000\n", + "894 Democratic Party (United States) 25000000\n", + "82 Exhibition game 25000000\n", + "688 BuzzFeed 24000000\n", + "474 TV game shows 24000000\n", + "1404 Florida 24000000\n", + "1634 OMG (song) 23000000\n", + "862 National Football League 23000000\n", + "208 Virus 22000000\n", + "146 Trucks 22000000\n", + "1494 Real estate broker 22000000\n", + "158 High school 22000000\n", + "48 Vacations 21000000\n", + "1364 Texas 21000000\n", + "106 Military 21000000\n", + "218 North America 21000000\n", + "1058 College football 20000000\n", + "462 Grandparent 20000000\n", + "3478 The Weather Channel 20000000\n", + "2386 Variety show 20000000\n", + "290 President of the United States 20000000\n", + "782 Tasty 19000000\n", + "152 Hunting 19000000\n", + "102 Barbecue 19000000\n", + "1502 National Football League on television 19000000\n", + "312 Home (2009 film) 19000000\n", + "316 Truth 19000000\n", + "1628 County (United States) 19000000\n", + "276 Job 19000000\n", + "348 Fishing 19000000\n", + "1962 
Limited liability company 19000000\n", + "370 Beyin 18000000\n", + "528 New York City 18000000\n", + "588 Nursing 18000000\n", + "762 California 18000000\n", + "124 Health care 18000000\n", + "230 Cancer awareness 18000000\n", + "92 Dinner 18000000\n", + "68 Renting 18000000\n", + "2056 County seat 18000000\n", + "50 Short Message Service 17000000\n", + "3148 NBC 17000000\n", + "56 Music download 17000000\n", + "1338 Quiz 17000000\n", + "278 Farm 17000000\n", + "758 Swimsuit 17000000\n", + "2404 Groupon 16000000\n", + "532 National Basketball Association 16000000\n", + "3076 American folk music 16000000\n", + "222 Acting 16000000\n", + "1136 Boyfriend 16000000\n", + "1142 Food Network 16000000\n", + "3280 Genius 16000000\n", + "710 Internet meme 16000000\n", + "380 Camping 16000000\n", + "2650 Rugby league 16000000\n", + "994 Window 16000000\n", + "280 Bathing 16000000\n", + "972 Character (arts) 16000000\n", + "2756 America (band) 16000000\n", + "1198 Popular culture 15000000\n", + "734 Grilling 15000000\n", + "324 Day school 15000000\n", + "1694 Try 15000000\n", + "472 Lawyer 15000000\n", + "302 Sense 15000000\n", + "314 Performing arts 15000000\n", + "86 Phil Spector 15000000\n", + "4592 Old age 15000000\n", + "30 Men's clothing 15000000\n", + "70 Chef 15000000\n", + "3842 Medical sign 15000000" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "US_interests.sort_values(audience_var, inplace=True, ascending=False)\n", + "US_interests.loc[:, ['interest_name', audience_var]].head(n=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interest_namemau_audience
2798African Union120000
3682Space age pop120000
4248Worldbeat110000
2070Franco De Vita110000
4788Public sector110000
1444Samsung Galaxy S III110000
3586Handball110000
3724International News Service v. Associated Press110000
1798Heel (shoe)110000
3600Ronaldinho100000
4776Ottoman Empire100000
1778Arab world100000
2724Legal personality98000
4448Comune93000
3864Member states of the Arab League92000
3348Tunisia92000
3672Jangle pop91000
2242Spain national under-21 football team88000
1590F.C. Porto85000
4470Juventus F.C.85000
3680Malaysian pop85000
3178Argentina national football team85000
3480Deepika Padukone81000
3196Kuwait79000
3522Sheikh78000
3518States and union territories of India77000
1432El Clasico76000
4270Sport Club Internacional75000
4478Ali73000
3788Renault70000
4010Audi A769000
3198Thai baht68000
4122A.C. Milan66000
3008Urdu66000
2234My Talking Tom66000
3086Shah Rukh Khan65000
4476Quezon City65000
3896Cable65000
3954Vivo (telecommunications)61000
1452Peso58000
2892Nescafe58000
4386100 metres57000
3534SMS (hydrology software)56000
3326Algeria53000
3378Atletico Madrid53000
4338Xiaomi51000
3866Rede Globo49000
3578Russian pop48000
3890Frases47000
4352Shraddha Kapoor47000
3560Languages of India43000
4148New Delhi40000
4576Alia Bhatt38000
4578Urdu poetry37000
3044Puma SE36000
2918Salman Khan33000
3746Chord names and symbols (popular music)32000
4052Bengali language32000
4332200 metres29000
4792God in Islam29000
4714Passion (Christianity)28000
2126Tamil language28000
1954Narendra Modi28000
1744Flipkart28000
3914Orange S.A.27000
3292Maharashtra26000
3356MercadoLibre.com22000
3792Cifras21000
4710Peugeot21000
4050India national cricket team19000
4250Indian Army19000
4590Arab League18000
2560Virat Kohli17000
2660Telugu language17000
3462Paytm15000
3670Sunshine pop15000
4518Egyptian Arabic14000
2942Indo pop13000
4684Musica sertaneja12000
1584Vodafone12000
3524Bikin9900
2668BlackBerry Messenger9200
3346Carrefour9100
2296Oppo Electronics8900
4460Bandung8800
3662Operatic pop8300
3276Mahendra Singh Dhoni7200
4172BBC News Online7100
3038Grand Prix of Portland6900
1938Supporters of FC Barcelona6700
2734CCTV News6300
3556Wonky pop6100
608UC Browser4300
4382Dari (Persian dialect)4200
3690Sophisti-pop4000
1882Value-added tax3900
2842Indian pop3500
2096Types of business entity2600
2946India News2100
3602V-pop1900
\n", + "
" + ], + "text/plain": [ + " interest_name mau_audience\n", + "2798 African Union 120000\n", + "3682 Space age pop 120000\n", + "4248 Worldbeat 110000\n", + "2070 Franco De Vita 110000\n", + "4788 Public sector 110000\n", + "1444 Samsung Galaxy S III 110000\n", + "3586 Handball 110000\n", + "3724 International News Service v. Associated Press 110000\n", + "1798 Heel (shoe) 110000\n", + "3600 Ronaldinho 100000\n", + "4776 Ottoman Empire 100000\n", + "1778 Arab world 100000\n", + "2724 Legal personality 98000\n", + "4448 Comune 93000\n", + "3864 Member states of the Arab League 92000\n", + "3348 Tunisia 92000\n", + "3672 Jangle pop 91000\n", + "2242 Spain national under-21 football team 88000\n", + "1590 F.C. Porto 85000\n", + "4470 Juventus F.C. 85000\n", + "3680 Malaysian pop 85000\n", + "3178 Argentina national football team 85000\n", + "3480 Deepika Padukone 81000\n", + "3196 Kuwait 79000\n", + "3522 Sheikh 78000\n", + "3518 States and union territories of India 77000\n", + "1432 El Clasico 76000\n", + "4270 Sport Club Internacional 75000\n", + "4478 Ali 73000\n", + "3788 Renault 70000\n", + "4010 Audi A7 69000\n", + "3198 Thai baht 68000\n", + "4122 A.C. 
Milan 66000\n", + "3008 Urdu 66000\n", + "2234 My Talking Tom 66000\n", + "3086 Shah Rukh Khan 65000\n", + "4476 Quezon City 65000\n", + "3896 Cable 65000\n", + "3954 Vivo (telecommunications) 61000\n", + "1452 Peso 58000\n", + "2892 Nescafe 58000\n", + "4386 100 metres 57000\n", + "3534 SMS (hydrology software) 56000\n", + "3326 Algeria 53000\n", + "3378 Atletico Madrid 53000\n", + "4338 Xiaomi 51000\n", + "3866 Rede Globo 49000\n", + "3578 Russian pop 48000\n", + "3890 Frases 47000\n", + "4352 Shraddha Kapoor 47000\n", + "3560 Languages of India 43000\n", + "4148 New Delhi 40000\n", + "4576 Alia Bhatt 38000\n", + "4578 Urdu poetry 37000\n", + "3044 Puma SE 36000\n", + "2918 Salman Khan 33000\n", + "3746 Chord names and symbols (popular music) 32000\n", + "4052 Bengali language 32000\n", + "4332 200 metres 29000\n", + "4792 God in Islam 29000\n", + "4714 Passion (Christianity) 28000\n", + "2126 Tamil language 28000\n", + "1954 Narendra Modi 28000\n", + "1744 Flipkart 28000\n", + "3914 Orange S.A. 
27000\n", + "3292 Maharashtra 26000\n", + "3356 MercadoLibre.com 22000\n", + "3792 Cifras 21000\n", + "4710 Peugeot 21000\n", + "4050 India national cricket team 19000\n", + "4250 Indian Army 19000\n", + "4590 Arab League 18000\n", + "2560 Virat Kohli 17000\n", + "2660 Telugu language 17000\n", + "3462 Paytm 15000\n", + "3670 Sunshine pop 15000\n", + "4518 Egyptian Arabic 14000\n", + "2942 Indo pop 13000\n", + "4684 Musica sertaneja 12000\n", + "1584 Vodafone 12000\n", + "3524 Bikin 9900\n", + "2668 BlackBerry Messenger 9200\n", + "3346 Carrefour 9100\n", + "2296 Oppo Electronics 8900\n", + "4460 Bandung 8800\n", + "3662 Operatic pop 8300\n", + "3276 Mahendra Singh Dhoni 7200\n", + "4172 BBC News Online 7100\n", + "3038 Grand Prix of Portland 6900\n", + "1938 Supporters of FC Barcelona 6700\n", + "2734 CCTV News 6300\n", + "3556 Wonky pop 6100\n", + "608 UC Browser 4300\n", + "4382 Dari (Persian dialect) 4200\n", + "3690 Sophisti-pop 4000\n", + "1882 Value-added tax 3900\n", + "2842 Indian pop 3500\n", + "2096 Types of business entity 2600\n", + "2946 India News 2100\n", + "3602 V-pop 1900" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "US_interests.loc[:, ['interest_name', audience_var]].tail(n=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No more `Coupons`. Let's see how different these distributions are." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "interest_vars = ['interest_name', audience_var]\n", + "expat_interest_normed = expat_interests_clean.loc[:, interest_vars]\n", + "US_interest_normed = US_interests.loc[:, interest_vars]\n", + "MX_interest_normed = MX_interests.loc[:, interest_vars]\n", + "expat_interest_normed.loc[:, audience_var] = expat_interest_normed.loc[:, audience_var] / max_expat_audience\n", + "US_interest_normed.loc[:, audience_var] = US_interest_normed.loc[:, audience_var] / max_US_audience\n", + "MX_interest_normed.loc[:, audience_var] = MX_interest_normed.loc[:, audience_var] / max_MX_audience" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1447 shared interests\n" + ] + } + ], + "source": [ + "shared_interests = list(set(expat_interest_normed.loc[:, 'interest_name'].unique()) & set(US_interest_normed.loc[:, 'interest_name'].unique()))\n", + "print('%d shared interests'%(len(shared_interests)))" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
interest_namemau_audience_expatmau_audience_USmau_audience_MX
10901,000,000,0000.0242110.0281050.006000
12341080p0.0171050.0104580.011273
134020th Century Fox0.0100000.0056860.076364
4613D computer graphics0.0578950.0372550.021818
13904G0.0047370.0037910.004909
\n", + "
" + ], + "text/plain": [ + " interest_name mau_audience_expat mau_audience_US \\\n", + "1090 1,000,000,000 0.024211 0.028105 \n", + "1234 1080p 0.017105 0.010458 \n", + "1340 20th Century Fox 0.010000 0.005686 \n", + "461 3D computer graphics 0.057895 0.037255 \n", + "1390 4G 0.004737 0.003791 \n", + "\n", + " mau_audience_MX \n", + "1090 0.006000 \n", + "1234 0.011273 \n", + "1340 0.076364 \n", + "461 0.021818 \n", + "1390 0.004909 " + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_interests = pd.merge(expat_interest_normed.rename(columns={audience_var:'%s_expat'%(audience_var)}), \n", + " US_interest_normed.rename(columns={audience_var:'%s_US'%(audience_var)}), on='interest_name')\n", + "combined_interests = pd.merge(combined_interests, \n", + " MX_interest_normed.rename(columns={audience_var:'%s_MX'%(audience_var)}), on='interest_name')\n", + "combined_interests.sort_values('interest_name', inplace=True, ascending=True)\n", + "combined_interests.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ax = plt.subplot(111)\n", + "x = pd.np.arange(combined_interests.shape[0])\n", + "ax.plot(x, combined_interests.loc[:, '%s_expat'%(audience_var)], label='expat', color=[0.05,0.05,0.95,0.3])\n", + "ax.plot(x, combined_interests.loc[:, '%s_US'%(audience_var)], label='US', color=[0.95,0.05,0.05,0.3])\n", + "ax.plot(x, combined_interests.loc[:, '%s_MX'%(audience_var)], label='MX', color=[0.05,0.95,0.05,0.3])\n", + "ax.set_xlabel('Interest index')\n", + "ax.set_ylabel('% of population with interest')\n", + "ax.legend(loc='upper right')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like pretty close overlap on the whole, although the expat counts seem spikier 
(consistently higher than the other categories on maxima)." + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "matplotlib.rcParams['lines.markeredgewidth'] = 0\n", + "pal = sns.color_palette('Blues')\n", + "g = sns.pairplot(combined_interests.iloc[:, 1:], \n", + " markers='o', palette=pal, \n", + " diag_kind='kde', kind='reg')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rough confirmation that the Ex-pats' interests look more like those of the US population than those of the MX population." + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "US R=9.437E-01 (p=0.000E+00)\n", + "MX R=7.355E-01 (p=3.327E-245)\n" + ] + } + ], + "source": [ + "from scipy.stats import pearsonr\n", + "countries = ['US', 'MX']\n", + "for c in countries:\n", + " corr, pval = pearsonr(combined_interests.loc[:, '%s_%s'%(audience_var, c)],\n", + " combined_interests.loc[:, '%s_expat'%(audience_var)])\n", + " print('%s R=%.3E (p=%.3E)'%(c, corr, pval))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have established a correlation between expat interests and US interests, we should actually define the assimilation metric and compute that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# stolen from display_results.py (https://drive.google.com/file/d/1QzX4Re77H7PJrXk84qGHOB5RZ7FKjVzn)\n", + "\n", + "def global_score(target_file, dest_file, home_file, score_type_, nb_int):\n", + " \"\"\"\n", + " This function computes the assimilation score for a given target population coming from some home population and\n", + " trying to assimilate to a certain dest population\n", + " :param target_file: File containing interests audiences for the target population\n", + " :param dest_file: File containing interests audiences for the dest population\n", + " :param home_file: File containing interests audiences for the home population\n", + " :param score_type_: String indicating if the score should be computed using subtraction or division\n", + " :param nb_int: Number of interests to consider\n", + " :return: scores: the per-interest assimilation scores for each of the most dest-specific interests\n", + " nb_target: the size of the target population\n", + " \"\"\"\n", + " target_data = pd.read_csv(target_file, index_col=0)\n", + " dest_data = pd.read_csv(dest_file, index_col=0)\n", + " home_data = pd.read_csv(home_file, index_col=0)\n", + "\n", + " # Remove hand-picked interests\n", + " target_audience = target_data['audience'][0:3000]\n", + " dest_audience = dest_data['audience'][0:3000]\n", + " home_audience = home_data['audience'][0:3000]\n", + "\n", + " nb_target = target_audience[0]\n", + " nb_dest = dest_audience[0]\n", + " nb_home = home_audience[0]\n", + "\n", + " # Remove erroneous audiences\n", + " target_errors = (target_audience != nb_target)\n", + " dest_errors = (dest_audience != nb_dest)\n", + " home_errors = (home_audience != nb_home)\n", + " errors = target_errors | dest_errors | home_errors\n", + " target_audience = target_audience[errors]\n", + " dest_audience = dest_audience[errors]\n", + " home_audience = home_audience[errors]\n", + "\n", 
+ " # Select a certain number of interests\n", + " random.seed(0)\n", + " int_ind = random.sample(list(dest_audience.index), nb_int)\n", + " int_ind = np.sort(int_ind)\n", + "\n", + " target_audience = target_audience[int_ind]\n", + " dest_audience = dest_audience[int_ind]\n", + " home_audience = home_audience[int_ind]\n", + "\n", + " # Compute activity level\n", + " target_nb_interests = target_audience.shape[0]\n", + " total_nb_interested_target = target_audience.sum(0)\n", + " dest_nb_interests = dest_audience.shape[0]\n", + " total_nb_interested_dest = dest_audience.sum(0)\n", + " home_nb_interests = home_audience.shape[0]\n", + " total_nb_interested_home = home_audience.sum(0)\n", + "\n", + " # Compute interest ratios\n", + " target_ir = target_audience.values / float(total_nb_interested_target)\n", + " dest_ir = dest_audience.values / float(total_nb_interested_dest)\n", + " home_ir = home_audience.values / float(total_nb_interested_home)\n", + "\n", + " # Keep only 'dest' interests\n", + " dest_indexes = dest_ir > home_ir\n", + " g_dest_ir = dest_ir[dest_indexes]\n", + " g_home_ir = home_ir[dest_indexes]\n", + " g_target_ir = target_ir[dest_indexes]\n", + "\n", + " # Keep only 'very dest' interests\n", + " if score_type_ == '-':\n", + " dest_home_perc = np.percentile(g_dest_ir - g_home_ir, TOP_PERC)\n", + " very_dest_indexes = (g_dest_ir - g_home_ir) > dest_home_perc\n", + " else:\n", + " dest_home_perc = np.percentile(g_dest_ir / g_home_ir, TOP_PERC)\n", + " very_dest_indexes = ((g_dest_ir / g_home_ir) > dest_home_perc)\n", + "\n", + " vg_dest_ir = g_dest_ir[very_dest_indexes]\n", + " vg_target_ir = g_target_ir[very_dest_indexes]\n", + "\n", + " # Compute scores\n", + " if score_type_ == '-':\n", + " scores = vg_target_ir - vg_dest_ir\n", + " else:\n", + " scores = vg_target_ir / vg_dest_ir\n", + "\n", + " return scores, nb_target" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
interest_ratio():\n", + " pass" + ] } ], "metadata": { diff --git a/src/data_processing/get_top_k_interest_query.py b/src/data_processing/get_top_k_interest_query.py index 7ea7dd6..2676232 100644 --- a/src/data_processing/get_top_k_interest_query.py +++ b/src/data_processing/get_top_k_interest_query.py @@ -11,8 +11,8 @@ def main(): parser = ArgumentParser() # parser.add_argument('--interest_count_file', default='data/all_FB_interests_2016/all_FB_interests_2016.csv') - parser.add_argument('--interest_sorted_file', default='data/top_interests_complete.json') - parser.add_argument('--query_file', default='data/queries/US_MX_native_interests.json') + parser.add_argument('--interest_sorted_file', default='data/top_interests_complete_clean.json') + parser.add_argument('--query_file', default='data/queries/hispanic_MX_expats.json') parser.add_argument('--top_k', default=3000) args = parser.parse_args() # interest_count_file = args.interest_count_file @@ -38,7 +38,6 @@ def main(): ## write out_file = query_file.replace('.json', '_top_%d_interest.json'%(top_k)) - print(out_file) json.dump(query, open(out_file, 'w'), indent=4, encoding='latin1') if __name__ == '__main__': diff --git a/src/data_processing/mine_facebook_audience.py b/src/data_processing/mine_facebook_audience.py index 909312a..373ab1f 100644 --- a/src/data_processing/mine_facebook_audience.py +++ b/src/data_processing/mine_facebook_audience.py @@ -17,15 +17,18 @@ def main(): # parser.add_argument('--query_file', default='data/hispanic_expat_lang_age.json') # parser.add_argument('--query_file', default='data/hispanic_lang_age.json') # parser.add_argument('--query_file', default='data/US_MX_native_interests.json') - parser.add_argument('--query_file', default='data/queries/US_MX_native_interests_top_3000_interest_new.json') +# parser.add_argument('--query_file', default='data/queries/US_MX_native_interests_top_3000_interest_new.json') + parser.add_argument('--query_file', 
default='data/queries/hispanic_MX_expats_top_3000_interest.json') + parser.add_argument('--interest_file', default='data/top_interests_complete_names.csv') parser.add_argument('--out_dir', default='data/query_results/') + parser.add_argument('--response_file', default=None) args = parser.parse_args() query_file = args.query_file out_dir = args.out_dir + response_file = args.response_file ## TEST: try multiple queries at once extra_auth_files = ['data/facebook_auth_ingmar.csv'] -# extra_auth_files = ['data/facebook_auth.csv',]*2 ## temporary: remove interest IDs that we've already queried # response_file = 'dataframe_collecting_1527334686.csv' @@ -39,9 +42,7 @@ def main(): # print(tmp_query_file) # json.dump(leftover_query, open(tmp_query_file, 'w'), indent=4) - query_and_write(query_file, out_dir, extra_auth_files=extra_auth_files) -# query_and_write(query_file, out_dir, extra_auth_files=extra_auth_files) -# query_and_write(query_file, out_dir, extra_auth_files=extra_auth_files, response_file=response_file) + query_and_write(query_file, out_dir, extra_auth_files=extra_auth_files, response_file=response_file) ## TODO: periodically copy response to server ## so we can tell when something goes ## wrong even if we're not on the same machine