{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# CDCR\n", "\n", "* We follow the blogpost on and see if we get it to work.\n", "\n", "### adjustments\n", "\n", "1. I could not get Cython to work, so we create the cosine matrix using sklearn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!pip install textpack\n", "#!pip install Cython --upgrade" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(276416, 16)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
case_idtrade_nmlegal_namestreet_addr_1_txtcty_nmst_cdzip_cdnaic_cdnaics_code_descriptioncase_violtn_cntcmp_assd_cntee_violtd_cntbw_atp_amtee_atp_cntfindings_start_datefindings_end_date
01473280.0EMTAErie Metropolitan Transit Authority127 East 14th StreetEriePA16503.09770.0Local Transit Systems3.00.02.0570.001.04/22/054/20/07
11768364.0Heartland Living & Rehab.Heartoland of Greensboro, Inc.1131 North Church St.GreensboroNC27401.0623110.0Nursing Care Facilities3.00.02.04528.982.011/30/1311/28/15
21692721.0Eli's TableJoseph's Cafe, LLC129 Meeting StreetCharlestonSC29401.0722110.0Full-Service Restaurants4.00.04.01033.804.010/1/115/29/13
31768367.0McDowell County Dept. Social ServicesMcDowell County Local Government60 East Court StreetMarionNC28752.09890.0Other Local Governmental Facilities2.00.01.0586.421.011/9/1311/6/15
41768533.0Marietta Aesthetic CenterSanda Gane European Day Spa & Hair Studio3405 Dallas Hwy.MariettaGA30064.0812112.0Beauty Salons15.00.014.03873.6014.08/3/138/8/15
\n", "
" ], "text/plain": [ " case_id trade_nm \\\n", "0 1473280.0 EMTA \n", "1 1768364.0 Heartland Living & Rehab. \n", "2 1692721.0 Eli's Table \n", "3 1768367.0 McDowell County Dept. Social Services \n", "4 1768533.0 Marietta Aesthetic Center \n", "\n", " legal_name street_addr_1_txt \\\n", "0 Erie Metropolitan Transit Authority 127 East 14th Street \n", "1 Heartoland of Greensboro, Inc. 1131 North Church St. \n", "2 Joseph's Cafe, LLC 129 Meeting Street \n", "3 McDowell County Local Government 60 East Court Street \n", "4 Sanda Gane European Day Spa & Hair Studio 3405 Dallas Hwy. \n", "\n", " cty_nm st_cd zip_cd naic_cd naics_code_description \\\n", "0 Erie PA 16503.0 9770.0 Local Transit Systems \n", "1 Greensboro NC 27401.0 623110.0 Nursing Care Facilities \n", "2 Charleston SC 29401.0 722110.0 Full-Service Restaurants \n", "3 Marion NC 28752.0 9890.0 Other Local Governmental Facilities \n", "4 Marietta GA 30064.0 812112.0 Beauty Salons \n", "\n", " case_violtn_cnt cmp_assd_cnt ee_violtd_cnt bw_atp_amt ee_atp_cnt \\\n", "0 3.0 0.0 2.0 570.00 1.0 \n", "1 3.0 0.0 2.0 4528.98 2.0 \n", "2 4.0 0.0 4.0 1033.80 4.0 \n", "3 2.0 0.0 1.0 586.42 1.0 \n", "4 15.0 0.0 14.0 3873.60 14.0 \n", "\n", " findings_start_date findings_end_date \n", "0 4/22/05 4/20/07 \n", "1 11/30/13 11/28/15 \n", "2 10/1/11 5/29/13 \n", "3 11/9/13 11/6/15 \n", "4 8/3/13 8/8/15 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity \n", "#from sparse_dot_topn import awesome_cossim_topn\n", "\n", "# Import your data to a Pandas.DataFrame\n", "df = pd.read_csv('./dol-data.csv')\n", "\n", "print(df.shape)\n", "df.head()\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Instaniate our lookup hash table\n", "group_lookup = {}\n", "\n", "\n", "# Write a function 
for cleaning strings and returning an array of ngrams\n", "def ngrams_analyzer(string, n=3):\n", "    # Remove commas, hyphens, periods and slashes from `string`, then return\n", "    # the list of its overlapping character n-grams of length `n` (default 3).\n", "    # A cleaned string shorter than `n` yields an empty list.\n", "    # The hyphen sits last in the character class so it is a literal '-'\n", "    # instead of forming a ',' to '.' range.\n", "    string = re.sub(r'[,./-]', r'', string)\n", "    ngrams = zip(*[string[i:] for i in range(n)])\n", "    return [''.join(ngram) for ngram in ngrams]\n", "\n", "\n", "def find_group(row, col):\n", "    # Return the group already recorded for `row` or, failing that, for `col`.\n", "    # `row` wins when both strings are known; the result is None when neither\n", "    # string has been given a group yet.\n", "    if row in group_lookup:\n", "        return group_lookup[row]\n", "    return group_lookup.get(col)\n", "\n", "\n", "def add_vals_to_lookup(group, row, col):\n", "    # Record `group` as the group of both strings, overwriting any group\n", "    # either of them had before.\n", "    group_lookup[row] = group\n", "    group_lookup[col] = group\n", "\n", "\n", "def add_pair_to_lookup(row, col):\n", "    # Put `row` and `col` into the same group in group_lookup.\n", "    group = find_group(row, col)  # reuse a group if either string has one\n", "    if group is None:\n", "        # Neither string is grouped yet, so start a new group.\n", "        # The name is arbitrary, so just make it the row.\n", "        group = row\n", "    # NOTE: if row and col already sit in different groups, only these two\n", "    # strings are moved to row's group; other members of col's old group\n", "    # keep their previous label.\n", "    add_vals_to_lookup(group, row, col)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "TfidfVectorizer?" 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(237573,)\n", "(10000, 9989)\n" ] } ], "source": [ "# Construct your vectorizer for building the TF-IDF matrix\n", "vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,3)) # (analyzer=ngrams_analyzer)\n", "\n", "# Grab the column you'd like to group, filter out duplicate values\n", "# and make sure the values are Unicode\n", "vals = df['legal_name'].unique().astype('U')\n", "print(vals.shape)\n", "# Build the matrix!!!\n", "tfidf_matrix = vectorizer.fit_transform(vals[:10000]) # use only the first 10000 values to save time in the next step\n", "print(tfidf_matrix.shape)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.48 s, sys: 379 ms, total: 3.86 s\n", "Wall time: 3.9 s\n" ] } ], "source": [ "%%time \n", "#cosine_matrix = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), vals.size, 0.8)\n", "\n", "cosine_matrix= cosine_similarity(tfidf_matrix, dense_output=False)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "14608.310323422866 0.8644993681751015\n", "CPU times: user 6.21 s, sys: 183 ms, total: 6.4 s\n", "Wall time: 6.43 s\n" ] } ], "source": [ "%%time \n", "# set all values lower than .6 to 0\n", "\n", "cosine_matrix=cosine_matrix.multiply(cosine_matrix >= .6)\n", "print((cosine_matrix.data).sum(), (cosine_matrix.data).mean())" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6404" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DF= cosine_matrix.sum(axis=1)\n", "(DF>1).sum()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "235667\n", 
"CPU times: user 207 ms, sys: 10 ms, total: 217 ms\n", "Wall time: 235 ms\n" ] } ], "source": [ "%%time\n", "# for each row and column in coo_matrix\n", "# if they're not the same string add them to the group lookup\n", "for row, col in zip(coo_matrix.row, coo_matrix.col):\n", " if row != col:\n", " add_pair_to_lookup(vals[row], vals[col])\n", "\n", "df['Group'] = df['legal_name'].map(group_lookup).fillna(df['legal_name'])\n", "\n", "#df.to_csv('./dol-data-grouped.csv')\n", "print(df.Group.unique().size)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "US POSTAL SERVICE 602\n", "Wal-Mart Stores East LP 144\n", "USPS 127\n", "Victory Enterprises, Inc. 119\n", "JAG Construction, Inc 114\n", " ... \n", "Natives Restaurant, LLC 1\n", "Edgemont Enterprises, Inc. 1\n", "Barclays Capital Inc. 1\n", "Standards Home Care, Inc. 1\n", "New North Florida Cooperative Association 1\n", "Name: Group, Length: 235666, dtype: int64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Group.value_counts()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
case_idtrade_nmlegal_namestreet_addr_1_txtcty_nmst_cdzip_cdnaic_cdnaics_code_descriptioncase_violtn_cntcmp_assd_cntee_violtd_cntbw_atp_amtee_atp_cntfindings_start_datefindings_end_dateGroup
3081448316.0Wal-MartWal-Mart Stores East LP103 W. PolkWarsawMO65355.0452910.0Warehouse Clubs and Supercenters0.00.00.00.000.07/17/047/14/06Wal-Mart Stores East LP
18861503906.0Wal-MartWal-Mart Stores, Inc.702 S.W. 8th St.BentonvilleAR72716.045211.0Department Stores154.00.0154.042844.39154.02/3/021/31/07Wal-Mart Stores East LP
32751642363.0WalmartWalmart Stores, Inc.1091 Millcreek RoadAllentownPA18106.0452990.0All Other General Merchandise Stores0.00.01.00.000.03/11/103/10/12Wal-Mart Stores East LP
65181487151.0Wal-MartWal-Mart Stores, Inc.3209 S. Louise AveSioux FallsSD57106.0452990.0All Other General Merchandise Stores1.00.00.00.000.07/23/079/12/07Wal-Mart Stores East LP
85941446927.0Wal-Mart #1069Wal-Mart Stores, Inc.231 Eastside DriveNewtonMS39345.0452910.0Warehouse Clubs and Supercenters0.00.00.00.000.06/18/046/9/06Wal-Mart Stores East LP
......................................................
2490031403735.0Wal-Mart SupercenterWal-Mart Stores, Inc.11250 E. Colonial Dr.OrlandoFL32817.04529.0Other General Merchandise Stores1.00.01.04622.401.02/5/055/6/05Wal-Mart Stores East LP
2552541841415.0Walmart Supercenter #3229Walmart Stores, Inc.3800 U.S. Highway 287 W.VernonTX76384.0452112.0Discount Department Stores0.00.00.00.000.01/11/161/10/18Wal-Mart Stores East LP
2574781855178.0Wal-MartWal-Mart Stores, Inc.1400 S. Lamb BlvdLas VegasNV89104.0452112.0Discount Department Stores0.00.01.00.000.09/16/1712/18/17Wal-Mart Stores East LP
2623001842192.0Walmart Store #2474Walmart Stores, Inc.Railroad 4 Box 82KeyserWV26726.0452910.0Warehouse Clubs and Supercenters1.00.01.00.000.012/17/1612/17/17Wal-Mart Stores East LP
2741931862876.0Wal-Mart SupercenterWal-Mart Stores East LP950 Hwy. 80 EastClintonMS39056.0452112.0Discount Department Stores0.00.01.00.000.03/28/186/4/18Wal-Mart Stores East LP
\n", "

144 rows × 17 columns

\n", "
" ], "text/plain": [ " case_id trade_nm legal_name \\\n", "308 1448316.0 Wal-Mart Wal-Mart Stores East LP \n", "1886 1503906.0 Wal-Mart Wal-Mart Stores, Inc. \n", "3275 1642363.0 Walmart Walmart Stores, Inc. \n", "6518 1487151.0 Wal-Mart Wal-Mart Stores, Inc. \n", "8594 1446927.0 Wal-Mart #1069 Wal-Mart Stores, Inc. \n", "... ... ... ... \n", "249003 1403735.0 Wal-Mart Supercenter Wal-Mart Stores, Inc. \n", "255254 1841415.0 Walmart Supercenter #3229 Walmart Stores, Inc. \n", "257478 1855178.0 Wal-Mart Wal-Mart Stores, Inc. \n", "262300 1842192.0 Walmart Store #2474 Walmart Stores, Inc. \n", "274193 1862876.0 Wal-Mart Supercenter Wal-Mart Stores East LP \n", "\n", " street_addr_1_txt cty_nm st_cd zip_cd naic_cd \\\n", "308 103 W. Polk Warsaw MO 65355.0 452910.0 \n", "1886 702 S.W. 8th St. Bentonville AR 72716.0 45211.0 \n", "3275 1091 Millcreek Road Allentown PA 18106.0 452990.0 \n", "6518 3209 S. Louise Ave Sioux Falls SD 57106.0 452990.0 \n", "8594 231 Eastside Drive Newton MS 39345.0 452910.0 \n", "... ... ... ... ... ... \n", "249003 11250 E. Colonial Dr. Orlando FL 32817.0 4529.0 \n", "255254 3800 U.S. Highway 287 W. Vernon TX 76384.0 452112.0 \n", "257478 1400 S. Lamb Blvd Las Vegas NV 89104.0 452112.0 \n", "262300 Railroad 4 Box 82 Keyser WV 26726.0 452910.0 \n", "274193 950 Hwy. 80 East Clinton MS 39056.0 452112.0 \n", "\n", " naics_code_description case_violtn_cnt cmp_assd_cnt \\\n", "308 Warehouse Clubs and Supercenters 0.0 0.0 \n", "1886 Department Stores 154.0 0.0 \n", "3275 All Other General Merchandise Stores 0.0 0.0 \n", "6518 All Other General Merchandise Stores 1.0 0.0 \n", "8594 Warehouse Clubs and Supercenters 0.0 0.0 \n", "... ... ... ... 
\n", "249003 Other General Merchandise Stores 1.0 0.0 \n", "255254 Discount Department Stores 0.0 0.0 \n", "257478 Discount Department Stores 0.0 0.0 \n", "262300 Warehouse Clubs and Supercenters 1.0 0.0 \n", "274193 Discount Department Stores 0.0 0.0 \n", "\n", " ee_violtd_cnt bw_atp_amt ee_atp_cnt findings_start_date \\\n", "308 0.0 0.00 0.0 7/17/04 \n", "1886 154.0 42844.39 154.0 2/3/02 \n", "3275 1.0 0.00 0.0 3/11/10 \n", "6518 0.0 0.00 0.0 7/23/07 \n", "8594 0.0 0.00 0.0 6/18/04 \n", "... ... ... ... ... \n", "249003 1.0 4622.40 1.0 2/5/05 \n", "255254 0.0 0.00 0.0 1/11/16 \n", "257478 1.0 0.00 0.0 9/16/17 \n", "262300 1.0 0.00 0.0 12/17/16 \n", "274193 1.0 0.00 0.0 3/28/18 \n", "\n", " findings_end_date Group \n", "308 7/14/06 Wal-Mart Stores East LP \n", "1886 1/31/07 Wal-Mart Stores East LP \n", "3275 3/10/12 Wal-Mart Stores East LP \n", "6518 9/12/07 Wal-Mart Stores East LP \n", "8594 6/9/06 Wal-Mart Stores East LP \n", "... ... ... \n", "249003 5/6/05 Wal-Mart Stores East LP \n", "255254 1/10/18 Wal-Mart Stores East LP \n", "257478 12/18/17 Wal-Mart Stores East LP \n", "262300 12/17/17 Wal-Mart Stores East LP \n", "274193 6/4/18 Wal-Mart Stores East LP \n", "\n", "[144 rows x 17 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.Group=='Wal-Mart Stores East LP']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" }, "toc": { "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, 
"toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }