MountainProject/MPAnalyzer.py at master · rankrh/MountainProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 17 19:51:21 2018

@author: Bob
"""

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sqlalchemy import create_engine
from config import config
import pandas as pd
import numpy as np
import unidecode
import psycopg2
import re
import click
from tqdm import tqdm
from mpproj.routefinder.StyleInformation import *


def MPAnalyzer():
    '''Finishes cleaning routes using formulas that require information about
    the whole database.

    The Bayesian rating system, route clustering algorithm and calculation of
    TFIDF values require information about all routes, and not just one that is
    of interest.  Therefore, this file must be run after all data collection
    has finished. This function is a handler for six functions:
        - bayesian_rating: Calculates the weighted quality rating for each
            route
        - route_clusters: Groups routes together based on geographic distance
        - idf: Calculates inverse-document-frequency for words in the route
            descriptions
        - tfidf: Calclates term-frequency-inverse-document-frequency for words
            in route descriptions
        - normalize: Normalizes vectors for TFIDF values
        - find_route_styles: Compares routes to the ideal to help categorize

    Returns:
        Updated SQL Database
    '''

    print('Connecting to the PostgreSQL database...', end='')
    engine = create_engine(
        'postgresql+psycopg2://postgres:postgres@localhost:5432/routes')
    params = config.config()
    conn = psycopg2.connect(**params)
    cursor = conn.cursor()
    print('Connected')
    tqdm.pandas()

    def tfidf(min_occur=0.001, max_occur=0.9):
        ''' Calculates Term-Frequency-Inverse-Document-Frequency for a body of
        documents.

        Term-Frequency-Inverse-Document-Frequency(TFIDF) is a measure of the
        importance of words in a body of work measured by how well they help to
        distinguish documents.  Words that appear frequently in documents score
        high on the Term-Frequency metric, but if they are common across the
        corpus, they will have low Inverse-Document-Frequency scores.  TFIDF
        can then be used to compare documents to each other, or, in this case,
        to documents with known topics.

                                   TFIDF = TF * IDF

                          TF = Term Frequency
                          IDF = Inverse Document Frequency

        Args:
            min_occur(int): The minimum number of documents that a word has to
                appear in to be counted. Included to ignore words that only
                appear in a few documents, and are therefore not very useful
                for categorization.
            max_occur(int): The maximum number of documents that a word can
                appear in to be counted.  This is included to ignore highly
                common words that don't help with categorization.

        Returns:
            routes(pandas Dataframe): Holds route-document information,
                including term-frequency, inverse-document-frequency, TFIDF,
                and normalized TFIDF values
            Updated SQL Database: Updates the TFIDF table on main DB with the
                routes dataframe
        '''

        print('Getting number of routes', end=' ', flush=True)
        cursor.execute('SELECT COUNT(route_id) FROM Routes')
        num_docs = cursor.fetchone()[0]
        print(num_docs)

        print('Getting route text data', flush=True)
        min_occur *= num_docs
        max_occur *= num_docs
        query = 'SELECT route_id, word, tf FROM Words'
        routes = pd.read_sql(query, con=conn, index_col='route_id')

        print('Removing non-essential words.', flush=True)
        routes = routes.groupby('word', group_keys=False)
        routes = routes.progress_apply(
            weed_out,
            min_occur=min_occur,
            max_occur=max_occur)\
                       .set_index('route_id')

        print('Getting IDF', flush=True)
        routes = routes.groupby('word', group_keys=False)
        routes = routes.progress_apply(
            idf,
            num_docs=num_docs).set_index('route_id')

        print('Calculating TFIDF', flush=True)
        routes['tfidf'] = routes['tf'] * routes['idf']

        print('Normalizing TFIDF values', flush=True)
        routes = routes.groupby(routes.index, group_keys=False)
        routes = routes.progress_apply(lambda x: normalize('tfidf', table=x))

        print('Writing TFIDF scores to SQL', flush=True)
        routes = routes.set_index('route_id')
        routes = routes[['word', 'idf', 'tfidfn']]

        # This will take a long time
        routes.to_sql('TFIDF', con=engine, if_exists='replace', chunksize=1000)

    def weed_out(table, min_occur, max_occur):
        '''Removes words that are too common or too rare

        Args:
            table(Series): Instances of a word
            min_occur: Fewest number acceptable
            max_occur: Greatest number acceptable

        Returns:
            table: updated series'''

        if min_occur < len(table) < max_occur:
            return table.reset_index()

    def idf(word, num_docs):
        ''' Finds inverse document frequency for each word in the selected
        corpus.

        Inverse document frequency(IDF) is a measure of how often a word
        appears in a body of documents.  The value is calculated by:

                            IDF = 1 + log(N / dfj)

             N = Total number of documents in the corpus
             dfj = Document frequency of a certain word, i.e., the number of
                 documents that the word appears in.

        Args:
            word(pandas dataframe): A dataframe composed of all instances of a
                word in a corpus.
            num_docs(int): The total number of documents in the corpus

        Returns:
            word(pandas dataframe): The same document with the calculated IDF
                score appended.
        '''

        word['idf'] = 1 + np.log(num_docs / len(word))
        return word.reset_index()

    def normalize(*columns, table, inplace=False):
        ''' Normalizes vector length.

        Vector values must be normalized to a unit vector to control for
        differences in length.  This process is done by calculating the length
        of a vector and dividing each term by that value.  The resulting
        'unit-vector' will have a length of 1.

        Args:
            table(pandas dataframe): Table hosting vector to be normalized
            *columns(str): Names of columns to be normalized
            inplace(Boolean, default = False):
                If inplace=False, adds new columns with normalized values.
                If inplace=True, replaces the columns.

        Returns:
            table(pandas dataframe): Updated dataframe with normalized values.
        '''
        for column in columns:
            if not inplace:
                column_name = column + 'n'
            elif inplace:
                column_name = column

            length = np.sqrt(np.sum(table[column] ** 2))
            table[column_name] = table[column] / length
        return table.reset_index()

    def fill_null_loc():
        """Fills empty route location data.

        Not all routes have latitude and longitude coordinates, so we must use
        the coordinates of their parent area instead as a rough estimate.  This
        function first grabs all routes with no data, then fills in the data
        with the lowest level area it can, going up as many areas as needed
        until it finds one with proper coordinates.

        Returns:
            Updated SQL Database
            """
        print('Filling in empty locations', flush=True)
        # Select a route without location data
        cursor.execute('''
            SELECT route_id, area_id, name FROM Routes
            WHERE latitude is Null OR longitude is Null
            LIMIT 1''')
        route = cursor.fetchone()

        while route is not None:
            # Route ID
            rid = route[0]
            # From ID
            fid = route[1]
            name = route[2]
            print(f'Finding location information for {name}')

            # Loops until it finds proper data
            lat, long = None, None
            while lat == None or long == None:
                # Gets latitude and longitude from parent area
                cursor.execute(f'''
                    SELECT
                        latitude,
                        longitude,
                        from_id
                    FROM Areas
                    WHERE id = {fid}
                    LIMIT 1''')
                loc = cursor.fetchone()
                lat, long = loc[0], loc[1]
                fid = loc[2]
            # Updates DB
            cursor.execute(f'''
                UPDATE Routes
                SET
                    latitude = {lat},
                    longitude = {long}
                WHERE route_id = {rid}''')
            conn.commit()
            cursor.execute('''
                SELECT
                    route_id,
                    area_id,
                    name
                FROM Routes
                WHERE
                    latitude is Null
                    OR longitude is Null
                LIMIT 1''')
            route = cursor.fetchone()

    def route_clusters(routes):
        ''' Clusters routes into area groups that are close enough to travel
        between when finding climbing areas.

        Routes can be sorted into any number of sub-areas below the 'region'
        parent. By clustering the routes based on latitude and longitude
        instead of the name of the areas and parent areas, the sorting
        algorithm will be able to more accurately determine which routes are
        close together. This function uses SciKit's Density Based Scan
        clustering algorithm. The algorithm works by grouping points together
        in space based on upper-limits of distance and minimum numbers of
        members of a cluster. More generally, the algorithm first finds the
        epsilon neighborhood of a point. This is the set of all points whose
        distance from a given point is less than a specified value epsilon.
        Then, it finds the connected core-points, which are the points that
        have at least the minimum number of connected points in its
        neighborhood. Non-core points are ignored here.  Finally, the
        algorithm assigns each non-core point to a nearby cluster if is within
        epsilon, or assigns it to noise if it is not.

        The advantages of this is that the scan clusters data of any shape, has
        a robust response to outliers and noise, and that the epsilon and min
        points variables can be adjusted.

        This function returns the label/name for the cluster that a route
        appears in, as well as the number of other routes in that same cluster.
        This will allow the sorting algorithm to more heavily weight routes
        that are clustered near others.

        Args:
            routes(pandas df): Pulled from cleaned route SQL DB with columns:
                - route_id (int, unique): Unique route identifies
                - latitude (float)
                - longitude (float)
        Returns:
            routes(pandas df): Updated with clustered area group number:
                - route_id (int, unique): Unique route identifies
                - area_group (int): Cluster id
        '''

        # Route location
        lats = routes['latitude']
        longs = routes['longitude']
        locs = []
        for x in range(len(lats)):
            locs.append((lats.iloc[x], longs.iloc[x]))

        # Converted into df
        locs = StandardScaler().fit_transform(locs)
        # Max distance in latitude
        epsilon = 0.0007
        # Min number of routes in a cluster
        min_routes = 3
        # Distance baced scan
        db = DBSCAN(eps=epsilon, min_samples=min_routes).fit(locs)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        # Cluster names
        labels = db.labels_
        unique, counts = np.unique(labels, return_counts=True)
        counts = dict(zip(unique, counts))
        # Number of routes in the same cluster as a given route
        area_counts = []

        for label in labels:
            if label >= 0:
                # Counts number of routes
                area_counts.append(counts[label])
            # Areas are given a cluster id of -1 if the are not part of a
            # cluster
            elif label == -1:
                # If so, there is only 1 route in their 'cluster'
                area_counts.append(1)

        routes['area_group'] = labels
        routes['area_counts'] = area_counts
        routes = routes[['area_group', 'area_counts']]
        return routes

    def bayesian_rating(routes):
        ''' Updates route quality with weighted average.

        The Bayesian average rating system helps to mitigate the effects of
        user ratings for routes that only have a few reviews.  The weighted
        rating works by first finding the average rating for all routes, and
        using that to bring low-rated routes up and high-rated routes down.
        The result - the Bayes rating - is an updated rating weighted by the
        average number of stars across all routes.  The weight decreases
        according to the number of votes cast.

                Bayesian rating = (r * v) + (a * 10) / (v + 10)

                    r = Route rating
                    v = Number of votes
                    a = Average rating across all routes

        Essentially, the function gives each route phantom-users who all give
        the route the average score.  For routes with a high number of ratings
        the effect of the additional phantom users is minimal, but for routes
        with only one or two actual user ratings, the effect is large.  This
        keeps 4-star rated routes from dominating the sorting algorithm if they
        only have a few votes, and helps promote unrated routes that may be of
        high quality.

        Args:
            routes(pandas df): Pulled from cleaned route SQL DB with columns:
                - route_id (int, unique): Unique route identifiers
                - stars (float): Raw average rating
                - votes (int): Number of user ratings
        Returns:
            routes(pandas df): Updated dataframe with Bayes rating and columns:
                - route_id (int, unique): Unique route identifies
                - bayes (float): Weighted average rating
        '''

        # Average rating of all routes
        stars = pd.read_sql('SELECT stars FROM Routes', con=conn)
        avg_stars = np.mean(stars)['stars']
        # Weighted Bayesian rating
        routes['bayes'] = round((((routes['votes'] * routes['stars'])
                            + avg_stars * 10) / (routes['votes'] + 10)), 1)
        return routes['bayes'].to_frame()

    def find_route_styles(*styles, path='Descriptions/'):
        ''' Returns weighted scores that represent a route's likelihood of
        containing any of a series of features, e.g., a roof, arete, or crack.

        Route names, descriptions, and user comments can indicate the presence
        of rock and route features. Term-Frequency-Inverse-Document-Frequency
        (TFIDF) values for the blocks of text gathered for each route can be
        compared to 'archetypal' routes to glean insight into these features.
        This comparison is further clarified using Bayesian statistics to
        measure the credibility of the comparision, and is then adjusted to
        reflect that.  At present, each route is compared against archetypal
        routes with the following features:

            Aretes - A sharp vertical edge of a block, cliff or boulder
            Chimney - A large vertical crack that a climber can fit in and
                climb using opposing pressure
            Crack - Smaller cracks ranging from finger-sized to a few inches
                wide (off-width)
            Slab - Low-angle rock faces (less than vertical)
            Overhang - Roofs, caves or more-than-vertical rock faces

        More styles or archetypes can be added in the future by creating .txt
        files and adding them to the 'Descriptions' sub-folder, then adding the
        style to the *styles argument.

        Args:
            *styles(str): The name of the files that each route will be
                compared against.
            path(str): Folder location of the Database

        Returns:
            Updated SQL Database with weighted route scores
        '''

        def text_splitter(text):
            '''Splits text into words and removes punctuation.

            Once the text has been scraped it must be split into individual
            words for further processing.  The text is all put in lowercase,
            then stripped of punctuation and accented letters. Tokenizing helps
            to further standardize the text, then converts it to a list of
            words. Each word is then stemmed using a Porter stemmer.  This
            removes suffixes that make similar words look different, turning,
            for example, 'walking' or 'walked' into 'walk'.  Stop words are
            also filtered out at this stage.

            Args:
                text(str): Single string of text to be handled

            Returns:
                text(list): List of processed words.'''

            # Converts to lowercase
            text = text.lower()
            # Strips punctuation and converts accented characters to unaccented
            text = re.sub(r"[^\w\s]", '', text)
            text = unidecode.unidecode(text)
            # Tokenizes words and returns a list
            text = word_tokenize(text)
            # Remove stopwords
            stop_words = set(stopwords.words('english'))
            # Stems each word in the list
            ps = PorterStemmer()
            text = [ps.stem(word) for word in text if word not in stop_words]

            return text

        def archetypal_tf(*styles, path):
            ''' Returns term-frequency data for descriptions of archetypal
            climbing routes and styles.  This will be used later to categorize
            routes.

                            Term-Frequency = t / L

                    t = Number of appearances for a word in a document
                    L = Number of total words in the document

            Args:
                *styles(str): Name of .txt file to parse.  Can either be the
                    plain name or have the .txt suffix
                path(str): Path to folder with route descriptions
            Returns:
                tf.csv(CSV File): CSV File of term frequency for each style.
                    This will help determine if TF values are what is expected
                    when adding new styles.
                archetypes(Pandas Dataframe): Holds words term-frequency values
                    for words in the files.'''

            # Initializes Dataframe
            archetypes = pd.DataFrame()
            for style in styles:
                # Formats suffix
                if style.endswith('.txt'):
                    # Opens .txt file
                    try:
                        file = open(path + style)
                        style = style[:-4]
                    # Returns errors
                    except OSError as e:
                        return e
                else:
                    try:
                        file = open(path + style + '.txt')
                    except OSError as e:
                        return e


                # Creates single block of text
                text = ''
                for line in file:
                    text += line
                # Splits and processes text
                text = text_splitter(text)

                # Length of document in words
                length = len(text)
                # Counts appearances of each word
                text = pd.DataFrame({'word': text})['word']\
                         .value_counts()\
                         .rename('counts')\
                         .to_frame()

                # Calculates Term-Frequency
                text[style] = text['counts'].values / length
                text = text[style]

                # Creates master Dataframe of Termfrequency data for each style
                archetypes = pd.concat([archetypes, text], axis=1, sort=True)
            archetypes.to_csv(path + 'TF.csv')
            return archetypes

        def archetypal_idf(words):
            ''' Findes inverse document frequency (IDF) for each word in the
            archetypal style documents.

            The archetypal documents should not be included in the calculation
            of IDF values, so this function just pulls the IDF values from the
            database after they are calculated. IDF is a measure of how often a
            word appears in a body of documents. The value is calculated by:

                                IDF = 1 + log(N / dfj)

                 N = Total number of documents in the corpus
                 dfj = Document frequency of a certain word, i.e., the number
                     of documents that the word appears in.

            Args:
                word(list): All unique words in all the archetype documents

            Returns:
                archetypes(pandas dataframe): IDF values for each word pulled
                    from the Database.'''

            # Formats query to include list of unique words
            query = f'''
                SELECT
                    DISTINCT(word),
                    idf
                FROM "TFIDF"
                WHERE word IN {words}'''
            # Pulls SQL data into Pandas dataframe
            archetypes = pd.read_sql(query, con=conn, index_col='word')

            return archetypes

        def get_routes(route_ids=None):
            '''Creates Pandas Dataframe of normalized TFIDF values for each
            word in each route description.

            Args:
                route_ids: Optional.  Allows for a slice to be parsed.
            Returns:
                routes(Pandas Series): MultiIndex series with indexes
                'route_id' and 'word' and column 'tfidfn' - Normalized TFIDF'''

            # Pulls route_id, word, and normalized TFIDF value
            if route_ids is None:
                query = '''
                    SELECT
                        route_id,
                        word,
                        tfidfn
                    FROM "TFIDF"'''
            else:
                route_ids = tuple(route_ids)
                query = f'''
                    SELECT
                        route_id,
                        word,
                        tfidfn
                    FROM "TFIDF"
                    WHERE route_id in {route_ids}'''

            # Creates Pandas Dataframe
            routes = pd.read_sql(
                query,
                con=engine,
                index_col=['route_id', 'word'])
            routes = routes.squeeze()

            return routes

        def get_word_count(route_ids=None):
            '''Finds length of route description in words.

            Args:
                route_ids: Optional. Allows for a slice to be parsed
            Returns:
                word_count(Pandas dataframe): Dataframe with index route_id and
                    column 'word_count' - length of a route description in
                    words'''

            # Pulls route_id and word_count for each route
            if route_ids is None:
                query = 'SELECT route_id, word_count FROM Words'
            else:
                route_ids = tuple(route_ids)
                query = f'''
                    SELECT
                        route_id,
                        word_count
                    FROM Words
                    WHERE route_id in {route_ids}'''

            # Calculates document length
            word_count = pd.read_sql(query,
                                     con=conn,
                                     index_col='route_id').groupby(level=0)

            # We will take the log of the word count later, so we cannot leave
            # zeroes in the series
            word_count = word_count.progress_apply(lambda x: np.sum(x) + 0.01)
            word_count.fillna(0.01, inplace=True)

            return word_count

        def cosine_similarity(route, archetypes):
            '''Compares routes to archetypes to help categorize route style.

            Cosine similarity is the angle between two vectors.  Here, the
            normalized TFIDF values for each word in the route description and
            archetype documents serve as the coordinates of the vector. Finding
            the cosine similarity is therefore simply their dot-product.

                    Cosine Similarity = Σ(ai * bi)

                    ai = TFIDF for a word in the route description
                    bi = TFIDF for the same word in the archetype document.

            The similarity will range between 0 and 1, 1 being identical and 0
            having no similarity.

            Args:
                route(Pandas dataframe): MultiIndex frame with indexes route_id
                    and word and columns normalized TFDIF values
                archetypes(Pandas dataframe): Frame with index word and columns
                    normalized TFIDF values.

            Returns:
                terrain(Pandas dataframe): Frame with columns for each style,
                    holding cosine simlarity values.'''

            try:
                rid = route.index[0][0]
            except:
                return

            route = archetypes.multiply(route, axis=0)
            terrain = pd.DataFrame(index=[rid])

            for column in route:
                cosine_sim = np.sum(route[column])
                terrain[column] = cosine_sim

            return terrain


        def score_routes(*styles, word_count, path, routes):
            '''Gets TF, IDF data for archetypes, then finds TFIDF and cosine
            similarity for each route/style combination.

            Finding the raw cosine similarity scores requires the functions
            archetypal_tf, archetypal_idf, and normalize.  This function helps
            organize the retrieval and processing of the data for those functions.

            Args:
                word_count(Pandas dataframe): Dataframe with index route_id and
                    column 'word_count' - length of a route description in
                    words
            Returns:
                TFIDF.csv(CSV file): TFIDF for each word in each style.  This
                    helps users determine if the TFIDF values are what they
                    would expect when adding new styles.
                routes(Pandas dataframe): Holds cosine similarity for each
                    route/style combination'''

            if click.confirm('Rescore archetypes?'):
                # Gets Term-Frequency data for words in archetype documents
                archetypes = archetypal_tf(*styles, path=path)
                # Gets list of unique words in archetype documents
                words = tuple(archetypes.index.tolist())
                # Gets IDF Values for those words from the Database
                idf = archetypal_idf(words)
                # Selects words for archetype documents that have a correpsonding
                # IDF value in the database
                archetypes = archetypes[archetypes.index.isin(idf.index)]

                # Multiplies TF by IDF values to get TFIDF score
                archetypes = archetypes.mul(idf['idf'], axis=0)
                # Normalizes TFIDF scores
                archetypes = normalize(
                    table=archetypes,
                    inplace=True,
                    *styles)
                archetypes = archetypes.rename(
                    columns={'index': 'word'}
                    ).set_index('word')

                # Writes to CSV
                archetypes.to_csv(path + 'TFIDF.csv')

            archetypes = pd.read_csv(path + 'TFIDF.csv', index_col='word')

            # Groups words by route_id, then finds cosine similarity for each
            # route-style combination
            routes = routes.groupby('route_id').progress_apply(
                cosine_similarity,
                archetypes=archetypes)
            # Reformats routes dataframe
            routes.index = routes.index.droplevel(1)
            routes = pd.concat([routes, word_count], axis=1, sort=False)
            routes.fillna(0, inplace=True)

            return routes

        def weighted_scores(*styles, table, inplace=False):
            '''Weights cosine similarity based on credibility.

            The cosine similarity between a route and a style archetype
            measures how close the two documents are.  Depending on the score
            and the word count of the route, however, this score can be more or
            less believable.  Using Bayesian statistics helps weight the scores
            based on the credibility.

            We can plot word count and cosine similarity in two dimensions.
            Normalizing each so that the maximum value is one results in a
            plane with four edge cases:

                            cosine similarity | word count
                                    0               0
                                    1               0
                                    0               1
                                    1               1

            When both word count and cosine similarity is high, the
            believability of the cosine score is at its highest.  This is
            analagous to a route that scores well with the 'overhang' document,
            therefore mentioning words like 'overhang' or 'roof' frequently,
            that also has a lot of words.

            If the word count is high and the cosine similarity is low the
            believability of the score is high, but not as high as before.
            This is analagous to a route that never mentions words associated
            with 'overhang' despite a high word count.  We can be reasonably
            sure in this case that the route does not have an overhang.

            If the word count of a route is low but the cosine score is high,
            we can be reasonably sure that the score is somewhat accurate. This
            is a result of a route called, for instance, 'Overhang Route'.
            Despite the low word count, it is highly likely that the route has
            an overhang on it.

            Finally, for routes that have both low word count and cosine score,
            we have no way to be sure of the presence (or absence) of a
            feature.  In this case, our best guess is that the route is at
            chance of featuring a given style of climbing.

            If we chart word count, cosine similarity, and the credibility of
            the cosine score, we are left with a cone with a point at the
            origin, reaching up at a 45 degree angle along the credibility (z)
            axis. Each route will exist somewhere on the surface of the cone.
            To make use of this, we need to calculate this position. The height
            to the cone gives us the credibility, and can be calculated with:

                    Credibility = sqrt(W ** 2 + C ** 2) * tan(45 degrees)

            Since tan(45 degrees) is 1, this simplifies to:

                            Credibility = sqrt(W ** 2 + C ** 2)

                               W = Word count
                               C = Cosine similarity

            The credibility of a route's score can be fed back into the score
            to find a weighted route score.  As the word count and cosine score
            get close to zero, the average score should play more of a role in
            the outcome. Therefore:


                Score = C * sqrt(W ** 2 + C ** 2) + (1 - C)(1 - W) * Cm

                                W = word count
                                C = cosine Similarity
                                Cm = Average cosine similarity across routes

            Finally, the scores are processed with a Sigmoid function,
            specifically the logistic function.

                            f(x) = L / 1 + e^(-k(x-x'))

                                L = upper bound
                                e = Euler's constant
                                k = logistic growth rate
                                x' = Sigmoid midpoint

            By manipulating the constants in this function, we can find a
            continuous threshold-like set of values that are bounded by 0 and
            1.  The midpoint of the threshold is the mean value of the scores
            plus one standard devaition.  Therefore, the function used here is:

                            f(x) = 1 / (1 + e^(-100(x - x'))

                                x' = mean + sigma
                                e = Euler's constant


            Args:
                *styles(str): Names of the style archetypes
                table(Pandas dataframe): Master dataframe of cosine scores for
                    each route
                inplace(Boolean, default = False):
                    If inplace=False, adds new columns with weighted values.
                    If inplace=True, replaces the columns.

            Returns:
                Updated SQL Database'''

            # Gets name for the columns to write data
            if inplace:
                count = 'word_count'
            else:
                count = 'word_count_norm'

            # As the word count increases, the credibility increases as a
            # logarithmic function
            table[count] = np.log10(table['word_count'])

            table_min = table[count].min()
            table_max = table[count].max()
            table_diff = table_max - table_min

            table[count] = (table[count].values - table_min) / table_diff

            # Gets weighted scores for each style
            for style in styles:
                # Stores name to write data on
                if inplace:
                    column_name = style
                else:
                    column_name = style + '_weighted'


                # Find average cosine similarity across routes
                style_avg = table[style].mean()
                # Calculate weighted rating
                table[column_name] = (
                    table[style].values * np.sqrt(
                        table[style].values ** 2 + table[count].values ** 2)
                    + (1 - table[count].values) * (1 - table[style].values)
                    * style_avg)

                threshold = table[column_name].mean() + table[column_name].std()
                # Calculates final score using Sigmoid function
                table[column_name] = (
                    1 / (1 + np.e ** (-100 *
                                        (table[column_name]
                                    - threshold))))

            return table

        # Run functions

        print('Getting route information')
        routes = get_routes()

        print('Getting word count')
        word_count = get_word_count()

        print('Scoring routes')
        routes = score_routes(
            *styles,
            word_count=word_count,
            path=path,
            routes=routes)

        print('Getting weighted scores')
        routes = weighted_scores(*styles, table=routes, inplace=True)

        # Collects the full database
        query = 'SELECT * FROM Routes'
        all_routes = pd.read_sql(query, conn, index_col='route_id')

        # Combines columns in the routes dataframe with the full database if
        # they don't already exist in the full database
        updated = pd.concat(
            [routes[
                ~routes.index.isin(all_routes.index)],
                all_routes],
            sort=False)

        updated.update(routes)

        updated.rename_axis('id', inplace=True)

        for i in range(5):
            feature = terrain_types[i]
            other_features = terrain_types[:i] + terrain_types[i+1:]
            other_features = updated[other_features]
            updated[feature+'_diff'] = updated[feature] * (
                                            updated[feature]
                                            - other_features.sum(axis=1))

        # Write to Database
        updated.to_sql(
            'Routes_scored',
            con=engine,
            if_exists='replace')

        return

    def get_links(route, route_links):
        """Gets links between a route and all parent areas

        Args:
            route(Series): Route information
            route_links(Series): Links between areas
        Returns:
            Updated SQL"""


        route_id = route.name

        parents = [route.squeeze()]
        base = False

        while not base:
            try:
                grandparent = route_links.loc[parents[-1]]['from_id']
                parents.append(grandparent)
            except:
                base = True

        parents = pd.DataFrame({
            'id': route_id,
            'area': parents,
            })

        parents = parents.dropna(how='any')

        parents.to_sql(
            'route_links',
            con=engine,
            if_exists='append',
            index=False)


    def get_children(area):
        """Gets area children for all areas

        Args:
            area(Series): area information

        Returns:
            Updated SQL"""

        # Checks if child area in area DB
        try:
            children = area_links.loc[area]
        # If not, this is a base level area
        except:
            return

        if type(children) is np.int64:

            children = pd.Series(
                    data=children,
                    index=[area],
                    name='id')
            children.index.name = 'from_id'

        for child in children:
            grandchildren = get_children(child)
            if grandchildren is not None:
                grandchildren.index = [area] * len(grandchildren)
                children = pd.concat([children, grandchildren])

        return children

    def get_area_details(*styles):
        """Gets route data for each area and creates a summary.

        Args:
            styles: terrain styles

        Returns:
            Updated SQL
        """

        routes = pd.read_sql("""
           SELECT *
           FROM routes_scored""",
           con=engine,
           index_col='id')

        terrain_info = routes[terrain_types]

        average_stars = routes.stars.mean()
        other = ['alpine', 'pitches', 'length', 'danger_conv']