Reverse Image Search Application

Problem description

The code below is a program that searches for news items given as an image plus text. We have run into some errors and have not been able to fix them for weeks; we are stuck and cannot find a way out. The code takes an image and the text associated with it, then searches a number of trusted websites for results; if the results are sufficiently relevant, the item is marked as real news, otherwise it is sent for human verification.

    # -*- coding: utf-8 -*-
    #Importing required libraries
    import os
    import io
    import sys
    from bs4 import BeautifulSoup
    import requests
    import array as arr
    import six as six
    import gensim

    from google.cloud import vision
    from google.cloud import language
    from google.cloud.language import enums
    from google.cloud.language import types

    print("--------------------------------------------------------------------------------")
    print("----- Welcome to the journalists' tool for checking image title relevance ------")
    print("--------------------------------------------------------------------------------")
    print("Please give the system a minute to import the required files...")
    print("--------------------------------------------------------------------------------")

    #Path to the api key to use google Vision and Language API
    credential_path = "AIzaSyDl7rVb5mg17ocd-k_-SHIUC4LluPKdDcE"
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    #Title that is associated to the image in question
    given_title = "PM on Ayodhya Verdict"

    #Full Path to the image in question
    image_path = "C:\\Users\\Apoorv Jain\\Pictures\\Modi1.jpg"

    #given_title = "PM is most followed leader on instagram."
    #image_path = "/Users/apple/Desktop/Modi3.jpg"

    # given_title = "Footage shows missing Saudi journalist Jamal Kha
    # #Examplesshogg."
    # image_path = "/Users/apple/Desktop/embassy.jpg"

    #Loading the google text corpus trained on word2vec
    model = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\\Apoorv Jain\\Desktop\\GoogleNews-vectors-negative300.bin.gz', binary=True, limit = 500000)

    #Credible list of URLs - can be changed by the user
    credible = ['economictimes.', 'huffingtonpost.', 'theprint.', 'thelogicalindian.', 'thequint.', 'altnews.', 'wsj.', 'nypost.', 'nytimes.', 'bbc.', 'reuters.', 'economist.', 'pbs.', 'aljazeera.', 'thewire.', 'theatlantic.', 'theguardian.', 'edition.cnn',
                'cnbc.', 'scroll.in', 'financialexpress.', 'npr.', 'usatoday.', 'snopes.', 'politifact.']


    #---------------------#--------------------#---------------------#--------------------#
    #Function for entity analysis of the titles
    def entity_sentiment_text(text):
        """Detects entity sentiment in the provided text."""
        client = language.LanguageServiceClient()

        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')

        document = types.Document(
            content=text.encode('utf-8'),
            type=enums.Document.Type.PLAIN_TEXT)

        # Detect and send native Python encoding to receive correct word offsets.
        encoding = enums.EncodingType.UTF32
        if sys.maxunicode == 65535:
            encoding = enums.EncodingType.UTF16

        result = client.analyze_entity_sentiment(document, encoding)

        for entity in result.entities:
            print('Mentions: ')
            print(u'Name: "{}"'.format(entity.name))
            for mention in entity.mentions:
                print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
                print(u'  Content : {}'.format(mention.text.content))
                #print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
                #print(u'  Sentiment : {}'.format(mention.sentiment.score))
                #print(u'  Type : {}'.format(mention.type))
            print(u'Salience: {}'.format(entity.salience))
            #print(u'Sentiment: {}\n'.format(entity.sentiment))
            print("--------------------------------------------------------------------------------")

    #---------------------#--------------------#---------------------#--------------------#
    #Function for Google's Cloud Vision API
    def detect_web(path):
        """Detects web annotations given an image."""
        page_urls = []
        client = vision.ImageAnnotatorClient()

        with io.open(path, 'rb') as image_file:
            content = image_file.read()

        image = vision.types.Image(content=content)

        response = client.web_detection(image=image)
        annotations = response.web_detection

        if annotations.best_guess_labels:
            for label in annotations.best_guess_labels:
                print('\nBest guess for the image: {}'.format(label.label))
                print("--------------------------------------------------------------------------------")


        if annotations.pages_with_matching_images:
            print('\n{} Pages with matching images found:'.format(
                len(annotations.pages_with_matching_images)))

            for page in annotations.pages_with_matching_images:
                print('\n\tPage url   : {}'.format(page.url))
                page_urls.append(page.url)

        if annotations.web_entities:
            print('\n{} Web entities found in the image: '.format(
                len(annotations.web_entities)))

            for entity in annotations.web_entities:
                print('\n\tScore      : {}'.format(entity.score))
                print(u'\tDescription: {}'.format(entity.description))

        if annotations.visually_similar_images:
            print('\n{} visually similar images found:\n'.format(
                len(annotations.visually_similar_images)))

            for image in annotations.visually_similar_images:
                print('\tImage url    : {}'.format(image.url))
        print("--------------------------------------------------------------------------------")
        return(page_urls)
    #---------------------#--------------------#---------------------#--------------------#
    #Function to check which URLs belong to credible news sources
    def credible_list(list_of_page_urls):

        c_list = []

        for url in list_of_page_urls:
            for source in credible:
                if url.find(source) > 0:
                    c_list.append(url)
        if c_list == []:
            print("No credible sources have used this image, please perform human verification.")
            print("--------------------------------------------------------------------------------")
            sys.exit(1)
        return(c_list)
    #---------------------#--------------------#---------------------#--------------------#
    #Function to scrape titles off the given URLs
    def titles(credible_from_url_list):

        title_list = []

        for urls in credible_from_url_list:
            if urls != []:
                r = requests.get(urls)
                html = r.content
                soup = BeautifulSoup(html, 'html.parser')
                title_list.append(soup.title.string)

        return(title_list)

    #---------------------#--------------------#---------------------#--------------------#
    #Function to print the scraped titles
    def print_article_title(title_list):
        print("Credible article titles which use the same image: ")
        print("--------------------------------------------------------------------------------")
        for title in title_list:
            print(title)
            print("--------------------------------------------------------------------------------")
    #---------------------#--------------------#---------------------#--------------------#
    #Function to call google's language API for entity analysis
    def entity_analysis(title_list):
        for title in title_list:
            entity_sentiment_text(title)

    #---------------------#--------------------#---------------------#--------------------#
    #Function to compute the WM distances between titles and associated title and the average distance
    def wmdist(title_list):
        print("Word Mover's Distance for Titles:")
        print("--------------------------------------------------------------------------------")
        distances = []
        for title in title_list:
            dist = model.wmdistance(given_title, title) #determining WM distance
            distances.append(dist)
            #distance = model.WmdSimilarity(given_title, title)

        sum_dist = 0
        for distance in distances:
            sum_dist = sum_dist + distance
            print ('distance = %.3f' % distance)
            print("--------------------------------------------------------------------------------")

        avg_dist = sum_dist/len(distances)
        print("Average Distance: {}".format(avg_dist))
        print("--------------------------------------------------------------------------------")
        return(avg_dist)

    #---------------------#--------------------#---------------------#--------------------#
    #Function to decide whether human verification is required
    def human_ver(avg_dist):
        if(avg_dist >= 1.0):
            print("The title and image are flagged. Please use human verification!")
            print("--------------------------------------------------------------------------------")

        else:
            print("The title associated with this image seems to be right. Human verification is NOT required.")
            print("--------------------------------------------------------------------------------")

    #---------------------#--------------------#---------------------#--------------------#
    #Main function to call the rest of the above functions
    def main():
        list_of_page_urls = []
        credible_from_url_list = []
        title_list = []
        list_of_page_urls = detect_web(image_path)
        credible_from_url_list = credible_list(list_of_page_urls)
        title_list = titles(credible_from_url_list)
        print_article_title(title_list)
        entity_analysis(title_list)
        avg_dist = wmdist(title_list)
        human_ver(avg_dist)

    #---------------------#--------------------#---------------------#--------------------#

    if __name__ == "__main__":
        main()

The error is: screenshot

python image-processing beautifulsoup google-cloud-vision google-vision
1 Answer

I am not sure, since the actual code cannot be run without the files or credentials.

Try replacing this line:

model = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\\Apoorv Jain\\Desktop\\GoogleNews-vectors-negative300.bin.gz', binary=True, limit = 500000)

with this line:

model = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\\Apoorv Jain\\Desktop\\GoogleNews-vectors-negative300.bin', binary=True, limit = 500000)
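If that alone does not help, it may be worth checking that the decompressed model file actually exists at the expected location before loading it, so a missing or misnamed file fails with a clear message. A minimal sketch, assuming the same (hypothetical) path as in the question:

    import os
    import gensim

    # Hypothetical path - adjust to wherever the decompressed model actually lives.
    model_path = 'C:\\Users\\Apoorv Jain\\Desktop\\GoogleNews-vectors-negative300.bin'

    # Fail early with a clear message instead of a cryptic loader error.
    if not os.path.isfile(model_path):
        raise FileNotFoundError('word2vec model not found at: ' + model_path)

    model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True, limit=500000)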