[AI] 使用Azure AI search(搜尋服務)

2024-06-18

Azure目前有提供文件內容搜尋服務

Azure搜尋服務可以依據你上傳的文件來回答問題,

其背後的運作為: 將你的文件轉為純文字=>文本依字數切割成段=>將各段進行embedding=>GPT收到問題時, 找尋最相似的資料參考以回答問題, 並列出參考段

目前此作法有在玩生成式AI皆會使用, AI search將這些包成一個服務,

若要使用此服務, 需要先建立以下Azure資源:

搜尋服務
Azure OpenAI, 並於 Azure OpenAI Studio 部署 text-embedding-ada-002
儲存體帳戶, 並建立容器, 於容器內上傳AI必須參考的文件

在本機Python環境執行以下:

pip install azure-keyvault-secrets azure-core azure-search-documents azure-search-documents==11.6.0b1 azure-storage-blob azure-identity

以下Python語法用來建立AI Search內的索引等功能:

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient

#設變數, 至Azure上述所建的3項資源中找金鑰copy
#Azure AI Search
endpoint = r'https://xxx.search.windows.net'
key =  '...'#金鑰
credential = AzureKeyCredential(key)
index_name = "..." #請變更為您的Index名稱
index_client = SearchIndexClient(endpoint=endpoint, credential=credential) 
indexer_client = SearchIndexerClient(endpoint, credential) 

#Azuer Blob Storage
blob_connection_string = r'...'#連線字串
blob_container_name = "..." #請變更文檔放置的container name

#Azure OpenAI
azure_openai_endpoint = r'https://....openai.azure.com/'
azure_openai_key = '...'
azure_openai_embedding_deployment = "embedding" #只能用 text-embedding-ada-002 版本
chunk_size_char = 1500 # #單位為Characters,可依需求變更 (中文字約1:1, 英文約4:1或5:1) 
chunk_overlap_char = 150 



from azure.search.documents.indexes.models import (SearchIndexerDataContainer,SearchIndexerDataSourceConnection, )

# Create a data source 

container = SearchIndexerDataContainer(name=blob_container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container
)
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)


print(f"Data source '{data_source.name}' created or updated")

from azure.search.documents.indexes.models import (  
    SimpleField,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex,
    )
    
# Create a search index  
fields = [  
    SearchableField(name="id", type=SearchFieldDataType.String,key=True,  sortable=True,  facetable=True, analyzer_name="keyword"),  
    SimpleField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),  
    SearchableField(name="file_name", type=SearchFieldDataType.String,filterable=True), 
    SearchableField(name="doc_url", type=SearchFieldDataType.String),  
    SearchableField(name="contents", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False,analyzer_name="zh-Hant.lucene"), #若文件確定只有中文,可指定繁中語系 
    SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),  
]  
  
# Configure the vector search configuration  
vector_search = VectorSearch(  
    algorithms=[  
        HnswAlgorithmConfiguration(  
            name="myHnsw",  
            parameters=HnswParameters(  
                m=4,  
                ef_construction=400,  
                ef_search=500,  
                metric=VectorSearchAlgorithmMetric.COSINE,  
            ),  
        ),  
        ExhaustiveKnnAlgorithmConfiguration(  
            name="myExhaustiveKnn",  
            parameters=ExhaustiveKnnParameters(  
                metric=VectorSearchAlgorithmMetric.COSINE,  
            ),  
        ),  
    ],  
    profiles=[  
        VectorSearchProfile(  
            name="myHnswProfile",  
            algorithm_configuration_name="myHnsw",  
            vectorizer="myOpenAI",  
        ),  
        VectorSearchProfile(  
            name="myExhaustiveKnnProfile",  
            algorithm_configuration_name="myExhaustiveKnn",  
            vectorizer="myOpenAI",  
        ),  
    ],  
    vectorizers=[  
        AzureOpenAIVectorizer(  
            name="myOpenAI",  
            kind="azureOpenAI",  
            azure_open_ai_parameters=AzureOpenAIParameters(  
                resource_uri=azure_openai_endpoint,  
                deployment_id=azure_openai_embedding_deployment,  
                api_key=azure_openai_key,  
            ),  
        ),  
    ],  
)  
  
semantic_config = SemanticConfiguration(  
    name="my-semantic-config",  
    prioritized_fields=SemanticPrioritizedFields(  
        keywords_fields=[SemanticField(field_name="file_name")],
        content_fields=[SemanticField(field_name="contents")]  
    ),  
)  

  
# Create the semantic search with the configuration  
semantic_search = SemanticSearch(configurations=[semantic_config])  
  
# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)  
try:    
    index_client.delete_index(index)
    print ('Index Deleted')
except Exception as ex:
    print (f"OK: Looks like index does not exist")

result = index_client.create_or_update_index(index)
print(f"{result.name} created")  

from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    )


# Create a skillset  
skillset_name = f"{index_name}-skillset"  
  
split_skill = SplitSkill(  
    description="Split skill to chunk documents",  
    text_split_mode="pages",  
    context="/document",  
    maximum_page_length=chunk_size_char,  #單位為Characters,可依需求變更 (中文字約1:1, 英文約4:1或5:1) 
    page_overlap_length=chunk_overlap_char,   
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/content"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="textItems", target_name="pages")  
    ],  
)  
  
embedding_skill = AzureOpenAIEmbeddingSkill(  
    description="Skill to generate embeddings via Azure OpenAI",  
    context="/document/pages/*",  
    resource_uri=azure_openai_endpoint,  
    deployment_id=azure_openai_embedding_deployment,  
    api_key=azure_openai_key,  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/pages/*"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="embedding", target_name="content_vector")  
    ],  
)  
  
index_projections = SearchIndexerIndexProjections(  
    selectors=[  
        SearchIndexerIndexProjectionSelector(  
            target_index_name=index_name,  
            parent_key_field_name="parent_id",  
            source_context="/document/pages/*",  
            mappings=[  
                InputFieldMappingEntry(name="contents", source="/document/pages/*"),  
                InputFieldMappingEntry(name="content_vector", source="/document/pages/*/content_vector"),  
                InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"),  
                InputFieldMappingEntry(name="doc_url", source="/document/metadata_storage_path"), 
            ],  
        ),  
    ],  
    parameters=SearchIndexerIndexProjectionsParameters(  
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS  
    ),  
)  
  
skillset = SearchIndexerSkillset(  
    name=skillset_name,  
    description="Skillset to chunk documents and generating embeddings",  
    skills=[split_skill, embedding_skill],  
    index_projections=index_projections,  
)  
  
client = SearchIndexerClient(endpoint, credential)  
client.create_or_update_skillset(skillset)  
print(f"{skillset.name} created")  

from azure.search.documents.indexes.models import (
    SearchIndexer,
    )

# Create an indexer  
indexer_name = f"{index_name}-indexer"  
  
indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to index documents and generate embeddings",  
    skillset_name=skillset_name,  
    target_index_name=index_name,  
    data_source_name=data_source.name,  
    # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results  
    # Add Additonal Mapping if required
    #field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="file_name")]  
)  
  
try:    
    indexer_client.delete_indexer(indexer)
    print ('Indexer Deleted')
except Exception as ex:
    print (f"OK: Looks like indexer does not exist")

indexer_result = indexer_client.create_or_update_indexer(indexer)  
  
# Run the indexer  
indexer_client.run_indexer(indexer_name)  
print(f' {indexer_name} created')