
    =%i4                    @   d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ  ed      ZejA                  di       Z!ejA                  dd      Z"e!jA                  d      Z# e$e!jA                  d            Z% e&e!jA                  d            Z' e&e!jA                  d            Z( e)e!jA                  d            Z* ed      \  Z+Z,Z- ej\                  d      Z/e#s e0d       ee#e%e'      Z1e,r
 ee+e,      n ee+      Z2	 	 d(	 	 	 	 	 d)dZ3d*dZ4d+dZ5d,dZ6e(ddf	 	 	 	 	 	 	 	 	 d-dZ7d.d Z8 ed!      de(e*df	 	 	 	 	 	 	 	 	 	 	 d/d"       Z9e(dfd0d#Z:e;d$k(  rd%Z<g d&Z= e:e<e(e='       yy)1    )annotationsN)Path)ListDictAnyOptionalUnionIterable)QdrantClient)FilterFieldCondition
MatchValueMatchAny)tool)CrossEncoder)
qdrant_cfg)load_local_stlist_doc_idsload_rag_profilezrag_config.yamlretrieve
collectionmas_manualsembed_model_dirembed_devicemax_seq_lentop_k	min_scoreF)map_distancez(?:\d[\d\W]{0,8}){12,}zLprofiles.eqp_manuals.retrieve.embed_model_dir is required in rag_config.yaml)devicer   )urlapi_key)r    c           	        g }| rt        | t              r'|j                  t        dt	        |                    nit        |       }t        |      dk(  r*|j                  t        dt	        |d                      n&|j                  t        dt        |                   |j                  t        dt        dd	g                   t        |
      S )az  
    Build a Qdrant filter that:
      - matches doc_id (one or many)
      - restricts type to {'page','table'}
      - optionally restricts to specific pages (one or many)

    Args:
        doc_ids: str or iterable of str (e.g., "ABB-ops" or ["ABB-ops","Yokogawa-maint"])
        pages: iterable of ints (e.g., [20, 21])

    Returns:
        qdrant_client.models.Filter
    doc_id)value)keymatch   r   )anytypepagetable)must)	
isinstancestrappendr   r   listlenr   r   )doc_idspagesr,   doc_lists       -C:\Projects\mas-dev\aom\tools\rag\retrieve.py_page_filterr6   L   s    " D gs#KK8:G;TUVG}H8}!NxzPXYZP[?\]^NxxH?UVW 	KK6vw>O1PQR t    c           
     j    t         j                  t        | |ddt        |            }|j                  S )z
    Query Qdrant. Collection should use COSINE distance (as in ingest).
    Normalize embeddings at encoded time; Qdrant returns a relevance score.
    TF)collection_namequerylimitwith_payloadwith_vectorsquery_filter)QDRquery_points_COLLECTIONr6   points)qvecr;   r#   resps       r5   _search_pointsrE   x   s<    
 #!&)  D ;;r7   c                    g }| D ]M  }|j                   xs i j                  dd      xs d}t        j                  |      r=|j	                  |       O |S )Ntext )payloadget_NUMERIC_HEAVYsearchr/   )rB   cleanedptxts       r5   _hygienerP      sT    GyyB##FB/52  %q	 
 Nr7   c                ^   t        |t              r|g}n|}|j                  d       |D cg c]#  }|s|j                         j	                         % }}g }| D ]8  }|j                         t        fd|D              s(|j                  |       : t        t        |            S c c}w )zMReturn doc_ids that contain any of the maker substring(s) (case-insensitive).z
Pump-Guidec              3  &   K   | ]  }|v  
 y w)N ).0r%   	doc_lowers     r5   	<genexpr>z!filter_doc_ids.<locals>.<genexpr>   s     04Csi4s   )r-   r.   r/   casefoldstripr(   sortedset)r2   makermakersmkeysfilteredr#   rU   s          @r5   filter_doc_idsr`      s     % MM, +16&QAAJJL &D6 HOO%	0400OOF# 
 #h-   7s
   B* B*c                   t        t        t              }||t        ||      }|}n|}t        j                  | d      }t        |t        j                        r|j                         }t        |||      }t        |      }g }|D ]  }	|	j                  xs i }
|j                  t        |	j                        |
j!                  d      |
j!                  d      |
j!                  d      xs d|
j!                  d	      xs d|
j!                  d
      |
j!                  d      |
j!                  d      t#        |	j$                        d	        |S )a  
    Encode the query with a local model (normalized), search Qdrant, and return:
      {
        "score": float, # relevance (COSINE-based)
        "doc_id": str or list of str,
        "page": int,
        "title": str,
        "text": str,
        "page_image": str|None, # direct link to the PAGE IMAGE
        "chunk_id": str|None, # original chunk id
        "type": "page"|"table",
        "point_id": str,
        "maker": List str|None,
      }
    )clientr   T)normalize_embeddings)r;   r#   r#   r*   titlerH   rG   
page_imagechunk_idr)   )	scorer#   r*   rd   rG   re   rf   r)   point_id)r   r?   rA   r`   STencode_queryr-   npndarraytolistrE   rP   rI   r/   floatrg   rJ   r.   id)r:   r   r#   r[   
all_doc_iddoc_id_listqvrB   outrN   pls              r5   r   r      s   * S[AJ~(U;K FF	T	:B"bjj!YY[BeF;FfF "CYY_"

177^ffX&FF6NVVG_*FF6N(b&&.z*FF6NADD	

 
	  Jr7   c                n   | syg }| D ]  }d|j                  d      xs dj                          d|j                  dd       d|j                  d       dt        |j                  d	d
            d}|j                  d      xs dj                         }|j                  d      xs dj                         }|j                  d      xs d}d|j                  dd       d|j                  d       d| d| }|j	                  dj                  |d| ||g              dj                  |      S )z
    Build a compact plain-text context for the orchestrator summarizer.
    Each item contains a 'CITE:' line with doc_id, page, title, and page image link.
    rH   [r)   r*   z	] doc_id=r#   u	    · page=u
    · score=rg           .3frd   rG   re   zCITE: doc_id=z; page=z; title=z; image=
zTitle: z

---

)rJ   upperrn   rX   r/   join)hitspartshheaderrd   rG   imgcites           r5   _format_contextr     sN   
 Ev(&//12 3eeHR()155=/ B155-.s35 	
 w%2,,.v%2,,.l#)reeHR() *EE&M? #G E	 	 	TYY'%(94FGH! " e$$r7   retrieve_contextc           	         t        | |||      }|D cg c],  }t        |j                  dd            t        |      k\  s+|. }}t        |      S c c}w )aC  
    Retrieve relevant context (page and summarized tables) for a prompt using cosine similarity.
    Returns a plain-text block ready for the orchestrator summarizer.
    Each chunk includes a mandatory 'CITE:' line with doc_id, page, title, and page image link.
    Only results with score >= min_score are included.
    )r   r#   r[   rg   rw   )r   rn   rJ   r   )promptr#   r   r   r[   r|   r~   s          r5   r   r   8  sU     F%eDDNt!uQUU7C%89U9=MMAtDN4   Os   ,AAc                J   t        | ||      }|st        d       y t        |d      D ]x  \  }}t        |dd|d   dd|d	    d
|d           |j                  d      rt        d|d          |j                  d      xs dj	                  dd      }t        d|d       z y )Nr   r#   zNo results.r'   02dz. score=rg   rx   z  [Page r*   z]  r#   re   z    page_image:rG   rH   ry    z    )r   print	enumeraterJ   replace)r:   r   r#   r|   ir~   snippets          r5   _print_resultsr   `  s    Ev6Dm$"13x'
3/x&	{#ak]ST55#Q|_555=&B//c:fgt$ #r7   __main__z,Show me the solutions for misalignment issue)zPump-Guide-Web-1zPump-Guide-1zPump-Guide-2zEBARA-operation-maintenancezEBARA-operation-maintenance-2r   )NN)r2   #Optional[Union[str, Iterable[str]]]r3   zOptional[Iterable[int]]returnr   )rC   zList[float]r;   intr#   r   r   	List[Any])rB   r   r   r   )r2   	List[str]r[   zUnion[str, List[str]]r   r   )
r:   r.   r   r   r#   r   r[   r   r   List[Dict[str, Any]])r|   r   r   r.   )r   r.   r#   r   r   r   r   rn   r[   r   r   r.   )r:   r.   r   r   r#   zOptional[str]r   None)>
__future__r   repathlibr   typingr   r   r   r   r	   r
   numpyrk   qdrant_clientr   qdrant_client.modelsr   r   r   r   langchain_core.toolsr   sentence_transformersr   aom.utils.utilitiesr   aom.tools.rag.rag_helperr   r   r   _PROFrJ   	_RETRIEVErA   _EMBED_MODEL_DIRr.   _EMBED_DEVICEr   _MAX_SEQ_LEN_TOP_K_DEFAULTrn   _MIN_SCORE_DEFAULT_QDRANT_URL_QDRANT_API_KEY_QDRANT_DISTANCEcompilerK   RuntimeErrorri   r?   r6   rE   rP   r`   r   r   r   r   __name__EXAMPLE_QUERYDOC_ID_FILTERrS   r7   r5   <module>r      s)   # 	  = =  & M M % . * R R 	*+IIj"%	iim4==!23 IMM.129==/0Y]]7+,9==56 1;1O ._.B 56
 
e
ff#M|T@Ol{O<UafqUr
 48%))0)") )X !6  26	333 03 	3
 3`%:  37)!!/! ! 	!
 ! 	! !N -;TX 
% zBMUM=}M	 r7   