{"id":"https://openalex.org/W4281481998","doi":"https://doi.org/10.48550/arxiv.2205.05055","title":"Data Distributional Properties Drive Emergent In-Context Learning in Transformers","display_name":"Data Distributional Properties Drive Emergent In-Context Learning in Transformers","publication_year":2022,"publication_date":"2022-04-22","ids":{"openalex":"https://openalex.org/W4281481998","doi":"https://doi.org/10.48550/arxiv.2205.05055"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2205.05055","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.05055","pdf_url":"https://arxiv.org/pdf/2205.05055","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2205.05055","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101543629","display_name":"Stephanie C. Y. Chan","orcid":"https://orcid.org/0000-0003-2590-7832"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chan, Stephanie C. Y.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112705664","display_name":"Adam Santoro","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Santoro, Adam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030015839","display_name":"Andrew K. Lampinen","orcid":"https://orcid.org/0000-0002-6988-8437"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lampinen, Andrew K.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054833182","display_name":"Jane X. Wang","orcid":"https://orcid.org/0000-0003-1339-5040"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jane X.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102781536","display_name":"Aaditya Singh","orcid":"https://orcid.org/0009-0004-5461-9594"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Aaditya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091349569","display_name":"Pierre H. Richemond","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Richemond, Pierre H.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103869934","display_name":"Jay McClelland","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McClelland, Jay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5010396788","display_name":"Felix Hill","orcid":"https://orcid.org/0000-0002-6712-1718"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hill, Felix","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5101543629"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":52,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9868000149726868,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9782999753952026,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6099573373794556},{"id":"https://openalex.org/keywords/burstiness","display_name":"Burstiness","score":0.517841637134552},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5159933567047119},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.509067952632904},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.45275259017944336},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09704437851905823}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6099573373794556},{"id":"https://openalex.org/C2781023610","wikidata":"https://www.wikidata.org/wiki/Q17006304","display_name":"Burstiness","level":3,"score":0.517841637134552},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5159933567047119},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.509067952632904},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.45275259017944336},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09704437851905823},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2205.05055","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.05055","pdf_url":"https://arxiv.org/pdf/2205.05055","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2205.05055","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2205.05055","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2205.05055","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2205.05055","pdf_url":"https://arxiv.org/pdf/2205.05055","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7599999904632568}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2961085424","https://openalex.org/W4306674287","https://openalex.org/W3046775127","https://openalex.org/W3107602296","https://openalex.org/W3170094116","https://openalex.org/W4386462264","https://openalex.org/W4364306694","https://openalex.org/W4312192474","https://openalex.org/W4283697347","https://openalex.org/W4210805261"],"abstract_inverted_index":{"Large":[0],"transformer-based":[1],"models":[2,151],"are":[3,92,99,105],"able":[4,206],"to":[5,28,108,154,207,236],"perform":[6],"in-context":[7,141,209,241,255],"few-shot":[8],"learning,":[9,149],"without":[10],"being":[11,68],"explicitly":[12],"trained":[13,178],"for":[14,130],"it.":[15],"This":[16],"observation":[17],"raises":[18],"the":[19,24,41,44,52,123,164,224,233,238],"question:":[20],"what":[21],"aspects":[22],"of":[23,43,77,115,167,190,232,244],"training":[25,45,53,126,234],"regime":[26],"lead":[27],"this":[29,36],"emergent":[30,240],"behavior?":[31],"Here,":[32],"we":[33,138,198],"show":[34],"that":[35,140,163,200],"behavior":[37],"is":[38],"driven":[39],"by":[40,101],"distributions":[42,127,203],"data":[46,54,110,180,202,235],"itself.":[47],"In-context":[48,81],"learning":[49,82,142,168,210,242,258],"emerges":[50,84],"when":[51,87,175],"exhibits":[55],"particular":[56,230],"distributional":[57],"properties":[58,98,231],"such":[59],"as":[60],"burstiness":[61],"(items":[62],"appear":[63],"in":[64,111,171,211,215,259],"clusters":[65],"rather":[66,94],"than":[67,95],"uniformly":[69],"distributed":[70],"over":[71],"time)":[72],"and":[73,150,213,248,256],"having":[74],"large":[75,245],"numbers":[76],"rarely":[78],"occurring":[79],"classes.":[80],"also":[83,106,119],"more":[85,146],"strongly":[86],"item":[88],"meanings":[89],"or":[90],"interpretations":[91],"dynamic":[93],"fixed.":[96],"These":[97],"exemplified":[100],"natural":[102],"language,":[103],"but":[104],"inherent":[107],"naturalistic":[109,191,201],"a":[112,172,182],"wide":[113],"range":[114],"other":[116],"domains.":[117],"They":[118],"depart":[120],"significantly":[121],"from":[122],"uniform,":[124],"i.i.d.":[125],"typically":[128],"used":[129],"standard":[131],"supervised":[132],"learning.":[133],"In":[134,195,218],"our":[135,159,220],"initial":[136],"experiments,":[137,197],"found":[139,199],"traded":[143],"off":[144],"against":[145],"conventional":[147],"weight-based":[148],"were":[152,204],"unable":[153],"achieve":[155],"both":[156,254],"simultaneously.":[157],"However,":[158],"later":[160],"experiments":[161],"uncovered":[162],"two":[165],"modes":[166],"could":[169],"co-exist":[170],"single":[173],"model":[174],"it":[176],"was":[177],"on":[179],"following":[181],"skewed":[183],"Zipfian":[184],"distribution":[185],"--":[186],"another":[187],"common":[188],"property":[189],"data,":[192],"including":[193],"language.":[194,262],"further":[196],"only":[205],"elicit":[208],"transformers,":[212],"not":[214],"recurrent":[216],"models.":[217],"sum,":[219],"findings":[221],"indicate":[222],"how":[223,249],"transformer":[225],"architecture":[226],"works":[227],"together":[228],"with":[229],"drive":[237],"intriguing":[239],"behaviour":[243],"language":[246],"models,":[247],"future":[250],"work":[251],"might":[252],"encourage":[253],"in-weights":[257],"domains":[260],"beyond":[261]},"counts_by_year":[{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":22},{"year":2023,"cited_by_count":20},{"year":2022,"cited_by_count":4}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2022-05-26T00:00:00"}
